From 5d082c8ba32eb9686755280e159ffac3771c5a92 Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Thu, 21 May 2026 13:16:59 -0600 Subject: [PATCH 1/5] minor - fix edge cases in assembly fallback to prevent segmentation fault on initialized vector --- CHANGELOG.md | 1 + backends/cuda-gen/ceed-cuda-gen-operator.c | 2 +- backends/hip-gen/ceed-hip-gen-operator.c | 2 +- include/ceed/backend.h | 4 +-- interface/ceed-preconditioning.c | 35 +++++++++++++--------- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8e70bece4..a0aaa760ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ On this page we provide a summary of the main API changes, new features and exam - Require use of `Ceed*Destroy()` on Ceed objects returned from `Ceed*Get*()`. - Rename `CeedCompositeOperatorCreate()` to `CeedOperatorCreateComposite()` for uniformity. - Rename `CeedCompositeOperator*()` to `CeedOperatorComposite*()` for uniformity. +- Add `build_objects` parameter to `CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback` to allow for passing uninitialized vectors and restrictions ### New features diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c index 97fcf6b4b0..583dd07949 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator.c +++ b/backends/cuda-gen/ceed-cuda-gen-operator.c @@ -517,7 +517,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, boo CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for LinearAssemblyQFunction\n"); CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); - CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request)); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, build_objects, assembled, rstr, request)); return CEED_ERROR_SUCCESS; } return CEED_ERROR_SUCCESS; diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c index 7532ba55b6..014ebede3d 100644 --- a/backends/hip-gen/ceed-hip-gen-operator.c +++ b/backends/hip-gen/ceed-hip-gen-operator.c @@ -496,7 +496,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LinearAssembleQFunction\n"); CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); - CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request)); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, build_objects, assembled, rstr, request)); return CEED_ERROR_SUCCESS; } return CEED_ERROR_SUCCESS; diff --git a/include/ceed/backend.h b/include/ceed/backend.h index 4a6e857215..c37b6794c1 100644 --- a/include/ceed/backend.h +++ b/include/ceed/backend.h @@ -473,8 +473,8 @@ CEED_EXTERN int CeedOperatorReference(CeedOperator op); CEED_EXTERN int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback); CEED_EXTERN int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent); CEED_EXTERN int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent); -CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, - CeedRequest *request); +CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, bool build_objects, CeedVector *assembled, + CeedElemRestriction *rstr, CeedRequest *request); CEED_INTERN int CeedOperatorAssembleSingle(CeedOperator op, CeedSize offset, CeedVector values); CEED_EXTERN int CeedOperatorSetSetupDone(CeedOperator op); diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c index 12222db87d..ec1146167e 100644 --- a/interface/ceed-preconditioning.c +++ b/interface/ceed-preconditioning.c @@ -712,18 +712,19 @@ static int CeedOperatorAssembleSymbolicSingle(CeedOperator op, CeedSize offset, Note: If the value of `assembled` or `rstr` passed to this function are non-`NULL` , then it is assumed that they hold valid pointers. These objects will be destroyed if `*assembled` or `*rstr` is the only reference to the object. - @param[in] op `CeedOperator` to assemble `CeedQFunction` - @param[in] use_parent Boolean flag to check for fallback parent implementation - @param[out] assembled `CeedVector` to store assembled `CeedQFunction` at quadrature points - @param[out] rstr `CeedElemRestriction` for `CeedVector` containing assembled `CeedQFunction` - @param[in] request Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE + @param[in] op `CeedOperator` to assemble `CeedQFunction` + @param[in] build_objects Boolean flag indicating whether the `assembled` vector has been allocated + @param[in] use_parent Boolean flag to check for fallback parent implementation + @param[out] assembled `CeedVector` to store assembled `CeedQFunction` at quadrature points + @param[out] rstr `CeedElemRestriction` for `CeedVector` containing assembled `CeedQFunction` + @param[in] request Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE @return An error code: 0 - success, otherwise - failure @ref User **/ -static int CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(CeedOperator op, bool use_parent, CeedVector *assembled, CeedElemRestriction *rstr, - CeedRequest *request) { +static int CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(CeedOperator op, bool build_objects, bool use_parent, CeedVector *assembled, + CeedElemRestriction *rstr, CeedRequest *request) { int (*LinearAssembleQFunctionUpdate)(CeedOperator, CeedVector, CeedElemRestriction, CeedRequest *) = NULL; CeedOperator op_assemble = NULL; CeedOperator op_fallback_parent = NULL; @@ -766,6 +767,10 @@ static int CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(CeedOperator op CeedCall(CeedQFunctionAssemblyDataSetUpdateNeeded(data, false)); // Copy reference from internally held copy + if (build_objects) { + *assembled = NULL; + *rstr = NULL; + } CeedCall(CeedVectorReferenceCopy(assembled_vec, assembled)); CeedCall(CeedElemRestrictionReferenceCopy(assembled_rstr, rstr)); CeedCall(CeedVectorDestroy(&assembled_vec)); @@ -792,18 +797,19 @@ static int CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(CeedOperator op Note: If the value of `assembled` or `rstr` passed to this function are non-`NULL` , then it is assumed that they hold valid pointers. These objects will be destroyed if `*assembled` or `*rstr` is the only reference to the object. - @param[in] op `CeedOperator` to assemble `CeedQFunction` - @param[out] assembled `CeedVector` to store assembled `CeedQFunction` at quadrature points - @param[out] rstr `CeedElemRestriction` for `CeedVector` containing assembled `CeedQFunction` - @param[in] request Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE + @param[in] op `CeedOperator` to assemble `CeedQFunction` + @param[in] build_objects Boolean flag indicating whether the `assembled` vector has been allocated + @param[out] assembled `CeedVector` to store assembled `CeedQFunction` at quadrature points + @param[out] rstr `CeedElemRestriction` for `CeedVector` containing assembled `CeedQFunction` + @param[in] request Address of @ref CeedRequest for non-blocking completion, else @ref CEED_REQUEST_IMMEDIATE @return An error code: 0 - success, otherwise - failure @ref Developer **/ -int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, +int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, false, assembled, rstr, request); + return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, build_objects, false, assembled, rstr, request); } /** @@ -2325,7 +2331,8 @@ int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, @ref User **/ int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, true, assembled, rstr, request); + assert(assembled); + return CeedOperatorLinearAssembleQFunctionBuildOrUpdate_Core(op, *assembled == NULL, true, assembled, rstr, request); } /** From 42ebf753060be6d207b94e0bba53aa843219d04f Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Thu, 21 May 2026 13:34:59 -0600 Subject: [PATCH 2/5] make - add -lhiprtc for HIP 7.X.X --- Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 9e687780e5..f87ef24a6a 100644 --- a/Makefile +++ b/Makefile @@ -570,12 +570,13 @@ HIP_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(ROCM_DIR)/$d/lib${HIP_LIB_NAME HIP_LIB_DIR := $(patsubst %/,%,$(dir $(firstword $(HIP_LIB_DIR)))) HIP_BACKENDS = /gpu/hip/ref /gpu/hip/shared /gpu/hip/gen ifneq ($(HIP_LIB_DIR),) + HIP_CONFIG := $(ROCM_DIR)/bin/hipconfig ifeq ($(HIP_LIB_NAME),CHIP) # chipStar hipconfig -C emits clang-only flags; keep only -D/-I/-include for gcc - HIPCONFIG_CPPFLAGS := $(shell $(ROCM_DIR)/bin/hipconfig -C) + HIPCONFIG_CPPFLAGS := $(shell $(HIP_CONFIG) -C) HIPCONFIG_CPPFLAGS_C := $(filter-out --offload% -nohipwrapperinc --hip-path% --target%,$(HIPCONFIG_CPPFLAGS)) -I$(ROCM_DIR)/include else - HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(ROCM_DIR)/bin/hipconfig -C)) + HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(HIP_CONFIG) -C)) HIPCONFIG_CPPFLAGS_C := $(HIPCONFIG_CPPFLAGS) endif $(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C) @@ -583,6 +584,12 @@ ifneq ($(HIP_LIB_DIR),) $(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C) endif PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -l${HIP_LIB_NAME} -lhipblas + HIP_MAJOR_VERSION := $(shell $(HIP_CONFIG) --version | cut -d'.' -f1) + ifeq ($(HIP_MAJOR_VERSION),7) + PKG_LIBS += -lhiprtc + $(info $(PKG_LIBS)) + endif + $(info $(HIP_MAJOR_VERSION)) LIBCEED_CONTAINS_CXX = 1 libceed.c += $(hip-all.c) libceed.cpp += $(hip-all.cpp) From 30b08b7d86b8e4436c93b4b6815466e2960bda3c Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Thu, 21 May 2026 13:37:41 -0600 Subject: [PATCH 3/5] hip(compile) - update include statements for HIP 7 - stddef.h is no longer supported - int types have been moved to the __hip_internal namespace --- backends/hip/ceed-hip-compile.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp index af892eedaf..7e4640c9cf 100644 --- a/backends/hip/ceed-hip-compile.cpp +++ b/backends/hip/ceed-hip-compile.cpp @@ -120,12 +120,16 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const char *name, CeedCallHip(ceed, hipRuntimeGetVersion(&runtime_version)); if (runtime_version < 40400000) { code << "#include \n\n"; - } - // With ROCm 4.5, need to include these definitions specifically for hiprtc (but cannot include the runtime header) - else { + } else if (runtime_version < 70000000) { + // With ROCm 4.5, need to include these definitions specifically for hiprtc (but cannot include the runtime header) code << "#include \n"; code << "#define __forceinline__ inline __attribute__((always_inline))\n"; code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n"; + } else { + code << "using __hip_internal::int32_t;\n"; + code << "using __hip_internal::int64_t;\n"; + code << "#define __forceinline__ inline __attribute__((always_inline))\n"; + code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n"; } } From 629a26b9e7e7838b368c6732a3c6d38047f2fc92 Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Thu, 21 May 2026 13:38:31 -0600 Subject: [PATCH 4/5] hip(kernels) - use nullptr instead of NULL for HIP/c++ source --- .../hip-gen/ceed-hip-gen-operator-build.cpp | 10 +++++----- .../hip/hip-ref-operator-assemble-diagonal.h | 4 ++-- .../jit-source/hip/hip-shared-basis-tensor.h | 18 +++++++++--------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index d4d47a39a1..2cc7243cd3 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -302,7 +302,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { - code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; + code << tab << "CeedScalar *s_B" << var_suffix << " = nullptr;\n"; } else { code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; @@ -341,7 +341,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { - code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; + code << tab << "CeedScalar *s_B" << var_suffix << " = nullptr;\n"; } else { code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; @@ -357,7 +357,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; } else if (is_active && skip_active_load) { - code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; + code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n"; } else { code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; @@ -373,7 +373,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; } else if (is_active && skip_active_load) { - code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; + code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n"; } else { code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; @@ -384,7 +384,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; } else if (is_active && skip_active_load) { - code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; + code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n"; } else { code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << "];\n"; diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h index 581545f71a..cfc28e27e7 100644 --- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h +++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h @@ -63,7 +63,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) { IndexType d_in = 0; CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE; - const CeedScalar *b_t = NULL; + const CeedScalar *b_t = nullptr; GetBasisPointer(&b_t, eval_modes_out[e_out], identity, interp_out, grad_out, div_out, curl_out); if (e_out == 0 || eval_modes_out[e_out] != eval_modes_out_prev) d_out = 0; @@ -71,7 +71,7 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ eval_modes_out_prev = eval_modes_out[e_out]; for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) { - const CeedScalar *b = NULL; + const CeedScalar *b = nullptr; GetBasisPointer(&b, eval_modes_in[e_in], identity, interp_in, grad_in, div_in, curl_in); if (e_in == 0 || eval_modes_in[e_in] != eval_modes_in_prev) d_in = 0; diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h index 51d4c9d443..15dd302572 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor.h @@ -407,13 +407,13 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ if (BASIS_DIM == 1) { ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); - Grad1d(data, r_U, NULL, s_G, r_V); + Grad1d(data, r_U, nullptr, s_G, r_V); if (e < num_elem) { WriteElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V); } } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); - GradTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); + GradTensorCollocatedNodes2d(data, r_U, nullptr, s_G, r_V); if (e < num_elem) { WriteElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); @@ -421,7 +421,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ } else if (BASIS_DIM == 3) { ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); - GradTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); + GradTensorCollocatedNodes3d(data, r_U, nullptr, s_G, r_V); if (e < num_elem) { WriteElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V); @@ -522,21 +522,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ if (BASIS_DIM == 1) { ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); - GradTranspose1d(data, r_U, NULL, s_G, r_V); + GradTranspose1d(data, r_U, nullptr, s_G, r_V); if (e < num_elem) { WriteElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); } } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - GradTransposeTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); + GradTransposeTensorCollocatedNodes2d(data, r_U, nullptr, s_G, r_V); if (e < num_elem) { WriteElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); } } else if (BASIS_DIM == 3) { ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - GradTransposeTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); + GradTransposeTensorCollocatedNodes3d(data, r_U, nullptr, s_G, r_V); if (e < num_elem) { WriteElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); @@ -637,21 +637,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ if (BASIS_DIM == 1) { ReadElementStrided1d(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U); - GradTranspose1d(data, r_U, NULL, s_G, r_V); + GradTranspose1d(data, r_U, nullptr, s_G, r_V); if (e < num_elem) { SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); } } else if (BASIS_DIM == 2) { ReadElementStrided2d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - GradTransposeTensorCollocatedNodes2d(data, r_U, NULL, s_G, r_V); + GradTransposeTensorCollocatedNodes2d(data, r_U, nullptr, s_G, r_V); if (e < num_elem) { SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); } } else if (BASIS_DIM == 3) { ReadElementStrided3d(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U); - GradTransposeTensorCollocatedNodes3d(data, r_U, NULL, s_G, r_V); + GradTransposeTensorCollocatedNodes3d(data, r_U, nullptr, s_G, r_V); if (e < num_elem) { SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); From 4dd9e2c8a4644b93d0092bc91ac3af8642828dfa Mon Sep 17 00:00:00 2001 From: Zach Atkins Date: Thu, 21 May 2026 16:19:42 -0400 Subject: [PATCH 5/5] Update comment for HIP defines Co-authored-by: Jeremy L Thompson --- backends/hip/ceed-hip-compile.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp index 7e4640c9cf..2158c0cc11 100644 --- a/backends/hip/ceed-hip-compile.cpp +++ b/backends/hip/ceed-hip-compile.cpp @@ -121,13 +121,15 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const char *name, if (runtime_version < 40400000) { code << "#include \n\n"; } else if (runtime_version < 70000000) { - // With ROCm 4.5, need to include these definitions specifically for hiprtc (but cannot include the runtime header) code << "#include \n"; + // With ROCm 4.5+, need to include these definitions specifically for hiprtc (but cannot include the runtime header) code << "#define __forceinline__ inline __attribute__((always_inline))\n"; code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n"; } else { + // ROCm 7 removed stddef header, so we use the internal HIP types code << "using __hip_internal::int32_t;\n"; code << "using __hip_internal::int64_t;\n"; + // With ROCm 4.5+, need to include these definitions specifically for hiprtc (but cannot include the runtime header) code << "#define __forceinline__ inline __attribute__((always_inline))\n"; code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n"; }