Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ On this page we provide a summary of the main API changes, new features and exam
- Require use of `Ceed*Destroy()` on Ceed objects returned from `Ceed*Get*()`.
- Rename `CeedCompositeOperatorCreate()` to `CeedOperatorCreateComposite()` for uniformity.
- Rename `CeedCompositeOperator*()` to `CeedOperatorComposite*()` for uniformity.
- Add `build_objects` parameter to `CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback` to allow for passing uninitialized vectors and restrictions

### New features

Expand Down
11 changes: 9 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -570,19 +570,26 @@ HIP_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(ROCM_DIR)/$d/lib${HIP_LIB_NAME
HIP_LIB_DIR := $(patsubst %/,%,$(dir $(firstword $(HIP_LIB_DIR))))
HIP_BACKENDS = /gpu/hip/ref /gpu/hip/shared /gpu/hip/gen
ifneq ($(HIP_LIB_DIR),)
HIP_CONFIG := $(ROCM_DIR)/bin/hipconfig
ifeq ($(HIP_LIB_NAME),CHIP)
# chipStar hipconfig -C emits clang-only flags; keep only -D/-I/-include for gcc
HIPCONFIG_CPPFLAGS := $(shell $(ROCM_DIR)/bin/hipconfig -C)
HIPCONFIG_CPPFLAGS := $(shell $(HIP_CONFIG) -C)
HIPCONFIG_CPPFLAGS_C := $(filter-out --offload% -nohipwrapperinc --hip-path% --target%,$(HIPCONFIG_CPPFLAGS)) -I$(ROCM_DIR)/include
else
HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(ROCM_DIR)/bin/hipconfig -C))
HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(HIP_CONFIG) -C))
HIPCONFIG_CPPFLAGS_C := $(HIPCONFIG_CPPFLAGS)
endif
$(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C)
ifneq ($(CXX), $(HIPCC))
$(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C)
endif
PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -l${HIP_LIB_NAME} -lhipblas
HIP_MAJOR_VERSION := $(shell $(HIP_CONFIG) --version | cut -d'.' -f1)
ifeq ($(HIP_MAJOR_VERSION),7)
PKG_LIBS += -lhiprtc
$(info $(PKG_LIBS))
endif
$(info $(HIP_MAJOR_VERSION))
LIBCEED_CONTAINS_CXX = 1
libceed.c += $(hip-all.c)
libceed.cpp += $(hip-all.cpp)
Expand Down
2 changes: 1 addition & 1 deletion backends/cuda-gen/ceed-cuda-gen-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, boo

CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for LinearAssemblyQFunction\n");
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, build_objects, assembled, rstr, request));
return CEED_ERROR_SUCCESS;
}
return CEED_ERROR_SUCCESS;
Expand Down
10 changes: 5 additions & 5 deletions backends/hip-gen/ceed-hip-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce

CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
code << tab << "CeedScalar *s_B" << var_suffix << " = nullptr;\n";
Comment thread
zatkins-dev marked this conversation as resolved.
} else {
code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
Expand Down Expand Up @@ -341,7 +341,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce

CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
code << tab << "CeedScalar *s_B" << var_suffix << " = nullptr;\n";
} else {
code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
Expand All @@ -357,7 +357,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce

code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
} else if (is_active && skip_active_load) {
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n";
} else {
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
Expand All @@ -373,7 +373,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce

code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
} else if (is_active && skip_active_load) {
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n";
} else {
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
Expand All @@ -384,7 +384,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce

code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
} else if (is_active && skip_active_load) {
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n";
} else {
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
<< (is_tensor ? "" : var_suffix) << "];\n";
Expand Down
2 changes: 1 addition & 1 deletion backends/hip-gen/ceed-hip-gen-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool

CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LinearAssembleQFunction\n");
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, build_objects, assembled, rstr, request));
return CEED_ERROR_SUCCESS;
}
return CEED_ERROR_SUCCESS;
Expand Down
12 changes: 9 additions & 3 deletions backends/hip/ceed-hip-compile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,16 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const char *name,
CeedCallHip(ceed, hipRuntimeGetVersion(&runtime_version));
if (runtime_version < 40400000) {
code << "#include <hip/hip_runtime.h>\n\n";
}
// With ROCm 4.5, need to include these definitions specifically for hiprtc (but cannot include the runtime header)
else {
} else if (runtime_version < 70000000) {
code << "#include <stddef.h>\n";
// With ROCm 4.5+, need to include these definitions specifically for hiprtc (but cannot include the runtime header)
code << "#define __forceinline__ inline __attribute__((always_inline))\n";
code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n";
} else {
// ROCm 7 removed stddef header, so we use the internal HIP types
code << "using __hip_internal::int32_t;\n";
Comment thread
jeremylt marked this conversation as resolved.
code << "using __hip_internal::int64_t;\n";
// With ROCm 4.5+, need to include these definitions specifically for hiprtc (but cannot include the runtime header)
code << "#define __forceinline__ inline __attribute__((always_inline))\n";
code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n";
}
Expand Down
4 changes: 2 additions & 2 deletions include/ceed/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -473,8 +473,8 @@ CEED_EXTERN int CeedOperatorReference(CeedOperator op);
CEED_EXTERN int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback);
CEED_EXTERN int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent);
CEED_EXTERN int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent);
CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
CeedRequest *request);
CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, bool build_objects, CeedVector *assembled,
CeedElemRestriction *rstr, CeedRequest *request);
CEED_INTERN int CeedOperatorAssembleSingle(CeedOperator op, CeedSize offset, CeedVector values);
CEED_EXTERN int CeedOperatorSetSetupDone(CeedOperator op);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,15 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) {
IndexType d_in = 0;
CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE;
const CeedScalar *b_t = NULL;
const CeedScalar *b_t = nullptr;

GetBasisPointer(&b_t, eval_modes_out[e_out], identity, interp_out, grad_out, div_out, curl_out);
if (e_out == 0 || eval_modes_out[e_out] != eval_modes_out_prev) d_out = 0;
else b_t = &b_t[(++d_out) * NUM_QPTS * NUM_NODES];
eval_modes_out_prev = eval_modes_out[e_out];

for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) {
const CeedScalar *b = NULL;
const CeedScalar *b = nullptr;

GetBasisPointer(&b, eval_modes_in[e_in], identity, interp_in, grad_in, div_in, curl_in);
if (e_in == 0 || eval_modes_in[e_in] != eval_modes_in_prev) d_in = 0;
Expand Down
18 changes: 9 additions & 9 deletions include/ceed/jit-source/hip/hip-shared-basis-tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,21 +407,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__

if (BASIS_DIM == 1) {
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
}
} else if (BASIS_DIM == 2) {
ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
d_V);
}
} else if (BASIS_DIM == 3) {
ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
Expand Down Expand Up @@ -522,21 +522,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__

if (BASIS_DIM == 1) {
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
}
} else if (BASIS_DIM == 2) {
ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
r_U);
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
}
} else if (BASIS_DIM == 3) {
ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
Expand Down Expand Up @@ -637,21 +637,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__

if (BASIS_DIM == 1) {
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
}
} else if (BASIS_DIM == 2) {
ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
r_U);
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
}
} else if (BASIS_DIM == 3) {
ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
Expand Down
Loading
Loading