Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ On this page we provide a summary of the main API changes, new features and exam
- Require use of `Ceed*Destroy()` on Ceed objects returned from `Ceed*Get*()`.
- Rename `CeedCompositeOperatorCreate()` to `CeedOperatorCreateComposite()` for uniformity.
- Rename `CeedCompositeOperator*()` to `CeedOperatorComposite*()` for uniformity.
- Add `build_objects` parameter to `CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback` to allow for passing uninitialized vectors and restrictions

### New features

Expand Down
11 changes: 9 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -570,19 +570,26 @@ HIP_LIB_DIR := $(wildcard $(foreach d,lib lib64,$(ROCM_DIR)/$d/lib${HIP_LIB_NAME
HIP_LIB_DIR := $(patsubst %/,%,$(dir $(firstword $(HIP_LIB_DIR))))
HIP_BACKENDS = /gpu/hip/ref /gpu/hip/shared /gpu/hip/gen
ifneq ($(HIP_LIB_DIR),)
HIP_CONFIG := $(ROCM_DIR)/bin/hipconfig
ifeq ($(HIP_LIB_NAME),CHIP)
# chipStar hipconfig -C emits clang-only flags; keep only -D/-I/-include for gcc
HIPCONFIG_CPPFLAGS := $(shell $(ROCM_DIR)/bin/hipconfig -C)
HIPCONFIG_CPPFLAGS := $(shell $(HIP_CONFIG) -C)
HIPCONFIG_CPPFLAGS_C := $(filter-out --offload% -nohipwrapperinc --hip-path% --target%,$(HIPCONFIG_CPPFLAGS)) -I$(ROCM_DIR)/include
else
HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(ROCM_DIR)/bin/hipconfig -C))
HIPCONFIG_CPPFLAGS := $(subst =,,$(shell $(HIP_CONFIG) -C))
HIPCONFIG_CPPFLAGS_C := $(HIPCONFIG_CPPFLAGS)
endif
$(hip-all.c:%.c=$(OBJDIR)/%.o) $(hip-all.c:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C)
ifneq ($(CXX), $(HIPCC))
$(hip-all.cpp:%.cpp=$(OBJDIR)/%.o) $(hip-all.cpp:%=%.tidy): CPPFLAGS += $(HIPCONFIG_CPPFLAGS_C)
endif
PKG_LIBS += -L$(abspath $(HIP_LIB_DIR)) -l${HIP_LIB_NAME} -lhipblas
HIP_MAJOR_VERSION := $(shell $(HIP_CONFIG) --version | cut -d'.' -f1)
ifeq ($(HIP_MAJOR_VERSION),7)
PKG_LIBS += -lhiprtc
$(info $(PKG_LIBS))
endif
$(info $(HIP_MAJOR_VERSION))
LIBCEED_CONTAINS_CXX = 1
libceed.c += $(hip-all.c)
libceed.cpp += $(hip-all.cpp)
Expand Down
2 changes: 1 addition & 1 deletion backends/cuda-gen/ceed-cuda-gen-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Cuda_gen(CeedOperator op, boo

CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/cuda/ref CeedOperator for LinearAssemblyQFunction\n");
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, build_objects, assembled, rstr, request));
return CEED_ERROR_SUCCESS;
}
return CEED_ERROR_SUCCESS;
Expand Down
10 changes: 5 additions & 5 deletions backends/hip-gen/ceed-hip-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce

CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
code << tab << "CeedScalar *s_B" << var_suffix << " = nullptr;\n";
Comment thread
zatkins-dev marked this conversation as resolved.
} else {
code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
Expand Down Expand Up @@ -341,7 +341,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce

CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
code << tab << "CeedScalar *s_B" << var_suffix << " = nullptr;\n";
} else {
code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
Expand All @@ -357,7 +357,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce

code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
} else if (is_active && skip_active_load) {
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n";
} else {
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
Expand All @@ -373,7 +373,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce

code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
} else if (is_active && skip_active_load) {
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n";
} else {
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
Expand All @@ -384,7 +384,7 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce

code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
} else if (is_active && skip_active_load) {
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
code << tab << "CeedScalar *s_G" << var_suffix << " = nullptr;\n";
} else {
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
<< (is_tensor ? "" : var_suffix) << "];\n";
Expand Down
2 changes: 1 addition & 1 deletion backends/hip-gen/ceed-hip-gen-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ static int CeedOperatorLinearAssembleQFunctionCore_Hip_gen(CeedOperator op, bool

CeedDebug(CeedOperatorReturnCeed(op), "\nFalling back to /gpu/hip/ref CeedOperator for LinearAssembleQFunction\n");
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, assembled, rstr, request));
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(op_fallback, build_objects, assembled, rstr, request));
return CEED_ERROR_SUCCESS;
}
return CEED_ERROR_SUCCESS;
Expand Down
10 changes: 7 additions & 3 deletions backends/hip/ceed-hip-compile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,16 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const char *name,
CeedCallHip(ceed, hipRuntimeGetVersion(&runtime_version));
if (runtime_version < 40400000) {
code << "#include <hip/hip_runtime.h>\n\n";
}
// With ROCm 4.5, need to include these definitions specifically for hiprtc (but cannot include the runtime header)
else {
} else if (runtime_version < 70000000) {
// With ROCm 4.5, need to include these definitions specifically for hiprtc (but cannot include the runtime header)
Comment thread
zatkins-dev marked this conversation as resolved.
Outdated
code << "#include <stddef.h>\n";
code << "#define __forceinline__ inline __attribute__((always_inline))\n";
code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n";
} else {
code << "using __hip_internal::int32_t;\n";
Comment thread
jeremylt marked this conversation as resolved.
code << "using __hip_internal::int64_t;\n";
code << "#define __forceinline__ inline __attribute__((always_inline))\n";
code << "#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];\n\n";
}
}

Expand Down
4 changes: 2 additions & 2 deletions include/ceed/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -473,8 +473,8 @@ CEED_EXTERN int CeedOperatorReference(CeedOperator op);
CEED_EXTERN int CeedOperatorGetFallback(CeedOperator op, CeedOperator *op_fallback);
CEED_EXTERN int CeedOperatorGetFallbackParent(CeedOperator op, CeedOperator *parent);
CEED_EXTERN int CeedOperatorGetFallbackParentCeed(CeedOperator op, Ceed *parent);
CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr,
CeedRequest *request);
CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdateFallback(CeedOperator op, bool build_objects, CeedVector *assembled,
CeedElemRestriction *rstr, CeedRequest *request);
CEED_INTERN int CeedOperatorAssembleSingle(CeedOperator op, CeedSize offset, CeedVector values);
CEED_EXTERN int CeedOperatorSetSetupDone(CeedOperator op);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,15 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__
for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) {
IndexType d_in = 0;
CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE;
const CeedScalar *b_t = NULL;
const CeedScalar *b_t = nullptr;

GetBasisPointer(&b_t, eval_modes_out[e_out], identity, interp_out, grad_out, div_out, curl_out);
if (e_out == 0 || eval_modes_out[e_out] != eval_modes_out_prev) d_out = 0;
else b_t = &b_t[(++d_out) * NUM_QPTS * NUM_NODES];
eval_modes_out_prev = eval_modes_out[e_out];

for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) {
const CeedScalar *b = NULL;
const CeedScalar *b = nullptr;

GetBasisPointer(&b, eval_modes_in[e_in], identity, interp_in, grad_in, div_in, curl_in);
if (e_in == 0 || eval_modes_in[e_in] != eval_modes_in_prev) d_in = 0;
Expand Down
18 changes: 9 additions & 9 deletions include/ceed/jit-source/hip/hip-shared-basis-tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,21 +407,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__

if (BASIS_DIM == 1) {
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
}
} else if (BASIS_DIM == 2) {
ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
d_V);
}
} else if (BASIS_DIM == 3) {
ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
Expand Down Expand Up @@ -522,21 +522,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__

if (BASIS_DIM == 1) {
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
}
} else if (BASIS_DIM == 2) {
ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
r_U);
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
}
} else if (BASIS_DIM == 3) {
ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
Expand Down Expand Up @@ -637,21 +637,21 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__

if (BASIS_DIM == 1) {
ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
}
} else if (BASIS_DIM == 2) {
ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
r_U);
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
}
} else if (BASIS_DIM == 3) {
ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V);
GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, nullptr, s_G, r_V);
if (e < num_elem) {
SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
Expand Down
Loading
Loading