Skip to content
7 changes: 0 additions & 7 deletions paddle/phi/api/include/compat/ATen/core/TensorBody.h
Original file line number Diff line number Diff line change
Expand Up @@ -726,13 +726,6 @@ class Tensor : public TensorBase {
void record_stream(at::Stream s) const;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void record_stream(at::cuda::CUDAStream s) const;
// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// at::Stream
#ifdef PADDLE_WITH_HIP
void record_stream(hipStream_t s) const;
#else
void record_stream(cudaStream_t s) const;
#endif
#endif

Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); }
Expand Down
28 changes: 1 addition & 27 deletions paddle/phi/api/include/compat/ATen/ops/record_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,32 +53,6 @@ inline void Tensor::record_stream(at::Stream s) const {
inline void Tensor::record_stream(at::cuda::CUDAStream s) const {
record_stream(static_cast<at::Stream>(s));
}

// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// at::Stream
#ifdef PADDLE_WITH_HIP
inline void Tensor::record_stream(hipStream_t s) const {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
PD_CHECK(dense_tensor != nullptr,
"record_stream only supports DenseTensor, but got a non-dense "
"tensor implementation.");
PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
"record_stream is not supported for CPU tensors.");
paddle::memory::RecordStream(dense_tensor->Holder(), s);
}
#else
inline void Tensor::record_stream(cudaStream_t s) const {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
PD_CHECK(dense_tensor != nullptr,
"record_stream only supports DenseTensor, but got a non-dense "
"tensor implementation.");
PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
"record_stream is not supported for CPU tensors.");
paddle::memory::RecordStream(dense_tensor->Holder(),
reinterpret_cast<gpuStream_t>(s));
}
#endif
#endif

} // namespace at
19 changes: 0 additions & 19 deletions paddle/phi/api/include/compat/c10/core/Event.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,25 +95,6 @@ struct Event final {
void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); }
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// c10::Stream
#ifdef PADDLE_WITH_HIP
void record(const hipStream_t& stream) {
TORCH_CHECK(device_type_ == DeviceType::CUDA,
"Raw hipStream_t recording is only supported for CUDA events.");
RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
}
#else
void record(const cudaStream_t& stream) {
TORCH_CHECK(
device_type_ == DeviceType::CUDA,
"Raw cudaStream_t recording is only supported for CUDA events.");
RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
}
#endif
#endif

void block(const Stream& stream) const {
if (!was_marked_for_recording_) {
return;
Expand Down
4 changes: 4 additions & 0 deletions paddle/phi/api/include/compat/c10/core/Stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,7 @@ struct hash<c10::Stream> {
}
};
} // namespace std

namespace at {
using c10::Stream;
}
Comment on lines +108 to +111
Copy link

Copilot AI Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding namespace at { using c10::Stream; } here conflicts with an existing at::Stream definition in paddle/phi/api/include/compat/ATen/core/TensorBody.h (currently using Stream = c10::Stream;). Since TensorBody.h includes <c10/core/Stream.h>, this will trigger a redefinition error when compiling any TU that includes ATen/core/TensorBody.h.

To fix: keep a single canonical at::Stream definition (either remove the Stream alias from TensorBody.h, or drop this new at::Stream export and rely on the existing one).

Suggested change
namespace at {
using c10::Stream;
}

Copilot uses AI. Check for mistakes.
15 changes: 15 additions & 0 deletions paddle/phi/api/include/compat/c10/cuda/CUDAStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,21 @@ inline void initTLSCurrentStreams() {

} // namespace

#ifdef PADDLE_WITH_HIP
inline CUDAStream make_cuda_stream(hipStream_t raw,
c10::DeviceIndex device_index) {
#else
inline CUDAStream make_cuda_stream(cudaStream_t raw,
c10::DeviceIndex device_index) {
#endif
c10::StreamId sid =
static_cast<c10::StreamId>(reinterpret_cast<intptr_t>(raw));
return CUDAStream(
c10::Stream(c10::Stream::UNSAFE,
c10::Device(c10::DeviceType::CUDA, device_index),
sid));
}

CUDAStream getStreamFromPool(const int priority,
c10::DeviceIndex device_index) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Expand Down
28 changes: 0 additions & 28 deletions paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,6 @@ class CUDAStream {

Device device() const { return Device(DeviceType::CUDA, device_index()); }

#ifdef PADDLE_WITH_HIP
hipStream_t raw_stream() const { return stream(); }
#else
cudaStream_t raw_stream() const { return stream(); }
#endif

struct c10::StreamData3 pack3() const {
return stream_.pack3();
}
Expand Down Expand Up @@ -139,28 +133,6 @@ class CUDAStream {
Stream stream_;
};

#ifdef PADDLE_WITH_HIP
inline CUDAStream make_cuda_stream(hipStream_t raw,
c10::DeviceIndex device_index) {
c10::StreamId sid =
static_cast<c10::StreamId>(reinterpret_cast<intptr_t>(raw));
return CUDAStream(
c10::Stream(c10::Stream::UNSAFE,
c10::Device(c10::DeviceType::CUDA, device_index),
sid));
}
#else
inline CUDAStream make_cuda_stream(cudaStream_t raw,
c10::DeviceIndex device_index) {
c10::StreamId sid =
static_cast<c10::StreamId>(reinterpret_cast<intptr_t>(raw));
return CUDAStream(
c10::Stream(c10::Stream::UNSAFE,
c10::Device(c10::DeviceType::CUDA, device_index),
sid));
}
#endif

/**
* Get the current CUDA stream for the passed CUDA device, or for the
* current device if no device index is passed.
Expand Down
18 changes: 0 additions & 18 deletions test/cpp/compat/ATen_record_stream_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,6 @@ using RecordCudaStreamMethod = void (at::Tensor::*)(at::cuda::CUDAStream) const;
[[maybe_unused]] static RecordCudaStreamMethod g_record_cuda_stream_method =
&at::Tensor::record_stream;

// Raw stream type is platform-specific:
// - CUDA: cudaStream_t (CUstream_st*)
// - HIP: hipStream_t (ihipStream_t*)
// Only test the raw stream overload on CUDA builds where cudaStream_t is
// consistently defined. HIP builds use hipStream_t which is a different type.
#if defined(PADDLE_WITH_CUDA)
using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const;
[[maybe_unused]] static RecordRawCudaStreamMethod
g_record_raw_cuda_stream_method = &at::Tensor::record_stream;
#endif

TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) {
if (!at::cuda::is_available()) {
return;
Expand All @@ -80,13 +69,6 @@ TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) {
EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream));
}

TEST_F(RecordStreamTest, CudaTensorRawCudaStream) {
if (!at::cuda::is_available()) {
return;
}
auto stream = at::cuda::getCurrentCUDAStream();
EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream()));
}
#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP

// --- Error path: CPU tensor + CPU stream (record_stream does not support CPU
Expand Down
31 changes: 0 additions & 31 deletions test/cpp/compat/c10_Event_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,6 @@ TEST(EventTest, CpuEventRecordThrows) {
EXPECT_THROW(event.recordOnce(stream), std::exception);
}

// Test device_count() works in both CPU and CUDA builds
TEST(EventTest, DeviceCount) {
c10::DeviceIndex count = c10::cuda::device_count();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// In CUDA builds, should return actual device count (>= 0)
EXPECT_GE(count, 0);
#else
// In CPU-only builds, should return 0
EXPECT_EQ(count, 0);
#endif
}

#ifdef PADDLE_WITH_CUDA
using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&);
[[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method =
&c10::Event::record;
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(EventTest, CudaEventLazyCreateAndRecord) {
if (!at::cuda::is_available()) {
Expand Down Expand Up @@ -111,19 +93,6 @@ TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) {
EXPECT_GE(elapsed_ms, 0.0);
}

#ifdef PADDLE_WITH_CUDA
TEST(EventTest, CudaEventRawStreamRecordCompatibility) {
if (!at::cuda::is_available()) {
return;
}
auto stream = c10::cuda::getCurrentCUDAStream();
c10::Event event(c10::DeviceType::CUDA);
EXPECT_NO_THROW(event.record(stream.raw_stream()));
EXPECT_EQ(event.device_index(), stream.device_index());
EXPECT_TRUE(event.was_marked_for_recording());
}
#endif

TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) {
if (c10::cuda::device_count() < 2) {
return;
Expand Down
48 changes: 0 additions & 48 deletions test/cpp/compat/c10_Stream_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,20 +64,6 @@ void CUDART_CB BlockingStreamCallback(void* user_data) {
}
}

void EnqueueBlockingCallback(const c10::cuda::CUDAStream& stream,
StreamCallbackGate* gate) {
C10_CUDA_CHECK(
cudaLaunchHostFunc(stream.raw_stream(), BlockingStreamCallback, gate));
}

void CreateRawStream(cudaStream_t* stream) {
C10_CUDA_CHECK(cudaStreamCreate(stream));
}

void DestroyRawStream(cudaStream_t stream) {
C10_CUDA_CHECK(cudaStreamDestroy(stream));
}

void ClearLastStreamError() { (void)cudaGetLastError(); }
#endif

Expand Down Expand Up @@ -152,40 +138,6 @@ TEST(StreamTest, QueryCudaStreamReady) {
EXPECT_TRUE(s.query());
}

TEST(StreamTest, QueryCudaStreamNotReadyReturnsFalse) {
if (!at::cuda::is_available()) {
return;
}
auto cuda_stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false);
StreamCallbackGate release_callback{false};
ASSERT_NO_THROW(EnqueueBlockingCallback(cuda_stream, &release_callback));

c10::Stream s = cuda_stream.unwrap();
EXPECT_FALSE(s.query());

release_callback.store(true, std::memory_order_release);
EXPECT_NO_THROW(s.synchronize());
}

TEST(StreamTest, QueryCudaStreamInvalidHandleThrows) {
if (!at::cuda::is_available()) {
return;
}

auto device_index = c10::cuda::getCurrentCUDAStream().device_index();
#ifdef PADDLE_WITH_HIP
hipStream_t raw_stream = nullptr;
#else
cudaStream_t raw_stream = nullptr;
#endif
ASSERT_NO_THROW(CreateRawStream(&raw_stream));

auto cuda_stream = c10::cuda::getStreamFromExternal(raw_stream, device_index);
ASSERT_NO_THROW(DestroyRawStream(raw_stream));

EXPECT_THROW(cuda_stream.query(), std::exception);
ClearLastStreamError();
}
#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP

// ==================== synchronize ====================
Expand Down
Loading