PaddlePaddle · youge325 · Mar 24, 2026 · Mar 25, 2026 · Mar 25, 2026 · Apr 3, 2026
diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h
@@ -726,13 +726,6 @@ class Tensor : public TensorBase {
   void record_stream(at::Stream s) const;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void record_stream(at::cuda::CUDAStream s) const;
-// TODO(youge325): Remove after DeepEP paddle branch is updated to use
-// at::Stream
-#ifdef PADDLE_WITH_HIP
-  void record_stream(hipStream_t s) const;
-#else
-  void record_stream(cudaStream_t s) const;
-#endif
 #endif
 
   Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); }

diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h
@@ -53,32 +53,6 @@ inline void Tensor::record_stream(at::Stream s) const {
 inline void Tensor::record_stream(at::cuda::CUDAStream s) const {
   record_stream(static_cast<at::Stream>(s));
 }
-
-// TODO(youge325): Remove after DeepEP paddle branch is updated to use
-// at::Stream
-#ifdef PADDLE_WITH_HIP
-inline void Tensor::record_stream(hipStream_t s) const {
-  auto dense_tensor =
-      std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
-  PD_CHECK(dense_tensor != nullptr,
-           "record_stream only supports DenseTensor, but got a non-dense "
-           "tensor implementation.");
-  PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
-           "record_stream is not supported for CPU tensors.");
-  paddle::memory::RecordStream(dense_tensor->Holder(), s);
-}
-#else
-inline void Tensor::record_stream(cudaStream_t s) const {
-  auto dense_tensor =
-      std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
-  PD_CHECK(dense_tensor != nullptr,
-           "record_stream only supports DenseTensor, but got a non-dense "
-           "tensor implementation.");
-  PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
-           "record_stream is not supported for CPU tensors.");
-  paddle::memory::RecordStream(dense_tensor->Holder(),
-                               reinterpret_cast<gpuStream_t>(s));
-}
-#endif
 #endif
+
 }  // namespace at
diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h
@@ -95,25 +95,6 @@ struct Event final {
   void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); }
 #endif
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  // TODO(youge325): Remove after DeepEP paddle branch is updated to use
-  // c10::Stream
-#ifdef PADDLE_WITH_HIP
-  void record(const hipStream_t& stream) {
-    TORCH_CHECK(device_type_ == DeviceType::CUDA,
-                "Raw hipStream_t recording is only supported for CUDA events.");
-    RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
-  }
-#else
-  void record(const cudaStream_t& stream) {
-    TORCH_CHECK(
-        device_type_ == DeviceType::CUDA,
-        "Raw cudaStream_t recording is only supported for CUDA events.");
-    RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
-  }
-#endif
-#endif
-
   void block(const Stream& stream) const {
     if (!was_marked_for_recording_) {
       return;

diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h
@@ -105,3 +105,7 @@ struct hash<c10::Stream> {
   }
 };
 }  // namespace std
+
+namespace at {
+using c10::Stream;
+}
-
-namespace at {
-using c10::Stream;
-}
-
-namespace at {
-using c10::Stream;
-}
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.cpp b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.cpp
@@ -115,6 +115,21 @@ inline void initTLSCurrentStreams() {
 
 }  // namespace
 
+#ifdef PADDLE_WITH_HIP
+inline CUDAStream make_cuda_stream(hipStream_t raw,
+                                   c10::DeviceIndex device_index) {
+#else
+inline CUDAStream make_cuda_stream(cudaStream_t raw,
+                                   c10::DeviceIndex device_index) {
+#endif
+  c10::StreamId sid =
+      static_cast<c10::StreamId>(reinterpret_cast<intptr_t>(raw));
+  return CUDAStream(
+      c10::Stream(c10::Stream::UNSAFE,
+                  c10::Device(c10::DeviceType::CUDA, device_index),
+                  sid));
+}
+
 CUDAStream getStreamFromPool(const int priority,
                              c10::DeviceIndex device_index) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
@@ -100,12 +100,6 @@ class CUDAStream {
 
   Device device() const { return Device(DeviceType::CUDA, device_index()); }
 
-#ifdef PADDLE_WITH_HIP
-  hipStream_t raw_stream() const { return stream(); }
-#else
-  cudaStream_t raw_stream() const { return stream(); }
-#endif
-
   struct c10::StreamData3 pack3() const {
     return stream_.pack3();
   }
@@ -139,28 +133,6 @@ class CUDAStream {
   Stream stream_;
 };
 
-#ifdef PADDLE_WITH_HIP
-inline CUDAStream make_cuda_stream(hipStream_t raw,
-                                   c10::DeviceIndex device_index) {
-  c10::StreamId sid =
-      static_cast<c10::StreamId>(reinterpret_cast<intptr_t>(raw));
-  return CUDAStream(
-      c10::Stream(c10::Stream::UNSAFE,
-                  c10::Device(c10::DeviceType::CUDA, device_index),
-                  sid));
-}
-#else
-inline CUDAStream make_cuda_stream(cudaStream_t raw,
-                                   c10::DeviceIndex device_index) {
-  c10::StreamId sid =
-      static_cast<c10::StreamId>(reinterpret_cast<intptr_t>(raw));
-  return CUDAStream(
-      c10::Stream(c10::Stream::UNSAFE,
-                  c10::Device(c10::DeviceType::CUDA, device_index),
-                  sid));
-}
-#endif
-
 /**
  * Get the current CUDA stream for the passed CUDA device, or for the
  * current device if no device index is passed.

diff --git a/test/cpp/compat/ATen_record_stream_test.cc b/test/cpp/compat/ATen_record_stream_test.cc
@@ -51,17 +51,6 @@ using RecordCudaStreamMethod = void (at::Tensor::*)(at::cuda::CUDAStream) const;
 [[maybe_unused]] static RecordCudaStreamMethod g_record_cuda_stream_method =
     &at::Tensor::record_stream;
 
-// Raw stream type is platform-specific:
-// - CUDA: cudaStream_t (CUstream_st*)
-// - HIP: hipStream_t (ihipStream_t*)
-// Only test the raw stream overload on CUDA builds where cudaStream_t is
-// consistently defined. HIP builds use hipStream_t which is a different type.
-#if defined(PADDLE_WITH_CUDA)
-using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const;
-[[maybe_unused]] static RecordRawCudaStreamMethod
-    g_record_raw_cuda_stream_method = &at::Tensor::record_stream;
-#endif
-
 TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) {
   if (!at::cuda::is_available()) {
     return;
@@ -80,13 +69,6 @@ TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) {
   EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream));
 }
 
-TEST_F(RecordStreamTest, CudaTensorRawCudaStream) {
-  if (!at::cuda::is_available()) {
-    return;
-  }
-  auto stream = at::cuda::getCurrentCUDAStream();
-  EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream()));
-}
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
 
 // --- Error path: CPU tensor + CPU stream (record_stream does not support CPU

diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc
@@ -40,24 +40,6 @@ TEST(EventTest, CpuEventRecordThrows) {
   EXPECT_THROW(event.recordOnce(stream), std::exception);
 }
 
-// Test device_count() works in both CPU and CUDA builds
-TEST(EventTest, DeviceCount) {
-  c10::DeviceIndex count = c10::cuda::device_count();
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  // In CUDA builds, should return actual device count (>= 0)
-  EXPECT_GE(count, 0);
-#else
-  // In CPU-only builds, should return 0
-  EXPECT_EQ(count, 0);
-#endif
-}
-
-#ifdef PADDLE_WITH_CUDA
-using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&);
-[[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method =
-    &c10::Event::record;
-#endif
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(EventTest, CudaEventLazyCreateAndRecord) {
   if (!at::cuda::is_available()) {
@@ -111,19 +93,6 @@ TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) {
   EXPECT_GE(elapsed_ms, 0.0);
 }
 
-#ifdef PADDLE_WITH_CUDA
-TEST(EventTest, CudaEventRawStreamRecordCompatibility) {
-  if (!at::cuda::is_available()) {
-    return;
-  }
-  auto stream = c10::cuda::getCurrentCUDAStream();
-  c10::Event event(c10::DeviceType::CUDA);
-  EXPECT_NO_THROW(event.record(stream.raw_stream()));
-  EXPECT_EQ(event.device_index(), stream.device_index());
-  EXPECT_TRUE(event.was_marked_for_recording());
-}
-#endif
-
 TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) {
   if (c10::cuda::device_count() < 2) {
     return;

diff --git a/test/cpp/compat/c10_Stream_test.cc b/test/cpp/compat/c10_Stream_test.cc
@@ -64,20 +64,6 @@ void CUDART_CB BlockingStreamCallback(void* user_data) {
   }
 }
 
-void EnqueueBlockingCallback(const c10::cuda::CUDAStream& stream,
-                             StreamCallbackGate* gate) {
-  C10_CUDA_CHECK(
-      cudaLaunchHostFunc(stream.raw_stream(), BlockingStreamCallback, gate));
-}
-
-void CreateRawStream(cudaStream_t* stream) {
-  C10_CUDA_CHECK(cudaStreamCreate(stream));
-}
-
-void DestroyRawStream(cudaStream_t stream) {
-  C10_CUDA_CHECK(cudaStreamDestroy(stream));
-}
-
 void ClearLastStreamError() { (void)cudaGetLastError(); }
 #endif
 
@@ -152,40 +138,6 @@ TEST(StreamTest, QueryCudaStreamReady) {
   EXPECT_TRUE(s.query());
 }
 
-TEST(StreamTest, QueryCudaStreamNotReadyReturnsFalse) {
-  if (!at::cuda::is_available()) {
-    return;
-  }
-  auto cuda_stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false);
-  StreamCallbackGate release_callback{false};
-  ASSERT_NO_THROW(EnqueueBlockingCallback(cuda_stream, &release_callback));
-
-  c10::Stream s = cuda_stream.unwrap();
-  EXPECT_FALSE(s.query());
-
-  release_callback.store(true, std::memory_order_release);
-  EXPECT_NO_THROW(s.synchronize());
-}
-
-TEST(StreamTest, QueryCudaStreamInvalidHandleThrows) {
-  if (!at::cuda::is_available()) {
-    return;
-  }
-
-  auto device_index = c10::cuda::getCurrentCUDAStream().device_index();
-#ifdef PADDLE_WITH_HIP
-  hipStream_t raw_stream = nullptr;
-#else
-  cudaStream_t raw_stream = nullptr;
-#endif
-  ASSERT_NO_THROW(CreateRawStream(&raw_stream));
-
-  auto cuda_stream = c10::cuda::getStreamFromExternal(raw_stream, device_index);
-  ASSERT_NO_THROW(DestroyRawStream(raw_stream));
-
-  EXPECT_THROW(cuda_stream.query(), std::exception);
-  ClearLastStreamError();
-}
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
 
 // ==================== synchronize ====================