Skip to content

Commit dc9e9f3

Browse files
zhichenxu-metameta-codesync[bot]
authored and committed
feat(rpc): Add AIMD congestion control and batch dispatch chunking for async RPC batch mode (#17227)
Summary: Pull Request resolved: #17227 Batch dispatch chunking and AIMD congestion control for async RPC operators in batch mode. Problem: When using batch mode, dispatch_batch_size was not actually splitting rows — all rows were sent as a single RPC call, potentially exceeding the server's concurrent request limit. Additionally, there was no backpressure mechanism to throttle dispatch when the server was overloaded. Changes: 1. Batch dispatch chunking: flushBatch(maxRows) drains only maxRows from pending rows instead of all. RPCOperator loops flushBatchRequests(dispatchBatchSize_) to flush in chunks. AsyncRPCFunction.h updated with maxRows parameter. 2. Backpressure check in addInput: while loop checks isUnderBackpressure() between flushes to prevent overshooting maxPendingBatches. 3. AIMD congestion control: RPCState tracks effectiveMaxPendingBatches_ (starts at maxPendingBatches_=2). On success: +1 (additive increase). On error (>50% real errors or _rpc_retried signal): /2 (multiplicative decrease, floor 1). Null input responses are excluded from error counting. Suppresses redundant "decreased from 1 to 1" log messages. Reviewed By: Yuhta Differential Revision: D101062260 fbshipit-source-id: cc0f809cabdbf8c9b0abe53305cabaa10ba3b645
1 parent 3eb73e9 commit dc9e9f3

5 files changed

Lines changed: 123 additions & 17 deletions

File tree

velox/exec/rpc/RPCOperator.cpp

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -191,12 +191,17 @@ void RPCOperator::addInput(RowVectorPtr input) {
191191

192192
if (dispatchBatchSize_ > 0 &&
193193
function_->pendingBatchSize() >= dispatchBatchSize_) {
194-
flushBatchRequests();
194+
// Flush in chunks of dispatchBatchSize_ to avoid sending one
195+
// giant batch_predict call that overwhelms the server.
196+
while (function_->pendingBatchSize() >= dispatchBatchSize_ &&
197+
!state_->isUnderBackpressure()) {
198+
flushBatchRequests(dispatchBatchSize_);
199+
}
195200
}
196201
}
197202
}
198203

199-
void RPCOperator::flushBatchRequests() {
204+
void RPCOperator::flushBatchRequests(int32_t maxRows) {
200205
if (function_->pendingBatchSize() == 0) {
201206
VELOX_CHECK(
202207
batchRowLocations_.empty(),
@@ -207,13 +212,24 @@ void RPCOperator::flushBatchRequests() {
207212
return;
208213
}
209214

210-
RPC_OP_LOG(INFO) << "Flushing batch with " << function_->pendingBatchSize()
211-
<< " accumulated rows";
215+
// Determine how many rows to flush.
216+
auto flushCount = maxRows > 0
217+
? std::min(static_cast<int32_t>(batchRowLocations_.size()), maxRows)
218+
: static_cast<int32_t>(batchRowLocations_.size());
219+
220+
RPC_OP_LOG(INFO) << "Flushing batch with " << flushCount << " of "
221+
<< function_->pendingBatchSize() << " accumulated rows";
212222

213-
auto rowLocations = std::move(batchRowLocations_);
214-
auto rowIds = std::move(batchRowIds_);
223+
// Split off the rows to flush.
224+
std::vector<RPCState::RowLocation> rowLocations(
225+
batchRowLocations_.begin(), batchRowLocations_.begin() + flushCount);
226+
std::vector<int64_t> rowIds(
227+
batchRowIds_.begin(), batchRowIds_.begin() + flushCount);
228+
batchRowLocations_.erase(
229+
batchRowLocations_.begin(), batchRowLocations_.begin() + flushCount);
230+
batchRowIds_.erase(batchRowIds_.begin(), batchRowIds_.begin() + flushCount);
215231

216-
auto future = function_->flushBatch();
232+
auto future = function_->flushBatch(maxRows);
217233

218234
// Count each flushBatch() as 1 pending unit in the rate limiter.
219235
auto token = std::make_shared<RPCRateLimiter::Token>(
@@ -252,9 +268,9 @@ void RPCOperator::noMoreInput() {
252268
<< numRequestsDispatched_;
253269

254270
if (state_->streamingMode() == RPCStreamingMode::kBatch) {
255-
// Flush any remaining accumulated rows.
256-
if (function_->pendingBatchSize() > 0) {
257-
flushBatchRequests();
271+
// Flush any remaining accumulated rows in chunks.
272+
while (function_->pendingBatchSize() > 0) {
273+
flushBatchRequests(dispatchBatchSize_ > 0 ? dispatchBatchSize_ : 0);
258274
}
259275
}
260276

@@ -311,6 +327,16 @@ RowVectorPtr RPCOperator::getOutput() {
311327
numErrors_++;
312328
}
313329
}
330+
331+
// Delegate congestion evaluation to the function.
332+
// The function knows its domain-specific error semantics.
333+
auto signal = function_->evaluateCongestion(claimedBatch_->responses);
334+
if (signal == AsyncRPCFunction::CongestionSignal::kError) {
335+
state_->onBatchError();
336+
} else if (signal == AsyncRPCFunction::CongestionSignal::kSuccess) {
337+
state_->onBatchSuccess(function_->congestionRecoveryIncrement());
338+
}
339+
314340
auto output = buildOutputFromReadyBatch(*claimedBatch_);
315341
numResponsesCollected_ += numRows;
316342
claimedBatch_.reset();

velox/exec/rpc/RPCOperator.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ class RPCOperator : public exec::Operator {
108108
private:
109109
/// Flush accumulated batch rows via function_->flushBatch().
110110
/// Called when threshold is reached or at noMoreInput/drain time.
111-
void flushBatchRequests();
111+
/// @param maxRows Maximum rows to flush. 0 means flush all.
112+
void flushBatchRequests(int32_t maxRows = 0);
112113

113114
/// Build output RowVector from ready rows (PER_ROW mode).
114115
/// Supports multiple rows via batched drain for pipeline efficiency.
@@ -174,7 +175,7 @@ class RPCOperator : public exec::Operator {
174175
// This is a ceiling — the operator returns as soon as results are ready.
175176
// Batch LLM inference can take many minutes due to MetaGen queuing
176177
// and GPU scheduling, so the timeout needs generous headroom.
177-
static constexpr auto kBatchRpcTimeout = std::chrono::milliseconds(1'800'000);
178+
static constexpr auto kBatchRpcTimeout = std::chrono::milliseconds(3'600'000);
178179

179180
// Block wait time tracking for runtime stats.
180181
std::optional<uint64_t> blockWaitStartNs_;

velox/exec/rpc/RPCState.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ void RPCState::setMaxPendingRows(int64_t maxPendingRows) {
4444

4545
void RPCState::setMaxPendingBatches(int64_t maxPendingBatches) {
4646
maxPendingBatches_ = maxPendingBatches;
47+
effectiveMaxPendingBatches_ = maxPendingBatches;
4748
}
4849

4950
// ===== Input batch storage =====
@@ -340,11 +341,35 @@ bool RPCState::isFinished() {
340341
bool RPCState::isUnderBackpressure() {
341342
std::lock_guard<std::mutex> l(mutex_);
342343
if (streamingMode_ == RPCStreamingMode::kBatch) {
343-
return static_cast<int64_t>(pendingBatches_.size()) >= maxPendingBatches_;
344+
return static_cast<int64_t>(pendingBatches_.size()) >=
345+
effectiveMaxPendingBatches_;
344346
}
345347
return numPendingRows_ >= maxPendingRows_;
346348
}
347349

350+
void RPCState::onBatchSuccess(int64_t increment) {
351+
std::lock_guard<std::mutex> l(mutex_);
352+
if (effectiveMaxPendingBatches_ < maxPendingBatches_) {
353+
effectiveMaxPendingBatches_ =
354+
std::min(effectiveMaxPendingBatches_ + increment, maxPendingBatches_);
355+
RPC_STATE_LOG(INFO) << "RPC congestion: batch success, window increased to "
356+
<< effectiveMaxPendingBatches_ << "/"
357+
<< maxPendingBatches_;
358+
}
359+
}
360+
361+
void RPCState::onBatchError() {
362+
std::lock_guard<std::mutex> l(mutex_);
363+
auto prev = effectiveMaxPendingBatches_;
364+
effectiveMaxPendingBatches_ =
365+
std::max<int64_t>(effectiveMaxPendingBatches_ / 2, 1);
366+
if (effectiveMaxPendingBatches_ < prev) {
367+
RPC_STATE_LOG(WARNING)
368+
<< "RPC congestion: batch error, window decreased from " << prev
369+
<< " to " << effectiveMaxPendingBatches_;
370+
}
371+
}
372+
348373
void RPCState::notifyWaitersLocked() {
349374
// Fulfill all promises to wake up blocked drivers.
350375
// Called while mutex_ is held.

velox/exec/rpc/RPCState.h

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,9 +224,20 @@ class RPCState {
224224

225225
/// Returns true if backpressure should be applied. Thread-safe.
226226
/// PER_ROW mode: pending rows >= maxPendingRows.
227-
/// BATCH mode: pending batches >= maxPendingBatches.
227+
/// BATCH mode: pending batches >= effectiveMaxPendingBatches
228+
/// (congestion-adjusted).
228229
bool isUnderBackpressure();
229230

231+
/// Signal that a batch completed successfully (all responses non-empty).
232+
/// Increases the effective concurrency window by increment (additive
233+
/// increase). Thread-safe.
234+
void onBatchSuccess(int64_t increment = 2);
235+
236+
/// Signal that a batch had errors (e.g., empty responses from overload).
237+
/// Halves the effective concurrency window (multiplicative decrease).
238+
/// Thread-safe.
239+
void onBatchError();
240+
230241
private:
231242
/// Move a completed row into readyRows_ and notify waiters.
232243
/// Called from the RPC completion callback (runs on executor thread).
@@ -253,7 +264,14 @@ class RPCState {
253264
bool noMoreInput_{false};
254265
RPCStreamingMode streamingMode_{RPCStreamingMode::kPerRow};
255266
int64_t maxPendingRows_{100};
256-
int64_t maxPendingBatches_{10};
267+
int64_t maxPendingBatches_{2};
268+
269+
// Congestion control for BATCH mode.
270+
// effectiveMaxPendingBatches_ starts at maxPendingBatches_ and adjusts:
271+
// - On success: min(effective + 1, maxPendingBatches_) (additive increase)
272+
// - On error: max(effective / 2, 1) (multiplicative
273+
// decrease)
274+
int64_t effectiveMaxPendingBatches_{2};
257275
};
258276

259277
} // namespace facebook::velox::exec::rpc

velox/expression/rpc/AsyncRPCFunction.h

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,13 +141,20 @@ class AsyncRPCFunction {
141141
/// The function builds the typed batch request from its internal
142142
/// accumulated state and dispatches it.
143143
///
144-
/// Returns responses for ALL accumulated rows. Null rows get
144+
/// @param maxRows Maximum number of rows to flush. 0 means flush all.
145+
/// Returns responses for the flushed rows. Null rows get
145146
/// RPCResponse{.error = "null_input"}. This keeps the operator
146147
/// completely agnostic to null handling in batch mode.
147-
virtual folly::SemiFuture<std::vector<RPCResponse>> flushBatch() {
148+
virtual folly::SemiFuture<std::vector<RPCResponse>> flushBatch(
149+
int32_t /*maxRows*/) {
148150
VELOX_UNSUPPORTED("flushBatch() not implemented for function '{}'", name());
149151
}
150152

153+
/// Convenience overload: flush all accumulated rows.
154+
virtual folly::SemiFuture<std::vector<RPCResponse>> flushBatch() {
155+
return flushBatch(0);
156+
}
157+
151158
/// Number of rows accumulated so far (for threshold checks).
152159
/// Batch-capable functions MUST override this; the operator uses
153160
/// function_->pendingBatchSize() >= dispatchBatchSize_ to decide
@@ -177,6 +184,35 @@ class AsyncRPCFunction {
177184
}
178185
return result;
179186
}
187+
188+
// ── Congestion Control ───────────────────────────────────────
189+
190+
/// Signal returned by evaluateCongestion() to indicate batch health.
191+
enum class CongestionSignal {
192+
/// Batch completed successfully — increase concurrency window.
193+
kSuccess,
194+
/// Batch had errors — decrease concurrency window.
195+
kError,
196+
/// No congestion evaluation — skip window adjustment.
197+
kNone,
198+
};
199+
200+
/// Evaluate batch congestion from completed responses.
201+
/// Called by RPCOperator after a BATCH-mode batch completes.
202+
/// The function inspects responses and returns a signal that the
203+
/// operator maps to window adjustments (additive increase on
204+
/// kSuccess, multiplicative decrease on kError).
205+
/// Default: kNone (no congestion control).
206+
virtual CongestionSignal evaluateCongestion(
207+
const std::vector<RPCResponse>& /*responses*/) const {
208+
return CongestionSignal::kNone;
209+
}
210+
211+
/// How much to increase the concurrency window on kSuccess.
212+
/// Override to tune recovery speed per client. Default: 2.
213+
virtual int64_t congestionRecoveryIncrement() const {
214+
return 2;
215+
}
180216
};
181217

182218
} // namespace facebook::velox::exec::rpc

0 commit comments

Comments
 (0)