-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Guard against late-arriving polls after worker shutdown #9330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from all commits
99c530f
d7d6577
76ac033
5c1df05
d663c23
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -475,6 +475,14 @@ Deleted Redirect Rules will be kept in the DB (with DeleteTimestamp). After this | |
| `PollerHistoryTTL is the time to live for poller histories in the pollerHistory cache of a physical task queue. Poller histories are fetched when | ||
| requiring a list of pollers that polled a given task queue.`, | ||
| ) | ||
| ShutdownWorkerCacheTTL = NewGlobalDurationSetting( | ||
| "matching.ShutdownWorkerCacheTTL", | ||
| 70*time.Second, | ||
| `ShutdownWorkerCacheTTL is the time to live for entries in the shutdown worker cache. When a worker calls | ||
| ShutdownWorker, its WorkerInstanceKey is cached for this duration. Any poll arriving with a cached | ||
| WorkerInstanceKey returns empty immediately, preventing task dispatch to a shutting-down worker. | ||
| This should be longer than MatchingLongPollExpirationInterval (1 min default) to catch in-flight polls.`, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This isn't related to the length of a long poll at all; it's about the interval during which RPCs can get reordered on the network. Which is unbounded in theory, but practically something like 10s-30s should be fine. The SDK isn't waiting that long anyway between calling ShutdownWorker and actually exiting, right? |
||
| ) | ||
| ReachabilityBuildIdVisibilityGracePeriod = NewNamespaceDurationSetting( | ||
| "matching.wv.ReachabilityBuildIdVisibilityGracePeriod", | ||
| 3*time.Minute, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ import ( | |
| "go.temporal.io/server/client/matching" | ||
| "go.temporal.io/server/common" | ||
| "go.temporal.io/server/common/backoff" | ||
| "go.temporal.io/server/common/cache" | ||
| "go.temporal.io/server/common/clock" | ||
| hlc "go.temporal.io/server/common/clock/hybrid_logical_clock" | ||
| "go.temporal.io/server/common/cluster" | ||
|
|
@@ -165,6 +166,10 @@ type ( | |
| outstandingPollers collection.SyncMap[string, context.CancelFunc] | ||
| // workerInstancePollers tracks pollers by worker instance key for bulk cancellation during shutdown. | ||
| workerInstancePollers workerPollerTracker | ||
| // shutdownWorkers caches WorkerInstanceKeys of workers that have initiated shutdown. | ||
| // Polls arriving with a cached key return empty immediately to prevent task dispatch | ||
| // to a shutting-down worker (handles race where poll arrives after cancellation completes). | ||
| shutdownWorkers cache.Cache | ||
| // Only set if global namespaces are enabled on the cluster. | ||
| namespaceReplicationQueue persistence.NamespaceReplicationQueue | ||
| // Lock to serialize replication queue updates. | ||
|
|
@@ -293,6 +298,8 @@ func NewEngine( | |
| nexusResults: collection.NewSyncMap[string, chan *nexusResult](), | ||
| outstandingPollers: collection.NewSyncMap[string, context.CancelFunc](), | ||
| workerInstancePollers: workerPollerTracker{pollers: make(map[string]map[string]context.CancelFunc)}, | ||
| // 50000 entries ≈ 10MB (each entry ~200 bytes: UUID key + cache overhead) | ||
| shutdownWorkers: cache.New(50000, &cache.Options{TTL: config.ShutdownWorkerCacheTTL()}), | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you're going to use dynamic config, why not the cache size too? (I'm fine with neither being dynamic, actually) |
||
| namespaceReplicationQueue: namespaceReplicationQueue, | ||
| userDataUpdateBatchers: collection.NewSyncMap[namespace.ID, *stream_batcher.Batcher[*userDataUpdate, error]](), | ||
| rateLimiter: rateLimiter, | ||
|
|
@@ -650,6 +657,14 @@ func (e *matchingEngineImpl) PollWorkflowTaskQueue( | |
| request := req.PollRequest | ||
| taskQueueName := request.TaskQueue.GetName() | ||
|
|
||
| // Return empty immediately if this worker has already initiated shutdown. | ||
| // This guards against polls that arrive after CancelOutstandingWorkerPolls completed. | ||
| if workerInstanceKey := request.GetWorkerInstanceKey(); workerInstanceKey != "" { | ||
| if e.shutdownWorkers.Get(workerInstanceKey) != nil { | ||
| return emptyPollWorkflowTaskQueueResponse, nil | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would think it should |
||
| } | ||
| } | ||
|
|
||
| // Namespace field is not populated for forwarded requests. | ||
| if len(request.Namespace) == 0 { | ||
| ns, err := e.namespaceRegistry.GetNamespaceName(namespace.ID(req.GetNamespaceId())) | ||
|
|
@@ -952,6 +967,14 @@ func (e *matchingEngineImpl) PollActivityTaskQueue( | |
| request := req.PollRequest | ||
| taskQueueName := request.TaskQueue.GetName() | ||
|
|
||
| // Return empty immediately if this worker has already initiated shutdown. | ||
| // This guards against polls that arrive after CancelOutstandingWorkerPolls completed. | ||
| if workerInstanceKey := request.GetWorkerInstanceKey(); workerInstanceKey != "" { | ||
| if e.shutdownWorkers.Get(workerInstanceKey) != nil { | ||
| return emptyPollActivityTaskQueueResponse, nil | ||
| } | ||
| } | ||
|
|
||
| // Namespace field is not populated for forwarded requests. | ||
| if len(request.Namespace) == 0 { | ||
| ns, err := e.namespaceRegistry.GetNamespaceName(namespace.ID(req.GetNamespaceId())) | ||
|
|
@@ -1212,7 +1235,16 @@ func (e *matchingEngineImpl) CancelOutstandingWorkerPolls( | |
| ctx context.Context, | ||
| request *matchingservice.CancelOutstandingWorkerPollsRequest, | ||
| ) (*matchingservice.CancelOutstandingWorkerPollsResponse, error) { | ||
| cancelledCount := e.workerInstancePollers.CancelAll(request.WorkerInstanceKey) | ||
| workerInstanceKey := request.WorkerInstanceKey | ||
| cancelledCount := e.workerInstancePollers.CancelAll(workerInstanceKey) | ||
|
|
||
| // Cache the WorkerInstanceKey to guard against polls that arrive after this | ||
| // cancellation completes (edge case: poll was already in-flight when shutdown started). | ||
| // Any new poll with this key will return empty immediately. | ||
| if workerInstanceKey != "" { | ||
| e.shutdownWorkers.Put(workerInstanceKey, struct{}{}) | ||
| } | ||
|
|
||
| e.removePollerFromHistory(ctx, request) | ||
| return &matchingservice.CancelOutstandingWorkerPollsResponse{CancelledCount: cancelledCount}, nil | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that this requires a process restart to take effect