Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
31e35cd
Store worker_instance_key in ActivityInfo
rkannan82 Feb 4, 2026
2874f4e
Add unit test
rkannan82 Feb 5, 2026
ad8c480
Fix lint
rkannan82 Feb 5, 2026
986e9a1
Remove redundant WorkerInstanceKey assignment in UpdateActivity callback
rkannan82 Feb 9, 2026
fb3066c
Store worker_control_task_queue in ActivityInfo
rkannan82 Feb 11, 2026
6ef2640
Update go.temporal.io/api to include worker_instance_key and worker_c…
rkannan82 Feb 11, 2026
eab94b0
Fix lint
rkannan82 Feb 11, 2026
0f8e48e
Remove worker_instance_key as it is not needed
rkannan82 Feb 12, 2026
5390759
Forward WorkerControlTaskQueue through matching service partitions
rkannan82 Feb 18, 2026
ac7be08
Define CancelActivityNexusTask transfer task type
rkannan82 Feb 3, 2026
113ca15
Fix lint errors
rkannan82 Feb 6, 2026
5ad9aeb
Regen proto
rkannan82 Feb 11, 2026
05adf1a
Remove worker_instance_key as it is not needed
rkannan82 Feb 12, 2026
39c1759
Move CancelActivityNexusTask creation to task_generator
rkannan82 Feb 12, 2026
131c278
Add WorkerControlTaskQueue to CancelActivityNexusTask
rkannan82 Feb 12, 2026
7bc8ada
Add metrics tag and low priority for CancelActivityNexusTask
rkannan82 Feb 12, 2026
01e4822
Update comment for standby executor
rkannan82 Feb 12, 2026
e9b1fac
Add Version field to CancelActivityNexusTask for multi-cluster support
rkannan82 Feb 12, 2026
faff7ee
Add comment
rkannan82 Feb 12, 2026
d6af322
Add ActivityCommandTask for outbound activity commands
rkannan82 Feb 18, 2026
7f5e0fc
Regenerate mocks with go-generate for correct ordering and parameter …
rkannan82 Feb 18, 2026
cd17334
Change ActivityCommandTask to use task_tokens instead of scheduled_ev…
rkannan82 Feb 18, 2026
6a1a66b
Change ActivityCommandTaskInfo to WorkerCommandsTask
rkannan82 Feb 25, 2026
fbf13b0
Add ActivityCommandTask dispatch via Nexus
rkannan82 Feb 18, 2026
d5591a4
Update api-go dependency to merged activity-cancel branch
rkannan82 Mar 13, 2026
537edb9
Merge branch 'kannan/activity-cancel/task-definition' into kannan/act…
rkannan82 Mar 13, 2026
d182f84
Update dispatcher to use WorkerCommandsRequest from nexusservices pac…
rkannan82 Mar 13, 2026
0f6651c
Update dispatcher for ExecuteCommands rename and extracted types
rkannan82 Mar 13, 2026
7f842cf
Generalize task to WorkerCommandsTask using API WorkerCommand type
rkannan82 Mar 13, 2026
2150c89
Merge branch 'kannan/activity-cancel/task-definition' into kannan/act…
rkannan82 Mar 13, 2026
1b85cce
Update dispatcher and integration test for WorkerCommandsTask
rkannan82 Mar 13, 2026
404ce2a
Rename TASK_TYPE_ACTIVITY_COMMAND to TASK_TYPE_WORKER_COMMANDS and re…
rkannan82 Mar 13, 2026
54232d3
Merge branch 'kannan/activity-cancel/task-definition' into kannan/act…
rkannan82 Mar 13, 2026
3e0a3a0
Rename activity_command_task.go to worker_commands_task.go
rkannan82 Mar 13, 2026
05a7c1d
Merge branch 'kannan/activity-cancel/task-definition' into kannan/act…
rkannan82 Mar 13, 2026
6614c3e
Add worker commands dispatcher with Nexus response handling and unit …
rkannan82 Mar 31, 2026
64c52e9
Fix lint errors: gofmt alignment, testifylint, importas, staticcheck,…
rkannan82 Mar 31, 2026
7343da1
Refactor worker commands dispatcher: deduplicate failure handling, re…
rkannan82 Mar 31, 2026
66dd3fb
Clean up dispatch response handling: inline failure conversion, impro…
rkannan82 Mar 31, 2026
dfac6d2
Store started_clock in ActivityInfo for task token reconstruction
rkannan82 Apr 1, 2026
29ac6e5
Cap worker commands task retries at 3 attempts
rkannan82 Apr 1, 2026
3fd7447
Add retry cap comment to dispatcher failure scenarios doc
rkannan82 Apr 1, 2026
412045e
Emit metric when dropping worker commands task at retry cap
rkannan82 Apr 1, 2026
083d313
Clarify attempt parameter in test calls with inline comment
rkannan82 Apr 1, 2026
8dec8b4
Update go.mod to latest api-go (includes API PR #708)
rkannan82 Apr 7, 2026
3e4267f
Address review feedback: backward compat, standby executor, lock safety
rkannan82 Apr 10, 2026
ca17749
Remove replace directive and update go.temporal.io/api to v1.62.8
rkannan82 Apr 11, 2026
0549e23
Fix compile error in convertTemporalFailure: := to = for named returns
rkannan82 Apr 11, 2026
466f4d2
Remove redundant StartedEventId check in cancel activity handler
rkannan82 Apr 11, 2026
061cc39
Merge origin/main into kannan/activity-cancel/dispatch-logic
rkannan82 Apr 11, 2026
e3a08a5
Revert executable_mock.go to main (Attempt() already merged via #9924)
rkannan82 Apr 11, 2026
8846349
Fix nits: align metric defs, replace assert with require in tests
rkannan82 Apr 11, 2026
3c4e62c
Remove unused common import in handler test
rkannan82 Apr 11, 2026
6a890ba
Replace workerservicepb.WorkerService with string constants
rkannan82 Apr 11, 2026
3420f9a
Fix GCI lint: alphabetize imports and align struct fields in test
rkannan82 Apr 11, 2026
56e9b73
Fix nil Clock in duplicate RecordActivityTaskStarted for pre-deploy a…
rkannan82 Apr 14, 2026
9859175
Improve RecordActivityTaskStarted test coverage and naming
rkannan82 Apr 14, 2026
7ef2b52
Add integration test for duplicate RecordActivityTaskStarted
rkannan82 Apr 14, 2026
a6ac85f
Address review feedback: move metric, check handler error retryability
rkannan82 Apr 14, 2026
0f546ca
Add description to WorkerCommandsSent metric
rkannan82 Apr 14, 2026
f4fe808
Use SDK failure converter instead of Nexus failure conversion
rkannan82 Apr 14, 2026
aa9dd98
Use retryable matching client for worker commands dispatch
rkannan82 Apr 14, 2026
7966342
Fix formatting alignment
rkannan82 Apr 14, 2026
c271f43
Merge branch 'main' into kannan/activity-cancel/dispatch-logic
rkannan82 Apr 15, 2026
d7190ef
Fix proto depIdxs conflict and test dependency for MatchingClient
Copilot Apr 15, 2026
0f09733
Merge branch 'main' into kannan/activity-cancel/dispatch-logic
rkannan82 Apr 15, 2026
59b72db
Merge branch 'main' into kannan/activity-cancel/dispatch-logic
rkannan82 Apr 15, 2026
cd56d9c
Merge branch 'main' into kannan/activity-cancel/dispatch-logic
rkannan82 Apr 15, 2026
b0d428d
Merge branch 'main' into kannan/activity-cancel/dispatch-logic
rkannan82 Apr 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 79 additions & 56 deletions api/persistence/v1/executions.pb.go

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions common/metrics/metric_defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,11 @@ var (
"nexus_completion_request_preprocess_errors",
WithDescription("The number of Nexus completion requests for which pre-processing failed."),
)
WorkerCommandsSent = NewCounterDef(
"worker_commands_sent",
WithDescription("The number of worker command dispatches, tagged by outcome (e.g. success, no_poller, rpc_error)."),
)

HostRPSLimit = NewGaugeDef("host_rps_limit")
NamespaceHostRPSLimit = NewGaugeDef("namespace_host_rps_limit")
HandoverWaitLatency = NewTimerDef("handover_wait_latency")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,21 @@ message ActivityInfo {
// A dedicated per-worker Nexus task queue on which the server sends control
// tasks (e.g. activity cancellation) to this specific worker instance.
string worker_control_task_queue = 51;

// The shard clock at the time this activity was started (RecordActivityTaskStarted).
// Matching uses this clock to build the task token sent to the worker. Stored here so
// that history can later reconstruct the same task token (e.g. for cancel worker commands).
//
// IMPORTANT: The clock approach requires history to reconstruct the token using
// the same fields and logic as matching — if NewActivityTaskToken changes, both
// call sites must stay in sync or the tokens will silently diverge. An alternative
// is to store the full serialized task token (~150-300 bytes), which avoids
// reconstruction entirely and is immune to token format changes. We chose the
// clock approach to keep the per-activity memory footprint minimal (~24 bytes).
//
// Replication: This field is part of ActivityInfo and is automatically replicated
// via state-based replication. No special handling is needed.
temporal.server.api.clock.v1.VectorClock started_clock = 52;
}

// timer_map column
Expand Down
23 changes: 23 additions & 0 deletions service/history/api/recordactivitytaskstarted/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,16 @@ func recordActivityTaskStarted(
if ai.RequestId == requestID {
response.StartedTime = ai.StartedTime
response.Attempt = ai.Attempt
if ai.StartedClock != nil {
response.Clock = ai.StartedClock
} else {
// Activity started before the StartedClock field was added.
// Create a fresh clock for the shard staleness check.
response.Clock, err = shardContext.NewVectorClock()
if err != nil {
return nil, rejectCodeUndefined, err
}
}
return response, rejectCodeAccepted, nil
}

Expand Down Expand Up @@ -238,6 +248,17 @@ func recordActivityTaskStarted(
}
}

// Create the shard clock before recording the start event. Matching uses the returned clock
// to build the task token sent to the worker. We store it in ActivityInfo so that history
// can later reconstruct the same task token that matching created (e.g. for cancel worker
// commands). On retries of this RPC, we return the stored clock from the early return
// path above.
clock, err := shardContext.NewVectorClock()
if err != nil {
return nil, rejectCodeUndefined, err
}
ai.StartedClock = clock

versioningStamp := worker_versioning.StampFromCapabilities(request.PollRequest.WorkerVersionCapabilities, request.PollRequest.DeploymentOptions) //nolint:staticcheck // SA1019: WorkerVersionCapabilities is deprecated but still used for old versioning [cleanup-old-wv]
if _, err := mutableState.AddActivityTaskStartedEvent(
ai, scheduledEventID, requestID, request.PollRequest.GetIdentity(),
Expand All @@ -261,6 +282,8 @@ func recordActivityTaskStarted(
),
).Record(scheduleToStartLatency)

response.Clock = clock

response.StartedTime = ai.StartedTime
response.Attempt = ai.Attempt
response.HeartbeatDetails = ai.LastHeartbeatDetails
Expand Down
112 changes: 112 additions & 0 deletions service/history/api/recordactivitytaskstarted/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,23 @@ import (
"context"
"testing"

"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
commonpb "go.temporal.io/api/common/v1"
enumspb "go.temporal.io/api/enums/v1"
historypb "go.temporal.io/api/history/v1"
clockspb "go.temporal.io/server/api/clock/v1"
deploymentspb "go.temporal.io/server/api/deployment/v1"
"go.temporal.io/server/api/historyservice/v1"
"go.temporal.io/server/api/matchingservice/v1"
"go.temporal.io/server/api/matchingservicemock/v1"
persistencespb "go.temporal.io/server/api/persistence/v1"
"go.temporal.io/server/common/cluster"
"go.temporal.io/server/common/metrics"
"go.temporal.io/server/common/namespace"
"go.temporal.io/server/common/worker_versioning"
historyi "go.temporal.io/server/service/history/interfaces"
"go.uber.org/mock/gomock"
"google.golang.org/protobuf/types/known/timestamppb"
)
Expand Down Expand Up @@ -265,3 +274,106 @@ func TestGetDeploymentVersionForWorkflowID_UnversionedTaskQueue(t *testing.T) {
assert.Nil(t, targetVersion, "Unversioned task queue should return nil version")
assert.Equal(t, int64(0), targetRevNum, "Unversioned task queue should return 0 revision number")
}

// testClusterName is the cluster name shared by the test fixtures in this file:
// it is passed to namespace.NewLocalNamespaceForTest when building the test
// namespace and is returned by the mocked cluster metadata's
// GetCurrentClusterName.
const testClusterName = "active"

// setupMutableStateWithStartedActivity wires up the mocks required to drive
// recordActivityTaskStarted into its duplicate-request early-return path.
//
// The mocked mutable state exposes a single activity that has already been
// started (its StartedEventId is populated) and whose StartedClock is the
// caller-supplied startedClock, which may be nil. The returned request uses the
// same RequestId and ScheduledEventId as that activity.
//
// Returns the shard mock, the mutable state mock, and the matching request.
func setupMutableStateWithStartedActivity(t *testing.T, startedClock *clockspb.VectorClock) (
	*historyi.MockShardContext,
	*historyi.MockMutableState,
	*historyservice.RecordActivityTaskStartedRequest,
) {
	t.Helper()

	const (
		scheduledID = int64(5)
		reqID       = "test-request-id"
	)

	controller := gomock.NewController(t)
	namespaceID := uuid.New().String()

	// Namespace registry: resolves the random namespace ID to a local test namespace.
	registry := namespace.NewMockRegistry(controller)
	registry.EXPECT().GetNamespaceByID(namespace.ID(namespaceID)).Return(
		namespace.NewLocalNamespaceForTest(
			&persistencespb.NamespaceInfo{Id: namespaceID, Name: "test-namespace"},
			&persistencespb.NamespaceConfig{},
			testClusterName,
		),
		nil,
	)

	// Cluster metadata: reports testClusterName as the current cluster.
	clusterMetadata := cluster.NewMockMetadata(controller)
	clusterMetadata.EXPECT().GetCurrentClusterName().Return(testClusterName)

	// Shard context: hands out the registry, cluster metadata, and a no-op metrics handler.
	shard := historyi.NewMockShardContext(controller)
	shard.EXPECT().GetNamespaceRegistry().Return(registry)
	shard.EXPECT().GetClusterMetadata().Return(clusterMetadata)
	shard.EXPECT().GetMetricsHandler().Return(metrics.NoopMetricsHandler).AnyTimes()

	// Activity that is already started and carries the caller-provided clock.
	startedActivity := &persistencespb.ActivityInfo{
		ScheduledEventId: scheduledID,
		StartedEventId:   7, // already started
		RequestId:        reqID,
		StartedTime:      timestamppb.Now(),
		Attempt:          1,
		StartedClock:     startedClock,
	}

	// Mutable state: serves the activity, its scheduled event, and empty execution info.
	mutableState := historyi.NewMockMutableState(controller)
	mutableState.EXPECT().GetActivityInfo(scheduledID).Return(startedActivity, true)
	mutableState.EXPECT().GetActivityScheduledEvent(gomock.Any(), scheduledID).Return(
		&historypb.HistoryEvent{EventId: scheduledID}, nil,
	)
	mutableState.EXPECT().GetExecutionInfo().Return(&persistencespb.WorkflowExecutionInfo{})

	return shard, mutableState, &historyservice.RecordActivityTaskStartedRequest{
		NamespaceId: namespaceID,
		WorkflowExecution: &commonpb.WorkflowExecution{
			WorkflowId: "test-wf-id",
			RunId:      "test-run-id",
		},
		ScheduledEventId: scheduledID,
		RequestId:        reqID,
	}
}

// TestRecordActivityTaskStarted_DuplicateRequest_NilStartedClock covers a
// duplicate RecordActivityTaskStarted request for an activity whose stored
// StartedClock is nil (i.e. it was started before the field existed). The
// handler is expected to fall back to the shard's NewVectorClock and return
// that clock in the response, so the response Clock is never nil.
func TestRecordActivityTaskStarted_DuplicateRequest_NilStartedClock(t *testing.T) {
	// nil => the activity has no StartedClock recorded.
	shard, ms, req := setupMutableStateWithStartedActivity(t, nil)

	// With no stored clock, exactly one fresh clock is requested from the shard.
	freshClock := &clockspb.VectorClock{ClusterId: 1, ShardId: 1, Clock: 42}
	shard.EXPECT().NewVectorClock().Return(freshClock, nil)

	ctx := context.Background()
	resp, code, err := recordActivityTaskStarted(ctx, shard, ms, req, nil, nil)

	require.NoError(t, err)
	require.Equal(t, rejectCodeAccepted, code)
	require.NotNil(t, resp.Clock, "Clock must be non-nil even for pre-deploy activities")
	require.Equal(t, freshClock, resp.Clock)
}

// TestRecordActivityTaskStarted_DuplicateRequest_WithStartedClock covers a
// duplicate RecordActivityTaskStarted request for an activity that already has
// a StartedClock. The response must carry that stored clock, and the shard must
// not be asked for a new one.
func TestRecordActivityTaskStarted_DuplicateRequest_WithStartedClock(t *testing.T) {
	persistedClock := &clockspb.VectorClock{ClusterId: 1, ShardId: 1, Clock: 100}
	shard, ms, req := setupMutableStateWithStartedActivity(t, persistedClock)

	// The stored clock is available, so minting a new one would be a failure.
	shard.EXPECT().NewVectorClock().Times(0)

	ctx := context.Background()
	resp, code, err := recordActivityTaskStarted(ctx, shard, ms, req, nil, nil)

	require.NoError(t, err)
	require.Equal(t, rejectCodeAccepted, code)
	require.Equal(t, persistedClock, resp.Clock, "Should return the stored StartedClock")
}
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,6 @@ func (handler *workflowTaskCompletedHandler) handleCommandRequestCancelActivity(
if ai != nil {
// If ai is nil, the activity has already been canceled/completed/timedout. The cancel request
// will be recorded in the history, but no further action will be taken.

if ai.StartedEventId == common.EmptyEventID {
// We haven't started the activity yet, we can cancel the activity right away and
// schedule a workflow task to ensure the workflow makes progress.
Expand All @@ -697,37 +696,48 @@ func (handler *workflowTaskCompletedHandler) handleCommandRequestCancelActivity(
return nil, err
}
handler.activityNotStartedCancelled = true
} else if ai.StartedEventId != common.EmptyEventID && ai.WorkerControlTaskQueue != "" {
// Activity has started and worker supports Nexus control tasks - collect for batched dispatch.
taskToken, err := handler.tokenSerializer.Serialize(tasktoken.NewActivityTaskToken(
handler.mutableState.GetNamespaceEntry().ID().String(),
handler.mutableState.GetWorkflowKey().WorkflowID,
handler.mutableState.GetWorkflowKey().RunID,
ai.ScheduledEventId,
ai.ActivityId,
ai.ActivityType.GetName(),
ai.Attempt,
nil, // Clock not needed for cancel
ai.Version,
ai.StartVersion,
nil,
))
if err != nil {
return nil, err
}
if handler.pendingWorkerCommandsByControlQueue == nil {
handler.pendingWorkerCommandsByControlQueue = make(map[string][]*workerpb.WorkerCommand)
}
handler.pendingWorkerCommandsByControlQueue[ai.WorkerControlTaskQueue] = append(
handler.pendingWorkerCommandsByControlQueue[ai.WorkerControlTaskQueue],
&workerpb.WorkerCommand{
Type: &workerpb.WorkerCommand_CancelActivity{
CancelActivity: &workerpb.CancelActivityCommand{
TaskToken: taskToken,
} else if ai.WorkerControlTaskQueue != "" {
if ai.StartedClock == nil {
// StartedClock may be nil for activities started before this feature was deployed.
// Skip cancel command; the activity will time out normally.
handler.logger.Info("Skipping worker cancel command: activity missing StartedClock (pre-deploy)",
tag.WorkflowNamespaceID(handler.mutableState.GetWorkflowKey().NamespaceID),
tag.WorkflowID(handler.mutableState.GetWorkflowKey().WorkflowID),
tag.WorkflowRunID(handler.mutableState.GetWorkflowKey().RunID),
tag.WorkflowScheduledEventID(ai.ScheduledEventId),
)
} else {
// Activity has started and worker supports Nexus control tasks - collect for batched dispatch.
taskToken, err := handler.tokenSerializer.Serialize(tasktoken.NewActivityTaskToken(
handler.mutableState.GetNamespaceEntry().ID().String(),
handler.mutableState.GetWorkflowKey().WorkflowID,
handler.mutableState.GetWorkflowKey().RunID,
ai.ScheduledEventId,
ai.ActivityId,
ai.ActivityType.GetName(),
ai.Attempt,
ai.StartedClock,
ai.Version,
ai.StartVersion,
nil,
))
if err != nil {
return nil, err
}
if handler.pendingWorkerCommandsByControlQueue == nil {
handler.pendingWorkerCommandsByControlQueue = make(map[string][]*workerpb.WorkerCommand)
}
handler.pendingWorkerCommandsByControlQueue[ai.WorkerControlTaskQueue] = append(
handler.pendingWorkerCommandsByControlQueue[ai.WorkerControlTaskQueue],
&workerpb.WorkerCommand{
Type: &workerpb.WorkerCommand_CancelActivity{
CancelActivity: &workerpb.CancelActivityCommand{
TaskToken: taskToken,
},
},
},
},
)
)
}
}
}
return actCancelReqEvent, nil
Expand Down
Loading
Loading