Skip to content

Commit 8f4ba02

Browse files
committed
Fix ExecutionDelete hang (#anrossi/fix-executiondelete-hang)
In external execution mode, when cleaning up the last socket (binding) and registration and calling ExecutionDelete, it's possible the worker pool will attempt to be deleted by ExecutionDelete before the asynchronous shut down callbacks have had time to run. This will result in a hang because the ref count on the worker pool is non-zero, since there's still cleanup work to do. However, the caller, by calling ExecutionDelete has indicated that they have stopped the execution engine, so that remaining cleanup work won't get a chance to run. PR to merge into main: #5707
1 parent 9b5b9df commit 8f4ba02

File tree

1 file changed

+52
-5
lines changed

1 file changed

+52
-5
lines changed

src/platform/platform_worker.c

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@
1818
typedef struct QUIC_CACHEALIGN CXPLAT_WORKER {
1919

2020
//
21-
// Thread used to drive the worker. Only set when the worker is created and
22-
// managed internally (default case).
21+
// Thread used to drive the worker, when the worker is created and
22+
// managed internally (default case). In the external execution case,
23+
// this thread is the cleanup thread.
2324
//
2425
CXPLAT_THREAD Thread;
2526

@@ -68,6 +69,12 @@ typedef struct QUIC_CACHEALIGN CXPLAT_WORKER {
6869
//
6970
CXPLAT_SLIST_ENTRY* ExecutionContexts;
7071

72+
//
73+
// The event for the cleanup worker to wait on to start cleanup.
74+
// NULL in the internally-managed execution case.
75+
//
76+
CXPLAT_EVENT* CleanupEvent;
77+
7178
#if DEBUG // Debug statistics
7279
uint64_t LoopCount;
7380
uint64_t EcPollCount;
@@ -107,7 +114,9 @@ typedef struct QUIC_CACHEALIGN CXPLAT_WORKER {
107114
typedef struct CXPLAT_WORKER_POOL {
108115

109116
CXPLAT_RUNDOWN_REF Rundown;
117+
CXPLAT_EVENT CleanupEvent; // Only used with external workers.
110118
uint32_t WorkerCount;
119+
BOOLEAN External;
111120

112121
#if DEBUG
113122
//
@@ -163,9 +172,10 @@ UpdatePollCompletion(
163172
BOOLEAN
164173
CxPlatWorkerPoolInitWorker(
165174
_Inout_ CXPLAT_WORKER* Worker,
175+
_In_ CXPLAT_WORKER_POOL* WorkerPool,
166176
_In_ uint16_t IdealProcessor,
167177
_In_opt_ CXPLAT_EVENTQ* EventQ, // Only for external workers
168-
_In_opt_ CXPLAT_THREAD_CONFIG* ThreadConfig // Only for internal workers
178+
_In_ CXPLAT_THREAD_CONFIG* ThreadConfig
169179
)
170180
{
171181
CxPlatLockInitialize(&Worker->ECLock);
@@ -174,6 +184,7 @@ CxPlatWorkerPoolInitWorker(
174184
Worker->IdealProcessor = IdealProcessor;
175185
Worker->State.WaitTime = UINT32_MAX;
176186
Worker->State.ThreadID = UINT32_MAX;
187+
Worker->CleanupEvent = &WorkerPool->CleanupEvent;
177188

178189
if (EventQ != NULL) {
179190
Worker->EventQ = *EventQ;
@@ -302,6 +313,7 @@ CxPlatWorkerPoolCreate(
302313
}
303314
CxPlatZeroMemory(WorkerPool, WorkerPoolSize);
304315
WorkerPool->WorkerCount = ProcessorCount;
316+
WorkerPool->External = FALSE;
305317

306318
//
307319
// Build up the configuration for creating the worker threads.
@@ -338,7 +350,7 @@ CxPlatWorkerPoolCreate(
338350

339351
CXPLAT_WORKER* Worker = &WorkerPool->Workers[i];
340352
if (!CxPlatWorkerPoolInitWorker(
341-
Worker, IdealProcessor, NULL, &ThreadConfig)) {
353+
Worker, WorkerPool, IdealProcessor, NULL, &ThreadConfig)) {
342354
goto Error;
343355
}
344356
}
@@ -368,6 +380,15 @@ CxPlatWorkerPoolCreate(
368380
return NULL;
369381
}
370382

383+
CXPLAT_THREAD_CALLBACK(CxPlatExecutionCleanupThread, Context)
384+
{
385+
CXPLAT_WORKER* Worker = (CXPLAT_WORKER*)Context;
386+
387+
CxPlatEventWaitForever(*Worker->CleanupEvent);
388+
389+
return CxPlatWorkerThread(Context);
390+
}
391+
371392
_Success_(return != NULL)
372393
CXPLAT_WORKER_POOL*
373394
CxPlatWorkerPoolCreateExternal(
@@ -395,6 +416,17 @@ CxPlatWorkerPoolCreateExternal(
395416
}
396417
CxPlatZeroMemory(WorkerPool, WorkerPoolSize);
397418
WorkerPool->WorkerCount = Count;
419+
WorkerPool->External = TRUE;
420+
421+
CxPlatEventInitialize(&WorkerPool->CleanupEvent, TRUE, FALSE);
422+
423+
CXPLAT_THREAD_CONFIG ThreadConfig = {
424+
CXPLAT_THREAD_FLAG_SET_IDEAL_PROC,
425+
0,
426+
"cxplat_exec_cleanup",
427+
CxPlatExecutionCleanupThread,
428+
NULL
429+
};
398430

399431
//
400432
// Set up each worker thread with the configuration initialized above. Also
@@ -407,7 +439,7 @@ CxPlatWorkerPoolCreateExternal(
407439

408440
CXPLAT_WORKER* Worker = &WorkerPool->Workers[i];
409441
if (!CxPlatWorkerPoolInitWorker(
410-
Worker, IdealProcessor, Configs[i].EventQ, NULL)) {
442+
Worker, WorkerPool, IdealProcessor, Configs[i].EventQ, &ThreadConfig)) {
411443
goto Error;
412444
}
413445
Executions[i] = (QUIC_EXECUTION*)Worker;
@@ -432,6 +464,7 @@ CxPlatWorkerPoolCreateExternal(
432464
CxPlatWorkerPoolDestroyWorker(Worker);
433465
}
434466

467+
CxPlatEventUninitialize(WorkerPool->CleanupEvent);
435468
CXPLAT_FREE(WorkerPool, QUIC_POOL_PLATFORM_WORKER);
436469

437470
return NULL;
@@ -449,6 +482,17 @@ CxPlatWorkerPoolDelete(
449482
#else
450483
UNREFERENCED_PARAMETER(RefType);
451484
#endif
485+
486+
if (WorkerPool->External) {
487+
//
488+
// In the case of external execution, it's possible for ExecutionDelete
489+
// to run before all the queues have been drained of internal cleanup work.
490+
// By calling ExecutionDelete, the app has indicated it is done with MsQuic,
491+
// so take ownership of the workers and allow them to run on cleanup threads
492+
// until there's nothing left to do.
493+
//
494+
CxPlatEventSet(WorkerPool->CleanupEvent);
495+
}
452496
CxPlatRundownReleaseAndWait(&WorkerPool->Rundown);
453497

454498
#if DEBUG
@@ -463,6 +507,9 @@ CxPlatWorkerPoolDelete(
463507
}
464508

465509
CxPlatRundownUninitialize(&WorkerPool->Rundown);
510+
if (WorkerPool->External) {
511+
CxPlatEventUninitialize(WorkerPool->CleanupEvent);
512+
}
466513
CXPLAT_FREE(WorkerPool, QUIC_POOL_PLATFORM_WORKER);
467514
}
468515
}

0 commit comments

Comments
 (0)