diff --git a/modules/nextflow/src/main/groovy/nextflow/processor/TaskProcessor.groovy b/modules/nextflow/src/main/groovy/nextflow/processor/TaskProcessor.groovy index f0ff2152f0..2a8ccdc473 100644 --- a/modules/nextflow/src/main/groovy/nextflow/processor/TaskProcessor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/processor/TaskProcessor.groovy @@ -797,7 +797,7 @@ class TaskProcessor { @CompileStatic final protected void checkCachedOrLaunchTask( TaskRun task, HashCode hash, boolean shouldTryCache ) { - int tries = task.failCount +1 + int tries = task.failCount + task.abortedCount + 1 while( true ) { hash = HashBuilder.defaultHasher().putBytes(hash.asBytes()).putInt(tries).hash() @@ -813,6 +813,11 @@ class TaskProcessor { final cached = shouldTryCache && exists && entry.trace.isCompleted() && checkCachedOutput(task.clone(), resumeDir, hash, entry) if( cached ) break + + // Issue #6884: https://github.com/nextflow-io/nextflow/issues/6884 + // When not cached but there is an entry whose status is ABORT or FAILED, the task counters for these status must be incremented to be sure future retries takes it into account to calculate the hash. + if( entry?.trace?.isFailed() ) task.failCount += 1 + if( entry?.trace?.isAborted() ) task.abortedCount+=1 } catch (Throwable t) { log.warn1("[${safeTaskName(task)}] Unable to resume cached task -- See log file for details", causedBy: t) @@ -838,7 +843,12 @@ class TaskProcessor { finally { lock.release() } - + // Issue #6884: https://github.com/nextflow-io/nextflow/issues/6884 + // When cached tasks include failures and aborts and not cached. The task.attempt could not be sync with cached failures. We update the task.attempts and task.scripts + if( task.failCount > 0 && task.config.getAttempt() != task.failCount + 1 ) { + task.config.attempt = task.failCount + 1 + task.resolve(taskBody) + } // submit task for execution submitTask( task, hash, workDir ) break diff --git a/modules/nextflow/src/main/groovy/nextflow/processor/TaskRun.groovy b/modules/nextflow/src/main/groovy/nextflow/processor/TaskRun.groovy index 2213783bfe..c0b96ef65e 100644 --- a/modules/nextflow/src/main/groovy/nextflow/processor/TaskRun.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/processor/TaskRun.groovy @@ -327,6 +327,11 @@ class TaskRun implements Cloneable { */ volatile int failCount + /** + * The number of times the execution of the task has been aborted + */ + volatile int abortedCount + /** * The number of times the submit of the task has been retried */ diff --git a/modules/nextflow/src/main/groovy/nextflow/trace/TraceRecord.groovy b/modules/nextflow/src/main/groovy/nextflow/trace/TraceRecord.groovy index e5a6984643..0f854e472a 100644 --- a/modules/nextflow/src/main/groovy/nextflow/trace/TraceRecord.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/trace/TraceRecord.groovy @@ -599,6 +599,14 @@ class TraceRecord implements Serializable { store.status == 'COMPLETED' } + boolean isAborted() { + store.status == 'ABORTED' + } + + boolean isFailed() { + store.status == 'FAILED' + } + String getExecutorName() { return executorName } diff --git a/tests/checks/resume-retried-with-abort.nf/.checks b/tests/checks/resume-retried-with-abort.nf/.checks new file mode 100644 index 0000000000..ca50a96738 --- /dev/null +++ b/tests/checks/resume-retried-with-abort.nf/.checks @@ -0,0 +1,34 @@ +set -e + +# +# run with abort +# +echo '' +timeout --signal=INT 6 $NXF_RUN | tee stdout || true + +[[ `< .nextflow.log grep -c 'Submitted process > SMALL_SLEEP_RETRY'` == 1 ]] || false + + +# +# RESUME mode with fail and abort +# +echo '' +timeout --signal=INT 12 $NXF_RUN -resume | tee stdout || true + +[[ `< .nextflow.log grep -c 'Submitted process > SMALL_SLEEP_RETRY'` == 1 ]] || false +[[ `< .nextflow.log grep -c 'Re-submitted process > SMALL_SLEEP_RETRY'` == 1 ]] || false +# +# RESUME mode with completed and abort +# +echo '' +timeout --signal=INT 12 $NXF_RUN -resume | tee stdout || true + +[[ `< .nextflow.log grep -c 'Submitted process > SMALL_SLEEP_RETRY'` == 1 ]] || false + +# +# RESUME mode with cached and finish +# +echo '' +$NXF_RUN -resume | tee stdout + +[[ `< .nextflow.log grep -c 'Cached process > SMALL_SLEEP_RETRY'` == 1 ]] || false diff --git a/tests/resume-retried-with-abort.nf b/tests/resume-retried-with-abort.nf new file mode 100644 index 0000000000..c4328a7215 --- /dev/null +++ b/tests/resume-retried-with-abort.nf @@ -0,0 +1,65 @@ +#!/usr/bin/env nextflow +/* + * Copyright 2013-2026, Seqera Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +process LONG_SLEEP { + tag "long-sleep" + + input: + val x + + output: + stdout + + script: + ''' + echo "LONG_SLEEP start" + sleep 30 + echo "LONG_SLEEP done" + ''' +} + +process SMALL_SLEEP_RETRY { + tag "small-sleep-retry" + + errorStrategy 'retry' + maxRetries 1 + + input: + val x + + output: + stdout + + script: + """ + echo "SMALL_SLEEP_RETRY attempt: ${task.attempt}" + sleep 7 + + if [[ ${task.attempt} -eq 1 ]]; then + echo "Failing first attempt on purpose" + exit 1 + fi + + echo "Second attempt succeeded" + """ +} + +workflow { + ch = Channel.of(1) + LONG_SLEEP(ch) + SMALL_SLEEP_RETRY(ch) +}