From 125ed0c3ff0646946073f337eb0ff377626c2987 Mon Sep 17 00:00:00 2001 From: ChrisPaulBennett Date: Tue, 7 Jan 2025 11:29:55 +0000 Subject: [PATCH 01/47] Initial profiler implementation (non working) Changed the name of the profiler module. Linting Profiler sends KB instead of bytes Time Series now working CPU/Memory Logging working --- cylc/flow/cfgspec/globalcfg.py | 9 + cylc/flow/etc/job.sh | 36 +++ cylc/flow/job_file.py | 6 +- cylc/flow/scripts/profile.py | 246 ++++++++++++++++++ cylc/flow/scripts/profiler.py | 194 ++++++++++++++ setup.cfg | 2 + tests/functional/jobscript/02-profiler.t | 46 ++++ .../jobscript/02-profiler/bin/foo.sh | 2 + .../jobscript/02-profiler/flow.cylc | 77 ++++++ .../jobscript/02-profiler/reference.log | 3 + tests/unit/test_job_file.py | 8 +- 11 files changed, 626 insertions(+), 3 deletions(-) create mode 100755 cylc/flow/scripts/profile.py create mode 100755 cylc/flow/scripts/profiler.py create mode 100644 tests/functional/jobscript/02-profiler.t create mode 100755 tests/functional/jobscript/02-profiler/bin/foo.sh create mode 100644 tests/functional/jobscript/02-profiler/flow.cylc create mode 100644 tests/functional/jobscript/02-profiler/reference.log diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index cff1eddc7e5..064e72da1b5 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1477,6 +1477,15 @@ def default_for( .. versionadded:: 8.0.0 ''') + + with Conf('profile'): + Conf('activate', VDR.V_BOOLEAN, True, desc=''' + A Boolean that sets if the cylc profiler will be used + + .. versionadded:: 8.0.0 + ''') + Conf('cgroups path', VDR.V_STRING, '/sys/fs/cgroup') + Conf('job runner', VDR.V_STRING, 'background', desc=f''' The system used to run jobs on the platform. diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index c64edfda298..39861aa1460 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -38,6 +38,8 @@ cylc__job__main() { set -x fi # Init-Script + cylc profile & + export profiler_pid="$!" & cylc__job__run_inst_func 'init_script' # Start error and vacation traps typeset signal_name= @@ -139,6 +141,12 @@ cylc__job__main() { mkdir -p "$(dirname "${CYLC_TASK_WORK_DIR}")" || true mkdir -p "${CYLC_TASK_WORK_DIR}" cd "${CYLC_TASK_WORK_DIR}" + + if [[ "${CYLC_PROFILE}" == "True" ]] ; then + cylc profile & + export profiler_pid="$!" + fi + # Env-Script, User Environment, Pre-Script, Script and Post-Script # Run user scripts in subshell to protect cylc job script from interference. # Waiting on background process allows signal traps to trigger immediately. @@ -157,12 +165,25 @@ cylc__job__main() { cylc__set_return "$ret_code" fi } + # Grab the max rss and cpu_time value before moving directory + if [[ -f "max_rss" ]]; then + max_rss=$(sed -n '1p' max_rss) + rm max_rss + fi + if [[ -f "cpu_time" ]]; then + cpu_time=$(sed -n '1p' cpu_time) + rm cpu_time + fi # Empty work directory remove cd rmdir "${CYLC_TASK_WORK_DIR}" 2>'/dev/null' || true # Send task succeeded message + cylc__kill_profiler + wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" 'succeeded' || true + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" 'max_rss:' "${MAXRSS}" || true # (Ignore shellcheck "globbing and word splitting" warning here). # shellcheck disable=SC2086 trap '' ${CYLC_VACATION_SIGNALS:-} ${CYLC_FAIL_SIGNALS} @@ -187,6 +208,20 @@ cylc__set_return() { return "${1:-0}" } +############################################################################### +# Save the data using cylc message and exit the profiler +cylc__kill_profiler() { + if [[ -n ${profiler_pid:-} ]]; then + kill -s SIGINT "${profiler_pid}" + fi + if [ -n "${max_rss}" ]; then + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: max_rss $max_rss" || true + fi + if [ -n "${cpu_time}" ]; then + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time" || true + fi +} + ############################################################################### # Disable selected or all (if no arguments given) fail traps. # Globals: @@ -261,6 +296,7 @@ cylc__job__run_inst_func() { # Returns: # exit ${CYLC_TASK_USER_SCRIPT_EXITCODE} cylc__job_finish_err() { + cylc__kill_profiler CYLC_TASK_USER_SCRIPT_EXITCODE="${CYLC_TASK_USER_SCRIPT_EXITCODE:-$?}" typeset signal="$1" typeset run_err_script="$2" diff --git a/cylc/flow/job_file.py b/cylc/flow/job_file.py index 930331dc5a4..489b3f99c47 100644 --- a/cylc/flow/job_file.py +++ b/cylc/flow/job_file.py @@ -224,8 +224,10 @@ def _write_task_environment(self, handle, job_conf): '\n export CYLC_TASK_TRY_NUMBER=%s' % job_conf['try_num']) handle.write( "\n export CYLC_TASK_FLOW_NUMBERS=" - f"{','.join(str(f) for f in job_conf['flow_nums'])}" - ) + f"{','.join(str(f) for f in job_conf['flow_nums'])}") + handle.write( + "\n export CYLC_PROFILE=" + f"{job_conf['platform']['profile']['activate']}") # Standard parameter environment variables for var, val in job_conf['param_var'].items(): handle.write('\n export CYLC_TASK_PARAM_%s="%s"' % (var, val)) diff --git a/cylc/flow/scripts/profile.py b/cylc/flow/scripts/profile.py new file mode 100755 index 00000000000..391dbbe9ca9 --- /dev/null +++ b/cylc/flow/scripts/profile.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +"""cylc profiler [OPTIONS] + +Profiler which periodically polls PBS cgroups to track +the resource usage of jobs running on the node. +""" + +import os +import sys +import time +import signal +from pathlib import Path +from dataclasses import dataclass +from argparse import ArgumentParser +from cylc.flow.terminal import cli_function +from cylc.flow.option_parsers import CylcOptionParser as COP + +INTERNAL = True + +def get_option_parser() -> COP: + parser = COP( + __doc__, + argdoc=[ + ], + ) + parser.add_option( + "-i", type=int, help="interval between query cycles in seconds", + default=10, dest="delay") + parser.add_option( + "-o", type=str, help="output directory for json file", + default=os.environ['DATADIR'], dest="output_dir") + parser.add_option( + "-m", type=str, help="Location of memory process files", + default="/sys/fs/cgroup/memory/pbspro.service/jobid", dest="memory") + + return parser + + +@cli_function(get_option_parser) +def main(parser, options): + """CLI main.""" + # Register the stop_profiler function with the signal library + signal.signal(signal.SIGINT, stop_profiler) + + profile(options) + + + +@dataclass +class Process: + """Class for representing CPU and Memory usage of a process""" + cgroup_memory_path: str + # cgroup_cpu_path: str + job_id: str + # system_usage: int + # cpu_usage: int + + +def stop_profiler(*args): + """This function will be executed when the SIGINT signal is sent + to this process""" + print('profiler exited') + sys.exit(0) + + +def parse_memory_file(process): + """Open the memory stat file and copy the appropriate data""" + path = os.path.join(process.cgroup_memory_path + "/" + + process.job_id, "memory.stat") + + for line in open(path): + key, value = line.strip().split() + # Grab the data we want + if key == 'rss': + return value + + +def write_data(process, data, output_dir, data_type): + + # Build the output file path + path = os.path.join(output_dir, process.job_id + data_type) + try: + with open("/home/h01/cbennett/repos/cylc_ui_gantt/cylc-flow/max_rss", 'w') as f: + f.write(data) + except IOError: + raise IOError("Unable to write memory data to file") + + +# def get_host_num_cpus(cpuset_path, processes): +# """Number of physical CPUs available to the process""" +# cpuset = open(cpuset_path + '/' + +# processes[0].job_id + '/cpuset.cpus').read() +# print("raw_data:", cpuset) +# cpu_number = cpuset.split('-') +# print('split:', cpu_number) +# number_of_cpus = ((int(cpu_number[1]) - int(cpu_number[0])) + 1) // 2 + # split_proc_stat_lines = [cpuset.split(') for line in proc_stat_lines] + # cpu_lines = [ + # split_line + # for split_line in split_proc_stat_lines + # if len(split_line) > 0 and "cpu" in split_line[0] + # ] + # # Number of lines starting with a word including 'cpu', subtracting + # # 1 for the first summary line. + # host_num_cpus = len(cpu_lines) - 1 + # print(number_of_cpus) + # return number_of_cpus + + +# def get_system_usage(): +# """ +# Computes total CPU usage of the host in nanoseconds. +# See also the / proc / stat entry here: +# https://man7.org/linux/man-pages/man5/proc.5.html +# """ +# # Grab usage data from proc/stat +# usage_data = open('/proc/stat').read().split("\n")[0].split()[1:8] +# +# total_clock_ticks = sum(int(entry) for entry in usage_data) +# # 100 clock ticks per second, 10^9 ns per second +# usage_ns = total_clock_ticks * 10 ** 7 +# return usage_ns +# +# +# def get_cpu_percent(num_of_cpus, proc_path, process, +# last_system_usage, last_cpu_usage): +# +# time.sleep(5) +# # Find cpuacct.usage files +# cpu_usage = int(open(process.cgroup_cpu_path + "/" + +# process.job_id + "/cpuacct.usage").read()) +# system_usage = get_system_usage() +# +# # Since deltas are not initially available, return 0.0 on first call. +# if last_system_usage is None: +# cpu_percent = 0.0 +# else: +# cpu_delta = cpu_usage - last_cpu_usage +# # "System time passed." (Typically close to clock time.) +# system_delta = (system_usage - last_system_usage) / num_of_cpus +# +# quotient = cpu_delta / system_delta +# cpu_percent = round(quotient * 100 / 8, 1) +# process.system_usage = system_usage +# process.cpu_usage = cpu_usage +# # Computed percentage might be slightly above 100%. +# return process, min(cpu_percent, 100.0) + + +def profile(args): + + # print("cylc_profile:", os.environ['CYLC_PROFILE']) + max_rss = 0 + processes = [] + # last_system_usage = None + # last_cpu_usage = None + + # Find the correct memory_stat file for the process + if not Path.exists(Path(args.memory)): + FileNotFoundError("cgroups not found") + + try: + # Find memory.stat files + for job_id in os.listdir(args.memory): + if "ex" in job_id: + print("found process:", job_id) + processes.append(Process( + cgroup_memory_path=args.memory, + # cgroup_cpu_path=args.cpu, + # system_usage=0, + # cpu_usage=0, + job_id=job_id)) + except FileNotFoundError as e: + print(e) + exit("Is this being ran on Azure HPC?") + + # cpu_count = get_host_num_cpus(args.cpuset_path, processes) + for i in range(30): + # Write memory usage data + for process in processes: + # Only save Max RSS to disk if it is above the previous value + try: + rss = int(parse_memory_file(process)) + if rss > max_rss: + max_rss = rss + write_data(process, rss, + args.output_dir, ".memory") + + except (OSError, IOError) as error: + print(error) + + # process, usage_percent = get_cpu_percent( + # cpu_count, args.proc_path, + # process, last_system_usage, last_cpu_usage) + # + # write_data(process, usage_percent, + # args.output_dir, ".cpu") + + time.sleep(args.delay) + + +def parse_arguments(): + + p = ArgumentParser( + usage="%(prog)s [options]", + description="Profiler which periodically polls PBS cgroups to track " + "the resource usage of jobs running on the node.") + + p.add_argument("-i", dest="delay", type=int, metavar="S", + default=10, help="interval between query cycles in seconds") + p.add_argument("-o", dest="output_dir", type=str, + default=os.environ['DATADIR'], + help="output directory for json file") + p.add_argument("-m", dest="memory", type=str, + default="/sys/fs/cgroup/memory/pbspro.service/jobid", + # default="/sys/fs/cgroup", + help="Location of memory process files") + # p.add_argument("-c", dest="cpu", type=str, + # default="/sys/fs/cgroup/cpu,cpuacct/pbspro.service/jobid", + # # default="/sys/fs/cgroup", + # help="Location of cpu cgroup files") + # p.add_argument("-u", dest="cpuset_path", type=str, + # default="/sys/fs/cgroup/cpuset/pbspro.service/jobid", + # help="Location of processor details") + # p.add_argument("-p", dest="proc_path", type=str, + # default="/sys/fs/cgroup/cpuset/pbspro.service/jobid", + # help="Location of processor details") + + args = p.parse_args() + + return args diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py new file mode 100755 index 00000000000..392a66569c6 --- /dev/null +++ b/cylc/flow/scripts/profiler.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +"""cylc profiler [OPTIONS] + +Profiler which periodically polls PBS cgroups to track +the resource usage of jobs running on the node. +""" + +import os +import re +import sys +import time +import signal +import subprocess +from pathlib import Path +from dataclasses import dataclass +from cylc.flow.terminal import cli_function +from cylc.flow.option_parsers import CylcOptionParser as COP + +INTERNAL = True +PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") +RE_INT = re.compile(r'\d+') + + +def get_option_parser() -> COP: + parser = COP( + __doc__, + argdoc=[ + ], + ) + parser.add_option( + "-i", type=int, help="interval between query cycles in seconds", + default=10, dest="delay") + parser.add_option( + "-m", type=str, help="Location of cgroups directory", + default="/sys/fs/cgroup", + dest="cgroup_location") + + return parser + + +@cli_function(get_option_parser) +def main(parser, options): + """CLI main.""" + # Register the stop_profiler function with the signal library + signal.signal(signal.SIGINT, stop_profiler) + signal.signal(signal.SIGHUP, stop_profiler) + signal.signal(signal.SIGTERM, stop_profiler) + + profile(options) + + +@dataclass +class Process: + """Class for representing CPU and Memory usage of a process""" + cgroup_memory_path: str + cgroup_cpu_path: str + + +def stop_profiler(*args): + """This function will be executed when the SIGINT signal is sent + to this process""" + print('profiler exited') + sys.exit(0) + + +def parse_memory_file(process): + """Open the memory stat file and copy the appropriate data""" + + with open(process.cgroup_memory_path, 'r') as f: + for line in f: + return int(line) // 1024 + + +def parse_cpu_file(process, cgroup_version): + """Open the memory stat file and return the appropriate data""" + + if cgroup_version == 1: + with open(process.cgroup_cpu_path, 'r') as f: + for line in f: + if "usage_usec" in line: + return int(RE_INT.findall(line)[0]) // 1000 + elif cgroup_version == 2: + with open(process.cgroup_cpu_path, 'r') as f: + for line in f: + # Cgroups v2 uses nanoseconds + return int(line) / 1000000 + else: + raise FileNotFoundError("cpu usage files not found") + + +def write_data(data, filename): + try: + with open(filename, 'w') as f: + f.write(data + "\n") + except IOError as err: + raise IOError("Unable to write data to file:" + filename) from err + + +def get_cgroup_dir(): + """Get the cgroup directory for the current process""" + # Get the PID of the current process + pid = os.getpid() + # Get the cgroup information for the current process + result = subprocess.run(['cat', '/proc/' + str(pid) + '/cgroup'], + capture_output=True, text=True, shell=False) + if result.stderr: + print(result.stderr, file=sys.stderr) + result = PID_REGEX.search(result.stdout).group() + return result + + +def profile(args): + # Find the cgroup that this process is running in. + # Cylc will put this profiler in the same cgroup + # as the job it is profiling + cgroup_name = get_cgroup_dir() + + # HPC uses cgroups v2 and SPICE uses cgroups v1 + cgroup_version = None + + if Path.exists(Path(args.cgroup_location + cgroup_name)): + cgroup_version = 1 + elif Path.exists(Path(args.cgroup_location + "/memory" + cgroup_name)): + cgroup_version = 2 + else: + raise FileNotFoundError("cgroups not found:" + cgroup_name) + + peak_memory = 0 + processes = [] + + if cgroup_version == 1: + try: + processes.append(Process( + cgroup_memory_path=args.cgroup_location + + cgroup_name + "/" + "memory.peak", + cgroup_cpu_path=args.cgroup_location + + cgroup_name + "/" + "cpu.stat")) + except FileNotFoundError as err: + print(err) + raise FileNotFoundError("cgroups not found:" + + args.cgroup_location) from err + + elif cgroup_version == 2: + try: + processes.append(Process( + cgroup_memory_path=args.cgroup_location + "/memory" + + cgroup_name + "/memory.max_usage_in_bytes", + cgroup_cpu_path=args.cgroup_location + "/cpu" + + cgroup_name + "/cpuacct.usage")) + except FileNotFoundError as err: + print(err) + raise FileNotFoundError("cgroups not found:" + + args.cgroup_location) from err + + while True: + failures = 0 + # Write memory usage data + for process in processes: + # Only save Max RSS to disk if it is above the previous value + try: + memory = parse_memory_file(process) + if memory > peak_memory: + peak_memory = memory + write_data(str(peak_memory), "max_rss") + cpu_time = parse_cpu_file(process, cgroup_version) + write_data(str(cpu_time), "cpu_time") + + except (OSError, ValueError) as error: + failures += 1 + if failures > 5: + raise OSError("cgroup polling failure", error) from error + + time.sleep(args.delay) + + +if __name__ == "__main__": + + arg_parser = get_option_parser() + profile(arg_parser.parse_args()) diff --git a/setup.cfg b/setup.cfg index 5953e14d2b8..016a43db819 100644 --- a/setup.cfg +++ b/setup.cfg @@ -163,11 +163,13 @@ cylc.command = kill = cylc.flow.scripts.kill:main lint = cylc.flow.scripts.lint:main list = cylc.flow.scripts.list:main + profiler = cylc.flow.scripts.profile:main message = cylc.flow.scripts.message:main pause = cylc.flow.scripts.pause:main ping = cylc.flow.scripts.ping:main play = cylc.flow.scripts.play:main poll = cylc.flow.scripts.poll:main + profile = cylc.flow.scripts.profiler:main psutils = cylc.flow.scripts.psutil:main reinstall = cylc.flow.scripts.reinstall:main release = cylc.flow.scripts.release:main diff --git a/tests/functional/jobscript/02-profiler.t b/tests/functional/jobscript/02-profiler.t new file mode 100644 index 00000000000..50e016c5ea0 --- /dev/null +++ b/tests/functional/jobscript/02-profiler.t @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +#------------------------------------------------------------------------------- +# cylc profile test +. "$(dirname "$0")/test_header" +#------------------------------------------------------------------------------- +set_test_number 3 +#------------------------------------------------------------------------------- +install_workflow "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" +#------------------------------------------------------------------------------- +TEST_NAME="${TEST_NAME_BASE}-validate" +run_ok "${TEST_NAME}" cylc validate "${WORKFLOW_NAME}" + +if [[ -n "${PYTHONPATH:-}" ]]; then + export PYTHONPATH="${PWD}/lib:${PYTHONPATH}" +else + export PYTHONPATH="${PWD}/lib" +fi + +export PATH_TO_CYLC_BIN="/path/to/cylc/bin" +create_test_global_config ' +[platforms] + [[profile]] + activate = true +' +#------------------------------------------------------------------------------- +TEST_NAME="${TEST_NAME_BASE}-run" +workflow_run_ok "${TEST_NAME}" cylc play --reference-test --debug --no-detach "${WORKFLOW_NAME}" + +grep_ok 'MAXRSS' "${WORKFLOW_RUN_DIR}/log/scheduler/log" + +purge diff --git a/tests/functional/jobscript/02-profiler/bin/foo.sh b/tests/functional/jobscript/02-profiler/bin/foo.sh new file mode 100755 index 00000000000..4b20577c0d0 --- /dev/null +++ b/tests/functional/jobscript/02-profiler/bin/foo.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +echo "Hello from $0" diff --git a/tests/functional/jobscript/02-profiler/flow.cylc b/tests/functional/jobscript/02-profiler/flow.cylc new file mode 100644 index 00000000000..951a5b94d20 --- /dev/null +++ b/tests/functional/jobscript/02-profiler/flow.cylc @@ -0,0 +1,77 @@ +[meta] + title = "job script torture test" + + description = """Any task job script may fail regardless of user runtime +settings if changes to cylc re-order the job script sections badly: e.g. +"cylc task started" must be called after the CYLC_ environment variables +are exported. Additionally, users may rely on the order of variable +definition in each environment and script section: e.g. workflow +bin directory must go in PATH before the task runtime environment is +defined because workflow bin commands could be used in variable assignment +expressions.""" + +[scheduling] + [[graph]] + R1 = "foo" +[runtime] + [[foo]] + platform = localhost + init-script = """ +echo "HELLO FROM INIT-SCRIPT" +# define a variable +export VAR_IS=is""" + pre-script = """ +echo "HELLO FROM PRE-SCRIPT" +# init-script must be done: +echo VAR_IS is $VAR_IS +# user environment must be done: +echo E_ONE is $E_ONE +echo E_TWO is $E_TWO +echo E_THR is $E_THR +echo E_FOU is $E_FOU +echo E_FIV is $E_FIV +# define a variable +export VAR_PreCS=precs""" + script = """ +echo "HELLO FROM SCRIPT" +# init-script must be done: +echo VAR_IS is $VAR_IS +# pre-script must be done: +echo VAR_PreCS is $VAR_PreCS +# environment must be done: +echo E_ONE is $E_ONE +echo E_TWO is $E_TWO +echo E_THR is $E_THR +echo E_FOU is $E_FOU +echo E_FIV is $E_FIV +# define a variable +export VAR_CS=var_cs""" + post-script = """ +echo "HELLO FROM POST-SCRIPT" +# init-script must be done: +echo VAR_IS is $VAR_IS +# pre-script must be done: +echo VAR_PreCS is $VAR_PreCS +# script must be done: +echo VAR_CS is $VAR_CS +# environment must be done: +echo E_ONE is $E_ONE +echo E_TWO is $E_TWO +echo E_THR is $E_THR +echo E_FOU is $E_FOU +echo E_FIV is $E_FIV +echo VAR_IS is $VAR_IS +echo VAR_PreCS is $VAR_PreCS +echo VAR_CS is $VAR_CS +# define a variable +export VAR_PostCS=postcs""" + [[[environment]]] + # path to cylc must be available: + E_ONE = $(( RANDOM % 10 )) + # init-script must be done: + E_TWO = $VAR_IS + # cylc-defined variables must be done: + E_THR = $CYLC_WORKFLOW_SHARE_DIR + E_FOU = $CYLC_TASK_NAME + # the workflow bin must be in $PATH already: + E_FIV = $( foo.sh ) diff --git a/tests/functional/jobscript/02-profiler/reference.log b/tests/functional/jobscript/02-profiler/reference.log new file mode 100644 index 00000000000..08fe5d5558a --- /dev/null +++ b/tests/functional/jobscript/02-profiler/reference.log @@ -0,0 +1,3 @@ +Initial point: 1 +Final point: 1 +1/foo -triggered off [] diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 7e6b7982519..32bdcea2861 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -399,11 +399,16 @@ def test_write_task_environment(): 'CYLC_TASK_NAMESPACE_HIERARCHY="baa moo"\n export ' 'CYLC_TASK_TRY_NUMBER=1\n export ' 'CYLC_TASK_FLOW_NUMBERS=1\n export ' + 'CYLC_PROFILE=true\n export ' 'CYLC_TASK_PARAM_duck="quack"\n export ' 'CYLC_TASK_PARAM_mouse="squeak"\n ' 'CYLC_TASK_WORK_DIR_BASE=\'farm_noises/work_d\'\n}') job_conf = { - "platform": {'communication method': 'ssh'}, + "platform": {'communication method': 'ssh', + 'profile': { + "activate": "true", + } + }, "job_d": "1/moo/01", "namespace_hierarchy": ["baa", "moo"], "dependencies": ['moo', 'neigh', 'quack'], @@ -534,3 +539,4 @@ def test_homeless_platform(fixture_get_platform): job_sh_txt = job_sh.read() if 'HOME' in job_sh_txt: raise Exception('$HOME found in job.sh\n{job_sh_txt}') + From 60ffd73989204d76a336b954c18a74ec442307bf Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 24 Feb 2025 10:43:11 +0000 Subject: [PATCH 02/47] CPU/Memory Logging working Initial profiler implementation (non working) Changed the name of the profiler module. Linting Profiler sends KB instead of bytes Time Series now working CPU/Memory Logging working Adding profiler unit tests updating tests Fail gracefully if cgroups cannot be found Revert "Fail gracefully if cgroups cannot be found" This reverts commit 92e1e11c9b392b4742501d399f191f590814e95e. Linting Modifying unit tests Linting Changed the name of the profiler module. Profiler sends KB instead of bytes Time Series now working --- .github/workflows/1_create_release_pr.yml | 13 - .github/workflows/2_auto_publish_release.yml | 11 - .github/workflows/bash.yml | 8 - .github/workflows/build.yml | 34 +- .github/workflows/test_conda-build.yml | 7 - .github/workflows/test_fast.yml | 7 +- .github/workflows/test_functional.yml | 4 - .github/workflows/test_tutorial_workflow.yml | 8 - cylc/flow/cfgspec/globalcfg.py | 46 +- cylc/flow/etc/job.sh | 3 - cylc/flow/host_select.py | 11 +- cylc/flow/scripts/profile.py | 183 +++---- setup.cfg | 1 - .../43-auto-restart-force-override-normal.t | 5 +- tests/unit/scripts/test_profiler.py | 160 +++++++ tests/unit/test_subprocpool.py | 449 +++++++++--------- 16 files changed, 459 insertions(+), 491 deletions(-) create mode 100644 tests/unit/scripts/test_profiler.py diff --git a/.github/workflows/1_create_release_pr.yml b/.github/workflows/1_create_release_pr.yml index 662c1d8d521..bf1d7bd399d 100644 --- a/.github/workflows/1_create_release_pr.yml +++ b/.github/workflows/1_create_release_pr.yml @@ -11,19 +11,6 @@ on: required: false default: 'master' -concurrency: - # Only let this run 1 at a time - group: ${{ github.workflow }} - cancel-in-progress: false - -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: create-release-pr: runs-on: ubuntu-latest diff --git a/.github/workflows/2_auto_publish_release.yml b/.github/workflows/2_auto_publish_release.yml index 72bc8c35ff4..ebbb7c3b751 100644 --- a/.github/workflows/2_auto_publish_release.yml +++ b/.github/workflows/2_auto_publish_release.yml @@ -7,18 +7,7 @@ on: # NOTE: While this is too generic, we use the `if` condition of the job to narrow it down # NOTE: Don't use `branches` as we might create release on any branch -concurrency: - # Only let this run 1 at a time - group: ${{ github.workflow }} - cancel-in-progress: false - -defaults: - run: - shell: bash - env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off # Best not to include the GH token here, only do it for the steps that need it MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }} CHANGELOG_FILE: CHANGES.md diff --git a/.github/workflows/bash.yml b/.github/workflows/bash.yml index 52408e80e01..48ead8fb5dd 100644 --- a/.github/workflows/bash.yml +++ b/.github/workflows/bash.yml @@ -31,14 +31,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: bash-docker: runs-on: ubuntu-latest diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f9151fff5a0..778d63a4553 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,20 +10,6 @@ on: - 'MANIFEST.in' # check packaging - 'pyproject.toml' # check build config - 'setup.cfg' # check deps and project config - - '.gitignore' - - '.github/workflows/build.yml' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -defaults: - run: - shell: bash -leo pipefail {0} - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off jobs: test: @@ -32,20 +18,26 @@ jobs: strategy: fail-fast: false matrix: +<<<<<<< HEAD os: ['ubuntu-latest', 'macos-latest'] python: ['3.12', '3'] +======= + os: ['ubuntu-latest'] + python: ['3.8', '3.9', '3.10', '3.11'] + include: + - os: 'ubuntu-22.04' + python: '3.7' + - os: 'macos-latest' + python: '3.8' +>>>>>>> 9727be9e5 (CPU/Memory Logging working) steps: - name: Checkout uses: actions/checkout@v6 - name: Setup Python - uses: mamba-org/setup-micromamba@v2 + uses: actions/setup-python@v5 with: - cache-environment: true - post-cleanup: 'all' - environment-name: cylc-build - create-args: >- - python=${{ matrix.python }} + python-version: ${{ matrix.python }} - name: Build uses: cylc/release-actions/build-python-package@v1 @@ -53,7 +45,7 @@ jobs: - name: Inspect run: | unzip -l dist/*.whl | tee files - grep -E 'cylc_flow.*.dist-info/.*COPYING' files + grep 'cylc_flow.*.dist-info/COPYING' files grep 'cylc/flow/py.typed' files grep 'cylc/flow/etc' files grep 'cylc/flow/etc/cylc-completion.bash' files diff --git a/.github/workflows/test_conda-build.yml b/.github/workflows/test_conda-build.yml index 9f6f23fa659..09c35a44e0e 100644 --- a/.github/workflows/test_conda-build.yml +++ b/.github/workflows/test_conda-build.yml @@ -15,13 +15,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - jobs: test_conda_install: if: github.repository_owner == 'cylc' || github.event_name != 'schedule' diff --git a/.github/workflows/test_fast.yml b/.github/workflows/test_fast.yml index 460c6e3a867..4201d6a0df5 100644 --- a/.github/workflows/test_fast.yml +++ b/.github/workflows/test_fast.yml @@ -16,9 +16,6 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" -env: - PIP_PROGRESS_BAR: off - jobs: test: runs-on: ${{ matrix.os }} @@ -35,9 +32,11 @@ jobs: - os: 'ubuntu-latest' python-version: '3' time-zone: 'XXX-09:35' + env: TZ: ${{ matrix.time-zone }} PYTEST_ADDOPTS: --cov --cov-append -n 5 --color=yes + steps: - name: Checkout uses: actions/checkout@v6 @@ -112,8 +111,6 @@ jobs: strategy: matrix: python-version: ['3'] - env: - FORCE_COLOR: 2 steps: - name: Checkout uses: actions/checkout@v6 diff --git a/.github/workflows/test_functional.yml b/.github/workflows/test_functional.yml index 37171697651..897cf259497 100644 --- a/.github/workflows/test_functional.yml +++ b/.github/workflows/test_functional.yml @@ -36,10 +36,6 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: test: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_tutorial_workflow.yml b/.github/workflows/test_tutorial_workflow.yml index f29b5f8885b..6973c0e0421 100644 --- a/.github/workflows/test_tutorial_workflow.yml +++ b/.github/workflows/test_tutorial_workflow.yml @@ -17,14 +17,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: test: strategy: diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 064e72da1b5..c0872bebfa9 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -943,52 +943,16 @@ def default_for( range. ''') Conf('condemned', VDR.V_ABSOLUTE_HOST_LIST, desc=f''' - List run hosts that workflows should *not* run on. + These hosts will not be used to run jobs. - These hosts will be subtracted from the - `available ` hosts - preventing new workflows from starting on the "condemned" host. - - Any workflows running on these hosts will either migrate - to another host, or shut down according to - :py:mod:`the configuration `. - - This feature requires ``auto restart`` to be listed - in `global.cylc[scheduler][main loop]plugins`. - - For more information, see the - :py:mod:`auto restart ` - plugin. - - .. rubric:: Example: - - .. code-block:: cylc - - [scheduler] - [[main loop]] - # activate the "auto restart" plugin - plugins = auto restart - [[run hosts]] - # there are three hosts in the "pool" - available = host1, host2, host3 - - # however two have been taken out: - # * workflows running on "host1" will attempt to - # restart on "host3" - # * workflows running on "host2" will shutdown - condemned = host1, host2! + If workflows are already running on + condemned hosts, Cylc will shut them down and + restart them on different hosts. .. seealso:: - :py:mod:`cylc.flow.main_loop.auto_restart` :ref:`auto-stop-restart` - .. versionchanged:: 8.4.2 - - The "force mode" (activated by a "!" suffix) caused issues - at workflow startup for Cylc versions between 8.0.0 and - 8.4.1 inclusive. - .. versionchanged:: 8.0.0 {REPLACES}``[suite servers]condemned hosts``. @@ -1522,7 +1486,7 @@ def default_for( The means by which task progress messages are reported back to the running workflow. - .. rubric:: Options: + ..rubric:: Options: zmq Direct client-server TCP communication via network ports diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 39861aa1460..0d1e3275604 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -38,8 +38,6 @@ cylc__job__main() { set -x fi # Init-Script - cylc profile & - export profiler_pid="$!" & cylc__job__run_inst_func 'init_script' # Start error and vacation traps typeset signal_name= @@ -183,7 +181,6 @@ cylc__job__main() { wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" 'succeeded' || true - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" 'max_rss:' "${MAXRSS}" || true # (Ignore shellcheck "globbing and word splitting" warning here). # shellcheck disable=SC2086 trap '' ${CYLC_VACATION_SIGNALS:-} ${CYLC_FAIL_SIGNALS} diff --git a/cylc/flow/host_select.py b/cylc/flow/host_select.py index 678306611f6..fdf7cf2a8d6 100644 --- a/cylc/flow/host_select.py +++ b/cylc/flow/host_select.py @@ -143,13 +143,6 @@ def select_workflow_host(cached=True): # be returned with the up-to-date configuration. global_config = glbl_cfg(cached=cached) - # condemned hosts may be suffixed with an "!" to activate "force mode" - blacklist = [] - for host in global_config.get(['scheduler', 'run hosts', 'condemned'], []): - if host.endswith('!'): - host = host[:-1] - blacklist.append(host) - return select_host( # list of workflow hosts global_config.get([ @@ -160,7 +153,9 @@ def select_workflow_host(cached=True): 'scheduler', 'run hosts', 'ranking' ]), # list of condemned hosts - blacklist=blacklist, + blacklist=global_config.get( + ['scheduler', 'run hosts', 'condemned'] + ), blacklist_name='condemned host' ) diff --git a/cylc/flow/scripts/profile.py b/cylc/flow/scripts/profile.py index 391dbbe9ca9..2d3f1faa0ce 100755 --- a/cylc/flow/scripts/profile.py +++ b/cylc/flow/scripts/profile.py @@ -21,16 +21,20 @@ """ import os +import re import sys import time import signal +import subprocess from pathlib import Path from dataclasses import dataclass -from argparse import ArgumentParser from cylc.flow.terminal import cli_function from cylc.flow.option_parsers import CylcOptionParser as COP INTERNAL = True +PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") + + def get_option_parser() -> COP: parser = COP( @@ -46,7 +50,8 @@ def get_option_parser() -> COP: default=os.environ['DATADIR'], dest="output_dir") parser.add_option( "-m", type=str, help="Location of memory process files", - default="/sys/fs/cgroup/memory/pbspro.service/jobid", dest="memory") + default="/sys/fs/cgroup", + dest="memory") return parser @@ -60,15 +65,12 @@ def main(parser, options): profile(options) - @dataclass class Process: """Class for representing CPU and Memory usage of a process""" cgroup_memory_path: str - # cgroup_cpu_path: str + cgroup_cpu_path: str job_id: str - # system_usage: int - # cpu_usage: int def stop_profiler(*args): @@ -80,128 +82,81 @@ def stop_profiler(*args): def parse_memory_file(process): """Open the memory stat file and copy the appropriate data""" - path = os.path.join(process.cgroup_memory_path + "/" + - process.job_id, "memory.stat") + memory_stats = {} - for line in open(path): - key, value = line.strip().split() - # Grab the data we want - if key == 'rss': - return value + for line in open(process.cgroup_memory_path): + return int(line) -def write_data(process, data, output_dir, data_type): +def parse_cpu_file(process): + """Open the memory stat file and copy the appropriate data""" + memory_stats = {} + + for line in open(process.cgroup_cpu_path): + if "usage_usec" in line: + return int(re.findall(r'\d+', line)[0]) + + +def write_data(process, data, output_dir, data_type, filename): # Build the output file path path = os.path.join(output_dir, process.job_id + data_type) try: - with open("/home/h01/cbennett/repos/cylc_ui_gantt/cylc-flow/max_rss", 'w') as f: - f.write(data) + with open(filename, 'w') as f: + f.write(data + "\n") except IOError: raise IOError("Unable to write memory data to file") -# def get_host_num_cpus(cpuset_path, processes): -# """Number of physical CPUs available to the process""" -# cpuset = open(cpuset_path + '/' + -# processes[0].job_id + '/cpuset.cpus').read() -# print("raw_data:", cpuset) -# cpu_number = cpuset.split('-') -# print('split:', cpu_number) -# number_of_cpus = ((int(cpu_number[1]) - int(cpu_number[0])) + 1) // 2 - # split_proc_stat_lines = [cpuset.split(') for line in proc_stat_lines] - # cpu_lines = [ - # split_line - # for split_line in split_proc_stat_lines - # if len(split_line) > 0 and "cpu" in split_line[0] - # ] - # # Number of lines starting with a word including 'cpu', subtracting - # # 1 for the first summary line. - # host_num_cpus = len(cpu_lines) - 1 - # print(number_of_cpus) - # return number_of_cpus - - -# def get_system_usage(): -# """ -# Computes total CPU usage of the host in nanoseconds. -# See also the / proc / stat entry here: -# https://man7.org/linux/man-pages/man5/proc.5.html -# """ -# # Grab usage data from proc/stat -# usage_data = open('/proc/stat').read().split("\n")[0].split()[1:8] -# -# total_clock_ticks = sum(int(entry) for entry in usage_data) -# # 100 clock ticks per second, 10^9 ns per second -# usage_ns = total_clock_ticks * 10 ** 7 -# return usage_ns -# -# -# def get_cpu_percent(num_of_cpus, proc_path, process, -# last_system_usage, last_cpu_usage): -# -# time.sleep(5) -# # Find cpuacct.usage files -# cpu_usage = int(open(process.cgroup_cpu_path + "/" + -# process.job_id + "/cpuacct.usage").read()) -# system_usage = get_system_usage() -# -# # Since deltas are not initially available, return 0.0 on first call. -# if last_system_usage is None: -# cpu_percent = 0.0 -# else: -# cpu_delta = cpu_usage - last_cpu_usage -# # "System time passed." (Typically close to clock time.) -# system_delta = (system_usage - last_system_usage) / num_of_cpus -# -# quotient = cpu_delta / system_delta -# cpu_percent = round(quotient * 100 / 8, 1) -# process.system_usage = system_usage -# process.cpu_usage = cpu_usage -# # Computed percentage might be slightly above 100%. -# return process, min(cpu_percent, 100.0) +def get_cgroup_dir(): + """Get the cgroup directory for the current process""" + # Get the PID of the current process + pid = os.getpid() + # Get the cgroup information for the current process + result = subprocess.run(['cat', '/proc/' + str(pid) + '/cgroup'], capture_output=True, text=True) + result = PID_REGEX.search(result.stdout).group() + return result def profile(args): - # print("cylc_profile:", os.environ['CYLC_PROFILE']) - max_rss = 0 + cgroup_name = get_cgroup_dir() + + # AZURE SPICE CGROUP LOCATION + cgroup_location = "/sys/fs/cgroup/" + cgroup_name + peak_memory = 0 processes = [] # last_system_usage = None # last_cpu_usage = None - # Find the correct memory_stat file for the process - if not Path.exists(Path(args.memory)): - FileNotFoundError("cgroups not found") - + if not Path.exists(Path(cgroup_location)): + raise FileNotFoundError("cgroups not found:" + cgroup_location) try: # Find memory.stat files - for job_id in os.listdir(args.memory): - if "ex" in job_id: - print("found process:", job_id) + for job_id in os.listdir(cgroup_location): + if "memory.peak" in job_id: processes.append(Process( - cgroup_memory_path=args.memory, - # cgroup_cpu_path=args.cpu, - # system_usage=0, - # cpu_usage=0, + cgroup_memory_path=cgroup_location + "/" + job_id, + cgroup_cpu_path=cgroup_location + "/" + "cpu.stat", job_id=job_id)) except FileNotFoundError as e: print(e) - exit("Is this being ran on Azure HPC?") + raise FileNotFoundError("cgroups not found:" + cgroup_location) # cpu_count = get_host_num_cpus(args.cpuset_path, processes) - for i in range(30): + while True: # Write memory usage data for process in processes: # Only save Max RSS to disk if it is above the previous value try: - rss = int(parse_memory_file(process)) - if rss > max_rss: - max_rss = rss - write_data(process, rss, - args.output_dir, ".memory") - - except (OSError, IOError) as error: + memory = parse_memory_file(process) + if memory > peak_memory: + peak_memory = memory + write_data(process, str(peak_memory), args.output_dir, ".memory", "max_rss") + cpu_time = parse_cpu_file(process) + write_data(process, str(cpu_time), args.output_dir, ".cpu", "cpu_time") + + except (OSError, IOError, ValueError) as error: print(error) # process, usage_percent = get_cpu_percent( @@ -214,33 +169,7 @@ def profile(args): time.sleep(args.delay) -def parse_arguments(): - - p = ArgumentParser( - usage="%(prog)s [options]", - description="Profiler which periodically polls PBS cgroups to track " - "the resource usage of jobs running on the node.") - - p.add_argument("-i", dest="delay", type=int, metavar="S", - default=10, help="interval between query cycles in seconds") - p.add_argument("-o", dest="output_dir", type=str, - default=os.environ['DATADIR'], - help="output directory for json file") - p.add_argument("-m", dest="memory", type=str, - default="/sys/fs/cgroup/memory/pbspro.service/jobid", - # default="/sys/fs/cgroup", - help="Location of memory process files") - # p.add_argument("-c", dest="cpu", type=str, - # default="/sys/fs/cgroup/cpu,cpuacct/pbspro.service/jobid", - # # default="/sys/fs/cgroup", - # help="Location of cpu cgroup files") - # p.add_argument("-u", dest="cpuset_path", type=str, - # default="/sys/fs/cgroup/cpuset/pbspro.service/jobid", - # help="Location of processor details") - # p.add_argument("-p", dest="proc_path", type=str, - # default="/sys/fs/cgroup/cpuset/pbspro.service/jobid", - # help="Location of processor details") - - args = p.parse_args() - - return args +if __name__ == "__main__": + + arg_parser = get_option_parser() + profile(arg_parser.parse_args()) diff --git a/setup.cfg b/setup.cfg index 016a43db819..4128850bcb7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -163,7 +163,6 @@ cylc.command = kill = cylc.flow.scripts.kill:main lint = cylc.flow.scripts.lint:main list = cylc.flow.scripts.list:main - profiler = cylc.flow.scripts.profile:main message = cylc.flow.scripts.message:main pause = cylc.flow.scripts.pause:main ping = cylc.flow.scripts.ping:main diff --git a/tests/functional/restart/43-auto-restart-force-override-normal.t b/tests/functional/restart/43-auto-restart-force-override-normal.t index 35edc57d1f9..b61d08c68cb 100644 --- a/tests/functional/restart/43-auto-restart-force-override-normal.t +++ b/tests/functional/restart/43-auto-restart-force-override-normal.t @@ -50,10 +50,7 @@ create_test_global_config '' " ${BASE_GLOBAL_CONFIG} [scheduler] [[run hosts]] - available = ${CYLC_TEST_HOST_1}, ${CYLC_TEST_HOST_2} - # ensure the workflow can start if a host is condemned - # in force mode see #6623 - condemned = ${CYLC_TEST_HOST_2}! + available = ${CYLC_TEST_HOST_1} " set_test_number 8 diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py new file mode 100644 index 00000000000..f8560eafd9a --- /dev/null +++ b/tests/unit/scripts/test_profiler.py @@ -0,0 +1,160 @@ +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Tests for functions contained in cylc.flow.scripts.profiler +from cylc.flow.scripts.profiler import (parse_memory_file, + parse_cpu_file, + write_data, + get_cgroup_name, + get_cgroup_version, + get_cgroup_paths, + stop_profiler, + profile) +import pytest +from unittest import mock + + +def test_parse_memory_file(mocker): + + with pytest.raises(FileNotFoundError): + parse_memory_file("non_existent_file.txt") + + # Mock the 'open' function call to return a file object. + mock_file = mocker.mock_open(read_data="1024") + mocker.patch("builtins.open", mock_file) + + # Test the parse_memory_file function + assert parse_memory_file("mocked_file.txt") == 1 + + # Assert that the 'open' function was called with the expected arguments. + mock_file.assert_called_once_with("mocked_file.txt", "r") + + +def test_parse_cpu_file(mocker): + with pytest.raises(FileNotFoundError): + parse_cpu_file("non_existent_file.txt", 1) + + # Mock the 'open' function call to return a file object. + mock_file = mocker.mock_open(read_data="usage_usec 1000000") + mocker.patch("builtins.open", mock_file) + + assert parse_cpu_file("mocked_file.txt", 1) == 1000 + mock_file.assert_called_once_with("mocked_file.txt", "r") + + mock_file = mocker.mock_open(read_data="1000000") + mocker.patch("builtins.open", mock_file) + assert parse_cpu_file("mocked_file.txt", 2) == 1 + mock_file.assert_called_once_with("mocked_file.txt", "r") + + +def test_write_data(tmpdir): + # Create tmp file + file = tmpdir.join('output.txt') + + write_data('test_data', file.strpath) + assert file.read() == 'test_data\n' + + +def test_get_cgroup_name(mocker): + + mock_file = mocker.mock_open(read_data="0::bad/test/cgroup/place") + mocker.patch("builtins.open", mock_file) + with pytest.raises(AttributeError): + get_cgroup_name() + + mock_file = mocker.mock_open(read_data="0::good/cgroup/place/2222222") + mocker.patch("builtins.open", mock_file) + assert get_cgroup_name() == "good/cgroup/place/2222222" + + +def test_get_cgroup_name_file_not_found(mocker): + + def mock_os_pid(): + return 'The Thing That Should Not Be' + + mocker.patch("os.getpid", mock_os_pid) + with pytest.raises(FileNotFoundError): + get_cgroup_name() + + +def test_get_cgroup_version(mocker): + + # Mock the Path.exists function call to return True + mocker.patch("pathlib.Path.exists", return_value=True) + assert get_cgroup_version('stuff/in/place', + 'more_stuff') == 1 + + with mock.patch('pathlib.Path.exists', side_effect=[False, True]): + assert get_cgroup_version('stuff/in/place', + 'more_stuff') == 2 + + # Mock the Path.exists function call to return False + mocker.patch("pathlib.Path.exists", return_value=False) + assert get_cgroup_version('stuff/in/other/place', + 'things') is None + + +def test_get_cgroup_paths(): + + process = get_cgroup_paths(1, "test_location/", + "test_name") + assert process.cgroup_memory_path == "test_location/test_name/memory.peak" + assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" + + process = get_cgroup_paths(2, "test_location", + "/test_name") + assert (process.cgroup_memory_path == + "test_location/memory/test_name/memory.max_usage_in_bytes") + assert (process.cgroup_cpu_path == + "test_location/cpu/test_name/cpuacct.usage") + + +def test_profile_cpu(mocker): + process = get_cgroup_paths(1, "test_location/", + "test_name") + + mock_file = mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", + return_value=0) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", + return_value=2048) + run_once = mock.Mock(side_effect=[True, False]) + profile(process, 1, 1, run_once) + mock_file.assert_called_with("cpu_time", "w") + + +def test_profile_max_rss(mocker): + process = get_cgroup_paths(1, + "test_location/", + "test_name") + + mock_file = mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", + return_value=1024) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", + return_value=2048) + run_once = mock.Mock(side_effect=[True, False]) + profile(process, 1, 1, run_once) + mock_file.assert_called_with("max_rss", "w") + + +def test_stop_profiler(): + with pytest.raises(SystemExit) as pytest_wrapped_e: + stop_profiler() + assert pytest_wrapped_e.type == SystemExit + assert pytest_wrapped_e.value.code == 0 diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index b4fe684a5e9..6b686c27082 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -14,17 +14,16 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -from pathlib import Path -from types import SimpleNamespace from tempfile import ( - NamedTemporaryFile, - SpooledTemporaryFile, - TemporaryFile, - TemporaryDirectory, + NamedTemporaryFile, SpooledTemporaryFile, TemporaryFile, + TemporaryDirectory ) - +import unittest import pytest +from pathlib import Path +from types import SimpleNamespace + from cylc.flow import LOG from cylc.flow.id import Tokens from cylc.flow.cycling.iso8601 import ISO8601Point @@ -45,201 +44,188 @@ from cylc.flow.task_proxy import TaskProxy -def test_get_temporary_file(): - """Test SubProcPool.get_temporary_file.""" - assert isinstance(SubProcPool.get_temporary_file(), SpooledTemporaryFile) - - -def test_run_command_returns_0(): - """Test basic usage, command returns 0""" - ctx = SubProcContext('truth', ['true']) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == '' - assert ctx.ret_code == 0 - - -def test_run_command_returns_1(): - """Test basic usage, command returns 1""" - ctx = SubProcContext('lies', ['false']) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == '' - assert ctx.ret_code == 1 - - -def test_run_command_writes_to_out(): - """Test basic usage, command writes to STDOUT""" - ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'pirate urrrr\n' - assert ctx.ret_code == 0 - - -def test_run_command_writes_to_err(): - """Test basic usage, command writes to STDERR""" - ctx = SubProcContext( - 'parrot2', - ['bash', '--noprofile', '--norc', '-c', 'echo pirate errrr >&2'] - ) - SubProcPool.run_command(ctx) - assert 'pirate errrr\n' - assert ctx.out == '' - assert ctx.ret_code == 0 - - -def test_run_command_with_stdin_from_str(): - """Test STDIN from string""" - ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\n' - assert ctx.ret_code == 0 - - -def test_run_command_with_stdin_from_unicode(): - """Test STDIN from string with Unicode""" - ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == '喵\n' - assert ctx.ret_code == 0 - - -def test_run_command_with_stdin_from_handle(): - """Test STDIN from a single opened file handle""" - handle = TemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) - handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\n' - assert ctx.ret_code == 0 - handle.close() - - -def test_run_command_with_stdin_from_path(): - """Test STDIN from a single file path""" - handle = NamedTemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) - handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\n' - assert ctx.ret_code == 0 - handle.close() - - -def test_run_command_with_stdin_from_handles(): - """Test STDIN from multiple file handles""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: +class TestSubProcPool(unittest.TestCase): + + def test_get_temporary_file(self): + """Test SubProcPool.get_temporary_file.""" + self.assertIsInstance( + SubProcPool.get_temporary_file(), SpooledTemporaryFile) + + def test_run_command_returns_0(self): + """Test basic usage, command returns 0""" + ctx = SubProcContext('truth', ['true']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, '') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_returns_1(self): + """Test basic usage, command returns 1""" + ctx = SubProcContext('lies', ['false']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, '') + self.assertEqual(ctx.ret_code, 1) + + def test_run_command_writes_to_out(self): + """Test basic usage, command writes to STDOUT""" + ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'pirate urrrr\n') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_writes_to_err(self): + """Test basic usage, command writes to STDERR""" + ctx = SubProcContext( + 'parrot2', ['bash', '-c', 'echo pirate errrr >&2']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, 'pirate errrr\n') + self.assertEqual(ctx.out, '') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_with_stdin_from_str(self): + """Test STDIN from string""" + ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\n') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_with_stdin_from_unicode(self): + """Test STDIN from string with Unicode""" + ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, '喵\n') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_with_stdin_from_handle(self): + """Test STDIN from a single opened file handle""" handle = TemporaryFile() - handle.write(txt.encode('UTF-8')) + handle.write('catches mice.\n'.encode('UTF-8')) handle.seek(0) - handles.append(handle) - ctx = SubProcContext('meow', ['cat'], stdin_files=handles) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\neat fish.\n' - assert ctx.ret_code == 0 - for handle in handles: + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\n') + self.assertEqual(ctx.ret_code, 0) handle.close() - -def test_run_command_with_stdin_from_paths(): - """Test STDIN from multiple file paths""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: + def test_run_command_with_stdin_from_path(self): + """Test STDIN from a single file path""" handle = NamedTemporaryFile() - handle.write(txt.encode('UTF-8')) + handle.write('catches mice.\n'.encode('UTF-8')) handle.seek(0) - handles.append(handle) - ctx = SubProcContext( - 'meow', ['cat'], stdin_files=[handle.name for handle in handles] - ) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\neat fish.\n' - assert ctx.ret_code == 0 - for handle in handles: + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\n') + self.assertEqual(ctx.ret_code, 0) handle.close() - -def test_xfunction(): - """Test xtrigger function import.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_answer.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_answer = lambda: 42""") - f.flush() - f_name = "the_answer" - fn = get_xtrig_func(f_name, f_name, temp_dir) - result = fn() - assert 42 == result - - -def test_xfunction_cache(): - """Test xtrigger function import cache.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - amandita_file = python_dir / "amandita.py" - with amandita_file.open(mode="w") as f: - f.write("""choco = lambda: 'chocolate'""") - f.flush() - m_name = "amandita" # module - f_name = "choco" # function - fn = get_xtrig_func(m_name, f_name, temp_dir) - result = fn() - assert 'chocolate' == result - - # is in the cache - assert (m_name, f_name) in _XTRIG_FUNC_CACHE - # returned from cache - assert fn, get_xtrig_func(m_name, f_name == temp_dir) - - -def test_xfunction_import_error(): - """Test for error on importing a xtrigger function. - - To prevent the test eventually failing if the test function is added - and successfully imported, we use an invalid module name as per Python - spec. - """ - with TemporaryDirectory() as temp_dir, pytest.raises(ModuleNotFoundError): - get_xtrig_func("invalid-module-name", "func-name", temp_dir) - - -def test_xfunction_attribute_error(): - """Test for error on looking for an attribute in a xtrigger script.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_sword.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_droid = lambda: 'excalibur'""") - f.flush() - f_name = "the_sword" - with pytest.raises(AttributeError): - get_xtrig_func(f_name, f_name, temp_dir) + def test_run_command_with_stdin_from_handles(self): + """Test STDIN from multiple file handles""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: + handle = TemporaryFile() + handle.write(txt.encode('UTF-8')) + handle.seek(0) + handles.append(handle) + ctx = SubProcContext('meow', ['cat'], stdin_files=handles) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') + self.assertEqual(ctx.ret_code, 0) + for handle in handles: + handle.close() + + def test_run_command_with_stdin_from_paths(self): + """Test STDIN from multiple file paths""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: + handle = NamedTemporaryFile() + handle.write(txt.encode('UTF-8')) + handle.seek(0) + handles.append(handle) + ctx = SubProcContext( + 'meow', ['cat'], stdin_files=[handle.name for handle in handles]) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') + self.assertEqual(ctx.ret_code, 0) + for handle in handles: + handle.close() + + def test_xfunction(self): + """Test xtrigger function import.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_answer.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_answer = lambda: 42""") + f.flush() + f_name = "the_answer" + fn = get_xtrig_func(f_name, f_name, temp_dir) + result = fn() + self.assertEqual(42, result) + + def test_xfunction_cache(self): + """Test xtrigger function import cache.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + amandita_file = python_dir / "amandita.py" + with amandita_file.open(mode="w") as f: + f.write("""choco = lambda: 'chocolate'""") + f.flush() + m_name = "amandita" # module + f_name = "choco" # function + fn = get_xtrig_func(m_name, f_name, temp_dir) + result = fn() + self.assertEqual('chocolate', result) + + # is in the cache + self.assertTrue((m_name, f_name) in _XTRIG_FUNC_CACHE) + # returned from cache + self.assertEqual(fn, get_xtrig_func(m_name, f_name, temp_dir)) + + def test_xfunction_import_error(self): + """Test for error on importing a xtrigger function. + + To prevent the test eventually failing if the test function is added + and successfully imported, we use an invalid module name as per Python + spec. + """ + with TemporaryDirectory() as temp_dir, self.assertRaises( + ModuleNotFoundError + ): + get_xtrig_func("invalid-module-name", "func-name", temp_dir) + + def test_xfunction_attribute_error(self): + """Test for error on looking for an attribute in a xtrigger script.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_sword.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_droid = lambda: 'excalibur'""") + f.flush() + f_name = "the_sword" + with self.assertRaises(AttributeError): + get_xtrig_func(f_name, f_name, temp_dir) @pytest.fixture def mock_ctx(): def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): - """Provide a SimpleNamespace which looks like a ctx object.""" + """Provide a SimpleNamespace which looks like a ctx object. + """ inputs = locals() defaults = { - 'ret_code': 255, - 'host': 'mouse', - 'cmd_key': 'my-command', - 'cmd': ['bistromathic', 'take-off'], + 'ret_code': 255, 'host': 'mouse', 'cmd_key': 'my-command', + 'cmd': ['bistromathic', 'take-off'] } for key in inputs: if inputs[key] is None: @@ -249,10 +235,9 @@ def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): timestamp=None, ret_code=inputs['ret_code'], host=inputs['host'], - cmd_key=inputs['cmd_key'], + cmd_key=inputs['cmd_key'] ) return ctx - yield inner_ @@ -275,18 +260,21 @@ def _test_callback_255(ctx, foo=''): 'platform: None - Could not connect to mouse.', 255, 'ssh', - id="return 255", + id="return 255" ), pytest.param( 'platform: localhost - Could not connect to mouse.', 255, TaskJobLogsRetrieveContext(['ssh', 'something'], None, None), - id="return 255 (log-ret)", - ), - ], + id="return 255 (log-ret)" + ) + ] ) -def test__run_command_exit(caplog, mock_ctx, expect, ret_code, cmd_key): - """It runs a callback""" +def test__run_command_exit( + caplog, mock_ctx, expect, ret_code, cmd_key +): + """It runs a callback + """ ctx = mock_ctx(ret_code=ret_code, cmd_key=cmd_key, cmd=['ssh']) SubProcPool._run_command_exit( ctx, callback=_test_callback, callback_255=_test_callback_255 @@ -305,7 +293,9 @@ def test__run_command_exit_no_255_callback(caplog, mock_ctx): def test__run_command_exit_no_gettable_platform(caplog, mock_ctx): """It logs being unable to select a platform""" ret_ctx = TaskJobLogsRetrieveContext( - platform_name='rhenas', max_size=256, key='rhenas' + platform_name='rhenas', + max_size=256, + key='rhenas' ) ctx = mock_ctx(cmd_key=ret_ctx, cmd=['ssh'], ret_code=255) SubProcPool._run_command_exit(ctx, callback=_test_callback) @@ -320,19 +310,20 @@ def test__run_command_exit_no_255_args(caplog, mock_ctx): mock_ctx(cmd=['ssh', 'Zaphod']), callback=_test_callback, callback_args=['Zaphod'], - callback_255=_test_callback_255, + callback_255=_test_callback_255 ) assert '255' in caplog.records[1].msg def test__run_command_exit_add_to_badhosts(mock_ctx): - """It updates the list of badhosts""" + """It updates the list of badhosts + """ badhosts = {'foo', 'bar'} SubProcPool._run_command_exit( mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=print, - callback_args=['Welcome to Magrathea'], + callback_args=['Welcome to Magrathea'] ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -344,36 +335,32 @@ def test__run_command_exit_add_to_badhosts_log(caplog, mock_ctx): mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=lambda x, t: print(str(x)), - callback_args=[ - TaskProxy( - Tokens('~u/w//c/t/2'), - SimpleNamespace( - name='t', - dependencies={}, - sequential='', - external_triggers=[], - xtrig_labels={}, - expiration_offset=None, - outputs={ - TASK_OUTPUT_SUBMITTED: [None, None], - TASK_OUTPUT_SUBMIT_FAILED: [None, None], - TASK_OUTPUT_SUCCEEDED: [None, None], - TASK_OUTPUT_FAILED: [None, None], - TASK_OUTPUT_EXPIRED: [None, None], - }, - graph_children={}, - rtconfig={'platform': 'foo'}, - ), - ISO8601Point('1990'), - ) - ], + callback_args=[TaskProxy( + Tokens('~u/w//c/t/2'), + SimpleNamespace( + name='t', dependencies={}, sequential='', + external_triggers=[], xtrig_labels={}, + expiration_offset=None, + outputs={ + TASK_OUTPUT_SUBMITTED: [None, None], + TASK_OUTPUT_SUBMIT_FAILED: [None, None], + TASK_OUTPUT_SUCCEEDED: [None, None], + TASK_OUTPUT_FAILED: [None, None], + TASK_OUTPUT_EXPIRED: [None, None], + }, + graph_children={}, rtconfig={'platform': 'foo'} + + ), + ISO8601Point('1990') + )] ) assert 'platform: foo' in caplog.records[0].message assert badhosts == {'foo', 'bar', 'mouse'} def test__run_command_exit_rsync_fails(mock_ctx): - """It updates the list of badhosts""" + """It updates the list of badhosts + """ badhosts = {'foo', 'bar'} ctx = mock_ctx(cmd=['rsync'], ret_code=42, cmd_key='file-install') SubProcPool._run_command_exit( @@ -384,10 +371,10 @@ def test__run_command_exit_rsync_fails(mock_ctx): { 'name': 'Magrathea', 'ssh command': 'ssh', - 'rsync command': 'rsync command', + 'rsync command': 'rsync command' }, 'Welcome to Magrathea', - ], + ] ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -397,11 +384,12 @@ def test__run_command_exit_rsync_fails(mock_ctx): [ (True, {'cmd': ['ssh'], 'ret_code': 255}), (False, {'cmd': ['foo'], 'ret_code': 255}), - (False, {'cmd': ['ssh'], 'ret_code': 42}), - ], + (False, {'cmd': ['ssh'], 'ret_code': 42}) + ] ) def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed""" + """It knows when a ctx has failed + """ output = SubProcPool.ssh_255_fail(mock_ctx(**ctx_kwargs)) assert output == expect @@ -413,10 +401,11 @@ def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): (True, {'cmd': ['rsync'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['make it-so'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['rsync'], 'ret_code': 125, 'host': 'localhost'}), - ], + ] ) def test_rsync_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed""" + """It knows when a ctx has failed + """ output = SubProcPool.rsync_255_fail( mock_ctx(**ctx_kwargs), {'ssh command': 'ssh', 'rsync command': 'rsync command'}, From 36de59fdef358f52ceb59cd94b3d631ee4ed34dc Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Wed, 12 Mar 2025 15:34:32 +0000 Subject: [PATCH 03/47] Time Series now working --- .github/workflows/1_create_release_pr.yml | 13 + .github/workflows/2_auto_publish_release.yml | 11 + .github/workflows/bash.yml | 8 + .github/workflows/build.yml | 49 +- .github/workflows/test_conda-build.yml | 7 + .github/workflows/test_fast.yml | 7 +- .github/workflows/test_functional.yml | 4 + .github/workflows/test_tutorial_workflow.yml | 8 + .gitignore | 3 + changes.d/6663.feat.md | 1 + cylc/flow/cfgspec/globalcfg.py | 46 +- cylc/flow/etc/job.sh | 12 +- cylc/flow/host_select.py | 11 +- cylc/flow/scripts/profile.py | 175 ------- cylc/flow/scripts/profiler.py | 159 +++---- tests/functional/jobscript/02-profiler.t | 7 +- .../43-auto-restart-force-override-normal.t | 5 +- tests/unit/scripts/test_profiler.py | 52 +- tests/unit/test_job_file.py | 12 +- tests/unit/test_subprocpool.py | 449 +++++++++--------- 20 files changed, 524 insertions(+), 515 deletions(-) create mode 100644 changes.d/6663.feat.md delete mode 100755 cylc/flow/scripts/profile.py diff --git a/.github/workflows/1_create_release_pr.yml b/.github/workflows/1_create_release_pr.yml index bf1d7bd399d..662c1d8d521 100644 --- a/.github/workflows/1_create_release_pr.yml +++ b/.github/workflows/1_create_release_pr.yml @@ -11,6 +11,19 @@ on: required: false default: 'master' +concurrency: + # Only let this run 1 at a time + group: ${{ github.workflow }} + cancel-in-progress: false + +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: create-release-pr: runs-on: ubuntu-latest diff --git a/.github/workflows/2_auto_publish_release.yml b/.github/workflows/2_auto_publish_release.yml index ebbb7c3b751..72bc8c35ff4 100644 --- a/.github/workflows/2_auto_publish_release.yml +++ b/.github/workflows/2_auto_publish_release.yml @@ -7,7 +7,18 @@ on: # NOTE: While this is too generic, we use the `if` condition of the job to narrow it down # NOTE: Don't use `branches` as we might create release on any branch +concurrency: + # Only let this run 1 at a time + group: ${{ github.workflow }} + cancel-in-progress: false + +defaults: + run: + shell: bash + env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off # Best not to include the GH token here, only do it for the steps that need it MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }} CHANGELOG_FILE: CHANGES.md diff --git a/.github/workflows/bash.yml b/.github/workflows/bash.yml index 48ead8fb5dd..52408e80e01 100644 --- a/.github/workflows/bash.yml +++ b/.github/workflows/bash.yml @@ -31,6 +31,14 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: bash-docker: runs-on: ubuntu-latest diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 778d63a4553..567b05aee2e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,6 +10,20 @@ on: - 'MANIFEST.in' # check packaging - 'pyproject.toml' # check build config - 'setup.cfg' # check deps and project config + - '.gitignore' + - '.github/workflows/build.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -leo pipefail {0} + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off jobs: test: @@ -18,26 +32,55 @@ jobs: strategy: fail-fast: false matrix: +<<<<<<< HEAD +<<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD os: ['ubuntu-latest', 'macos-latest'] python: ['3.12', '3'] ======= +======= +>>>>>>> 38c31f56d (Fail gracefully if cgroups cannot be found) os: ['ubuntu-latest'] python: ['3.8', '3.9', '3.10', '3.11'] include: - os: 'ubuntu-22.04' +<<<<<<< HEAD python: '3.7' - os: 'macos-latest' python: '3.8' >>>>>>> 9727be9e5 (CPU/Memory Logging working) +======= + os: ['ubuntu-latest', 'macos-latest'] + python: ['3.7', '3.8', '3.9', '3.10', '3'] + exclude: + - os: 'macos-latest' + python: '3.7' +>>>>>>> 171f7ee1d (GH Actions: use explicit `bash` shell & other defaults) +======= + python: '3.7' + - os: 'macos-latest' + python: '3.8' +>>>>>>> 38c31f56d (Fail gracefully if cgroups cannot be found) +======= + os: ['ubuntu-latest', 'macos-latest'] + python: ['3.7', '3.8', '3.9', '3.10', '3'] + exclude: + - os: 'macos-latest' + python: '3.7' +>>>>>>> ad6b3a133 (Fixing my terrible rebasing) steps: - name: Checkout uses: actions/checkout@v6 - name: Setup Python - uses: actions/setup-python@v5 + uses: mamba-org/setup-micromamba@v2 with: - python-version: ${{ matrix.python }} + cache-environment: true + post-cleanup: 'all' + environment-name: cylc-build + create-args: >- + python=${{ matrix.python }} - name: Build uses: cylc/release-actions/build-python-package@v1 @@ -45,7 +88,7 @@ jobs: - name: Inspect run: | unzip -l dist/*.whl | tee files - grep 'cylc_flow.*.dist-info/COPYING' files + grep -E 'cylc_flow.*.dist-info/.*COPYING' files grep 'cylc/flow/py.typed' files grep 'cylc/flow/etc' files grep 'cylc/flow/etc/cylc-completion.bash' files diff --git a/.github/workflows/test_conda-build.yml b/.github/workflows/test_conda-build.yml index 09c35a44e0e..9f6f23fa659 100644 --- a/.github/workflows/test_conda-build.yml +++ b/.github/workflows/test_conda-build.yml @@ -15,6 +15,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + jobs: test_conda_install: if: github.repository_owner == 'cylc' || github.event_name != 'schedule' diff --git a/.github/workflows/test_fast.yml b/.github/workflows/test_fast.yml index 4201d6a0df5..460c6e3a867 100644 --- a/.github/workflows/test_fast.yml +++ b/.github/workflows/test_fast.yml @@ -16,6 +16,9 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" +env: + PIP_PROGRESS_BAR: off + jobs: test: runs-on: ${{ matrix.os }} @@ -32,11 +35,9 @@ jobs: - os: 'ubuntu-latest' python-version: '3' time-zone: 'XXX-09:35' - env: TZ: ${{ matrix.time-zone }} PYTEST_ADDOPTS: --cov --cov-append -n 5 --color=yes - steps: - name: Checkout uses: actions/checkout@v6 @@ -111,6 +112,8 @@ jobs: strategy: matrix: python-version: ['3'] + env: + FORCE_COLOR: 2 steps: - name: Checkout uses: actions/checkout@v6 diff --git a/.github/workflows/test_functional.yml b/.github/workflows/test_functional.yml index 897cf259497..37171697651 100644 --- a/.github/workflows/test_functional.yml +++ b/.github/workflows/test_functional.yml @@ -36,6 +36,10 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: test: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_tutorial_workflow.yml b/.github/workflows/test_tutorial_workflow.yml index 6973c0e0421..f29b5f8885b 100644 --- a/.github/workflows/test_tutorial_workflow.yml +++ b/.github/workflows/test_tutorial_workflow.yml @@ -17,6 +17,14 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: test: strategy: diff --git a/.gitignore b/.gitignore index b18c09ad1eb..f7e26153db0 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,9 @@ __pycache__/ # vscode .vscode +# pycharm +.idea + # processed workflow configs *.rc.processed *.cylc.processed diff --git a/changes.d/6663.feat.md b/changes.d/6663.feat.md new file mode 100644 index 00000000000..9a40cc43d59 --- /dev/null +++ b/changes.d/6663.feat.md @@ -0,0 +1 @@ +Adding CPU time and Max RSS to Analysis Tools diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index c0872bebfa9..064e72da1b5 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -943,16 +943,52 @@ def default_for( range. ''') Conf('condemned', VDR.V_ABSOLUTE_HOST_LIST, desc=f''' - These hosts will not be used to run jobs. + List run hosts that workflows should *not* run on. - If workflows are already running on - condemned hosts, Cylc will shut them down and - restart them on different hosts. + These hosts will be subtracted from the + `available ` hosts + preventing new workflows from starting on the "condemned" host. + + Any workflows running on these hosts will either migrate + to another host, or shut down according to + :py:mod:`the configuration `. + + This feature requires ``auto restart`` to be listed + in `global.cylc[scheduler][main loop]plugins`. + + For more information, see the + :py:mod:`auto restart ` + plugin. + + .. rubric:: Example: + + .. code-block:: cylc + + [scheduler] + [[main loop]] + # activate the "auto restart" plugin + plugins = auto restart + [[run hosts]] + # there are three hosts in the "pool" + available = host1, host2, host3 + + # however two have been taken out: + # * workflows running on "host1" will attempt to + # restart on "host3" + # * workflows running on "host2" will shutdown + condemned = host1, host2! .. seealso:: + :py:mod:`cylc.flow.main_loop.auto_restart` :ref:`auto-stop-restart` + .. versionchanged:: 8.4.2 + + The "force mode" (activated by a "!" suffix) caused issues + at workflow startup for Cylc versions between 8.0.0 and + 8.4.1 inclusive. + .. versionchanged:: 8.0.0 {REPLACES}``[suite servers]condemned hosts``. @@ -1486,7 +1522,7 @@ def default_for( The means by which task progress messages are reported back to the running workflow. - ..rubric:: Options: + .. rubric:: Options: zmq Direct client-server TCP communication via network ports diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 0d1e3275604..a4747579538 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -208,14 +208,14 @@ cylc__set_return() { ############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { - if [[ -n ${profiler_pid:-} ]]; then - kill -s SIGINT "${profiler_pid}" + if [[ -n "${cpu_time:-}" ]]; then + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time" || true fi - if [ -n "${max_rss}" ]; then + if [[ -n "${max_rss:-}" ]]; then cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: max_rss $max_rss" || true fi - if [ -n "${cpu_time}" ]; then - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time" || true + if [[ -f "proc/${profiler_pid}" ]]; then + kill -s SIGINT "${profiler_pid}" || true fi } @@ -293,7 +293,6 @@ cylc__job__run_inst_func() { # Returns: # exit ${CYLC_TASK_USER_SCRIPT_EXITCODE} cylc__job_finish_err() { - cylc__kill_profiler CYLC_TASK_USER_SCRIPT_EXITCODE="${CYLC_TASK_USER_SCRIPT_EXITCODE:-$?}" typeset signal="$1" typeset run_err_script="$2" @@ -301,6 +300,7 @@ cylc__job_finish_err() { # (Ignore shellcheck "globbing and word splitting" warning here). # shellcheck disable=SC2086 trap '' ${CYLC_VACATION_SIGNALS:-} ${CYLC_FAIL_SIGNALS} + cylc__kill_profiler if [[ -n "${CYLC_TASK_MESSAGE_STARTED_PID:-}" ]]; then wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true fi diff --git a/cylc/flow/host_select.py b/cylc/flow/host_select.py index fdf7cf2a8d6..678306611f6 100644 --- a/cylc/flow/host_select.py +++ b/cylc/flow/host_select.py @@ -143,6 +143,13 @@ def select_workflow_host(cached=True): # be returned with the up-to-date configuration. global_config = glbl_cfg(cached=cached) + # condemned hosts may be suffixed with an "!" to activate "force mode" + blacklist = [] + for host in global_config.get(['scheduler', 'run hosts', 'condemned'], []): + if host.endswith('!'): + host = host[:-1] + blacklist.append(host) + return select_host( # list of workflow hosts global_config.get([ @@ -153,9 +160,7 @@ def select_workflow_host(cached=True): 'scheduler', 'run hosts', 'ranking' ]), # list of condemned hosts - blacklist=global_config.get( - ['scheduler', 'run hosts', 'condemned'] - ), + blacklist=blacklist, blacklist_name='condemned host' ) diff --git a/cylc/flow/scripts/profile.py b/cylc/flow/scripts/profile.py deleted file mode 100755 index 2d3f1faa0ce..00000000000 --- a/cylc/flow/scripts/profile.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python3 -# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. -# Copyright (C) NIWA & British Crown (Met Office) & Contributors. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -"""cylc profiler [OPTIONS] - -Profiler which periodically polls PBS cgroups to track -the resource usage of jobs running on the node. -""" - -import os -import re -import sys -import time -import signal -import subprocess -from pathlib import Path -from dataclasses import dataclass -from cylc.flow.terminal import cli_function -from cylc.flow.option_parsers import CylcOptionParser as COP - -INTERNAL = True -PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") - - - -def get_option_parser() -> COP: - parser = COP( - __doc__, - argdoc=[ - ], - ) - parser.add_option( - "-i", type=int, help="interval between query cycles in seconds", - default=10, dest="delay") - parser.add_option( - "-o", type=str, help="output directory for json file", - default=os.environ['DATADIR'], dest="output_dir") - parser.add_option( - "-m", type=str, help="Location of memory process files", - default="/sys/fs/cgroup", - dest="memory") - - return parser - - -@cli_function(get_option_parser) -def main(parser, options): - """CLI main.""" - # Register the stop_profiler function with the signal library - signal.signal(signal.SIGINT, stop_profiler) - - profile(options) - - -@dataclass -class Process: - """Class for representing CPU and Memory usage of a process""" - cgroup_memory_path: str - cgroup_cpu_path: str - job_id: str - - -def stop_profiler(*args): - """This function will be executed when the SIGINT signal is sent - to this process""" - print('profiler exited') - sys.exit(0) - - -def parse_memory_file(process): - """Open the memory stat file and copy the appropriate data""" - memory_stats = {} - - for line in open(process.cgroup_memory_path): - return int(line) - - -def parse_cpu_file(process): - """Open the memory stat file and copy the appropriate data""" - memory_stats = {} - - for line in open(process.cgroup_cpu_path): - if "usage_usec" in line: - return int(re.findall(r'\d+', line)[0]) - - -def write_data(process, data, output_dir, data_type, filename): - - # Build the output file path - path = os.path.join(output_dir, process.job_id + data_type) - try: - with open(filename, 'w') as f: - f.write(data + "\n") - except IOError: - raise IOError("Unable to write memory data to file") - - -def get_cgroup_dir(): - """Get the cgroup directory for the current process""" - # Get the PID of the current process - pid = os.getpid() - # Get the cgroup information for the current process - result = subprocess.run(['cat', '/proc/' + str(pid) + '/cgroup'], capture_output=True, text=True) - result = PID_REGEX.search(result.stdout).group() - return result - - -def profile(args): - - cgroup_name = get_cgroup_dir() - - # AZURE SPICE CGROUP LOCATION - cgroup_location = "/sys/fs/cgroup/" + cgroup_name - peak_memory = 0 - processes = [] - # last_system_usage = None - # last_cpu_usage = None - # Find the correct memory_stat file for the process - if not Path.exists(Path(cgroup_location)): - raise FileNotFoundError("cgroups not found:" + cgroup_location) - try: - # Find memory.stat files - for job_id in os.listdir(cgroup_location): - if "memory.peak" in job_id: - processes.append(Process( - cgroup_memory_path=cgroup_location + "/" + job_id, - cgroup_cpu_path=cgroup_location + "/" + "cpu.stat", - job_id=job_id)) - except FileNotFoundError as e: - print(e) - raise FileNotFoundError("cgroups not found:" + cgroup_location) - - # cpu_count = get_host_num_cpus(args.cpuset_path, processes) - while True: - # Write memory usage data - for process in processes: - # Only save Max RSS to disk if it is above the previous value - try: - memory = parse_memory_file(process) - if memory > peak_memory: - peak_memory = memory - write_data(process, str(peak_memory), args.output_dir, ".memory", "max_rss") - cpu_time = parse_cpu_file(process) - write_data(process, str(cpu_time), args.output_dir, ".cpu", "cpu_time") - - except (OSError, IOError, ValueError) as error: - print(error) - - # process, usage_percent = get_cpu_percent( - # cpu_count, args.proc_path, - # process, last_system_usage, last_cpu_usage) - # - # write_data(process, usage_percent, - # args.output_dir, ".cpu") - - time.sleep(args.delay) - - -if __name__ == "__main__": - - arg_parser = get_option_parser() - profile(arg_parser.parse_args()) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 392a66569c6..b31c564638f 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -16,7 +16,7 @@ # along with this program. If not, see . """cylc profiler [OPTIONS] -Profiler which periodically polls PBS cgroups to track +Profiler which periodically polls cgroups to track the resource usage of jobs running on the node. """ @@ -25,7 +25,6 @@ import sys import time import signal -import subprocess from pathlib import Path from dataclasses import dataclass from cylc.flow.terminal import cli_function @@ -54,14 +53,14 @@ def get_option_parser() -> COP: @cli_function(get_option_parser) -def main(parser, options): +def main(parser: COP, options) -> None: """CLI main.""" # Register the stop_profiler function with the signal library signal.signal(signal.SIGINT, stop_profiler) signal.signal(signal.SIGHUP, stop_profiler) signal.signal(signal.SIGTERM, stop_profiler) - profile(options) + get_config(options) @dataclass @@ -78,117 +77,111 @@ def stop_profiler(*args): sys.exit(0) -def parse_memory_file(process): +def parse_memory_file(cgroup_memory_path): """Open the memory stat file and copy the appropriate data""" - with open(process.cgroup_memory_path, 'r') as f: + with open(cgroup_memory_path, 'r') as f: for line in f: return int(line) // 1024 -def parse_cpu_file(process, cgroup_version): +def parse_cpu_file(cgroup_cpu_path, cgroup_version): """Open the memory stat file and return the appropriate data""" if cgroup_version == 1: - with open(process.cgroup_cpu_path, 'r') as f: + with open(cgroup_cpu_path, 'r') as f: for line in f: if "usage_usec" in line: return int(RE_INT.findall(line)[0]) // 1000 elif cgroup_version == 2: - with open(process.cgroup_cpu_path, 'r') as f: + with open(cgroup_cpu_path, 'r') as f: for line in f: # Cgroups v2 uses nanoseconds return int(line) / 1000000 - else: - raise FileNotFoundError("cpu usage files not found") def write_data(data, filename): - try: - with open(filename, 'w') as f: - f.write(data + "\n") - except IOError as err: - raise IOError("Unable to write data to file:" + filename) from err + with open(filename, 'w') as f: + f.write(data + "\n") + + +def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: + # HPC uses cgroups v2 and SPICE uses cgroups v1 + if Path.exists(Path(cgroup_location + cgroup_name)): + return 1 + elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): + return 2 + else: + raise FileNotFoundError("Cgroup not found") -def get_cgroup_dir(): +def get_cgroup_name(): """Get the cgroup directory for the current process""" # Get the PID of the current process pid = os.getpid() - # Get the cgroup information for the current process - result = subprocess.run(['cat', '/proc/' + str(pid) + '/cgroup'], - capture_output=True, text=True, shell=False) - if result.stderr: - print(result.stderr, file=sys.stderr) - result = PID_REGEX.search(result.stdout).group() - return result + try: + # Get the cgroup information for the current process + with open('/proc/' + str(pid) + '/cgroup', 'r') as f: + result = f.read() + result = PID_REGEX.search(result).group() + return result + except FileNotFoundError as err: + raise FileNotFoundError( + '/proc/' + str(pid) + '/cgroup not found') from err + + except AttributeError as err: + raise AttributeError("No cgroup found for process:", pid) from err + + +def get_cgroup_paths(version, location, name): + + if version == 1: + return Process( + cgroup_memory_path=location + + name + "/" + "memory.peak", + cgroup_cpu_path=location + + name + "/" + "cpu.stat") + + elif version == 2: + return Process( + cgroup_memory_path=location + "/memory" + + name + "/memory.max_usage_in_bytes", + cgroup_cpu_path=location + "/cpu" + + name + "/cpuacct.usage") + + +def profile(process, version, delay, keep_looping=lambda: True): + # The infinite loop that will constantly poll the cgroup + # The lambda function is used to allow the loop to be stopped in unit tests + peak_memory = 0 + while keep_looping(): + # Write cpu / memory usage data to disk + cpu_time = parse_cpu_file(process.cgroup_cpu_path, version) + write_data(str(cpu_time), "cpu_time") + + memory = parse_memory_file(process.cgroup_memory_path) + # Only save Max RSS to disk if it is above the previous value + if memory > peak_memory: + peak_memory = memory + write_data(str(peak_memory), "max_rss") + + time.sleep(delay) -def profile(args): +def get_config(args): # Find the cgroup that this process is running in. # Cylc will put this profiler in the same cgroup # as the job it is profiling - cgroup_name = get_cgroup_dir() + cgroup_name = get_cgroup_name() + cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) + process = get_cgroup_paths(cgroup_version, + args.cgroups_location, + cgroup_name) - # HPC uses cgroups v2 and SPICE uses cgroups v1 - cgroup_version = None - - if Path.exists(Path(args.cgroup_location + cgroup_name)): - cgroup_version = 1 - elif Path.exists(Path(args.cgroup_location + "/memory" + cgroup_name)): - cgroup_version = 2 - else: - raise FileNotFoundError("cgroups not found:" + cgroup_name) - - peak_memory = 0 - processes = [] - - if cgroup_version == 1: - try: - processes.append(Process( - cgroup_memory_path=args.cgroup_location + - cgroup_name + "/" + "memory.peak", - cgroup_cpu_path=args.cgroup_location + - cgroup_name + "/" + "cpu.stat")) - except FileNotFoundError as err: - print(err) - raise FileNotFoundError("cgroups not found:" - + args.cgroup_location) from err - - elif cgroup_version == 2: - try: - processes.append(Process( - cgroup_memory_path=args.cgroup_location + "/memory" + - cgroup_name + "/memory.max_usage_in_bytes", - cgroup_cpu_path=args.cgroup_location + "/cpu" + - cgroup_name + "/cpuacct.usage")) - except FileNotFoundError as err: - print(err) - raise FileNotFoundError("cgroups not found:" + - args.cgroup_location) from err - - while True: - failures = 0 - # Write memory usage data - for process in processes: - # Only save Max RSS to disk if it is above the previous value - try: - memory = parse_memory_file(process) - if memory > peak_memory: - peak_memory = memory - write_data(str(peak_memory), "max_rss") - cpu_time = parse_cpu_file(process, cgroup_version) - write_data(str(cpu_time), "cpu_time") - - except (OSError, ValueError) as error: - failures += 1 - if failures > 5: - raise OSError("cgroup polling failure", error) from error - - time.sleep(args.delay) + profile(process, cgroup_version, args.delay) if __name__ == "__main__": arg_parser = get_option_parser() - profile(arg_parser.parse_args()) + get_config(arg_parser.parse_args([])) diff --git a/tests/functional/jobscript/02-profiler.t b/tests/functional/jobscript/02-profiler.t index 50e016c5ea0..e43705d4720 100644 --- a/tests/functional/jobscript/02-profiler.t +++ b/tests/functional/jobscript/02-profiler.t @@ -18,7 +18,7 @@ # cylc profile test . "$(dirname "$0")/test_header" #------------------------------------------------------------------------------- -set_test_number 3 +set_test_number 2 #------------------------------------------------------------------------------- install_workflow "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" #------------------------------------------------------------------------------- @@ -34,13 +34,12 @@ fi export PATH_TO_CYLC_BIN="/path/to/cylc/bin" create_test_global_config ' [platforms] - [[profile]] + [[localhost]] + [[[profile]]] activate = true ' #------------------------------------------------------------------------------- TEST_NAME="${TEST_NAME_BASE}-run" workflow_run_ok "${TEST_NAME}" cylc play --reference-test --debug --no-detach "${WORKFLOW_NAME}" -grep_ok 'MAXRSS' "${WORKFLOW_RUN_DIR}/log/scheduler/log" - purge diff --git a/tests/functional/restart/43-auto-restart-force-override-normal.t b/tests/functional/restart/43-auto-restart-force-override-normal.t index b61d08c68cb..35edc57d1f9 100644 --- a/tests/functional/restart/43-auto-restart-force-override-normal.t +++ b/tests/functional/restart/43-auto-restart-force-override-normal.t @@ -50,7 +50,10 @@ create_test_global_config '' " ${BASE_GLOBAL_CONFIG} [scheduler] [[run hosts]] - available = ${CYLC_TEST_HOST_1} + available = ${CYLC_TEST_HOST_1}, ${CYLC_TEST_HOST_2} + # ensure the workflow can start if a host is condemned + # in force mode see #6623 + condemned = ${CYLC_TEST_HOST_2}! " set_test_number 8 diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index f8560eafd9a..e0df1c9f881 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -51,7 +51,8 @@ def test_parse_cpu_file(mocker): mock_file = mocker.mock_open(read_data="usage_usec 1000000") mocker.patch("builtins.open", mock_file) - assert parse_cpu_file("mocked_file.txt", 1) == 1000 + assert parse_cpu_file( + "mocked_file.txt", 1) == 1000 mock_file.assert_called_once_with("mocked_file.txt", "r") mock_file = mocker.mock_open(read_data="1000000") @@ -103,8 +104,9 @@ def test_get_cgroup_version(mocker): # Mock the Path.exists function call to return False mocker.patch("pathlib.Path.exists", return_value=False) - assert get_cgroup_version('stuff/in/other/place', - 'things') is None + with pytest.raises(FileNotFoundError): + get_cgroup_version('stuff/in/other/place', + 'things') def test_get_cgroup_paths(): @@ -137,6 +139,13 @@ def test_profile_cpu(mocker): mock_file.assert_called_with("cpu_time", "w") +def test_stop_profiler(): + with pytest.raises(SystemExit) as pytest_wrapped_e: + stop_profiler() + assert pytest_wrapped_e.type == SystemExit + assert pytest_wrapped_e.value.code == 0 + + def test_profile_max_rss(mocker): process = get_cgroup_paths(1, "test_location/", @@ -153,8 +162,35 @@ def test_profile_max_rss(mocker): mock_file.assert_called_with("max_rss", "w") -def test_stop_profiler(): - with pytest.raises(SystemExit) as pytest_wrapped_e: - stop_profiler() - assert pytest_wrapped_e.type == SystemExit - assert pytest_wrapped_e.value.code == 0 +def test_profile_1(mocker): + process = get_cgroup_paths( + 1, "test_location/", "test_name") + + mock_file = mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch( + "cylc.flow.scripts.profiler.parse_memory_file", return_value=1024) + mocker.patch( + "cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) + run_once = mock.Mock(side_effect=[True, False]) + + profile(process, 1, 1, run_once) + mock_file.assert_called_with("max_rss", "w") + + +def test_profile_2(mocker): + # assert_called_with only shows the last call to open(). + # Setting peak memory to zero stops the memory call to open + process = get_cgroup_paths( + 1, "test_location/", "test_name") + + mock_file = mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch( + "cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) + mocker.patch( + "cylc.flow.scripts.profiler.parse_memory_file", return_value=0) + run_once = mock.Mock(side_effect=[True, False]) + + profile(process, 1, 1, run_once) + mock_file.assert_called_with("cpu_time", "w") diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 32bdcea2861..617d02fc1e1 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -404,11 +404,12 @@ def test_write_task_environment(): 'CYLC_TASK_PARAM_mouse="squeak"\n ' 'CYLC_TASK_WORK_DIR_BASE=\'farm_noises/work_d\'\n}') job_conf = { - "platform": {'communication method': 'ssh', - 'profile': { - "activate": "true", - } - }, + "platform": { + 'communication method': 'ssh', + 'profile': { + "activate": "true", + } + }, "job_d": "1/moo/01", "namespace_hierarchy": ["baa", "moo"], "dependencies": ['moo', 'neigh', 'quack'], @@ -539,4 +540,3 @@ def test_homeless_platform(fixture_get_platform): job_sh_txt = job_sh.read() if 'HOME' in job_sh_txt: raise Exception('$HOME found in job.sh\n{job_sh_txt}') - diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index 6b686c27082..b4fe684a5e9 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -14,15 +14,16 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +from pathlib import Path +from types import SimpleNamespace from tempfile import ( - NamedTemporaryFile, SpooledTemporaryFile, TemporaryFile, - TemporaryDirectory + NamedTemporaryFile, + SpooledTemporaryFile, + TemporaryFile, + TemporaryDirectory, ) -import unittest -import pytest -from pathlib import Path -from types import SimpleNamespace +import pytest from cylc.flow import LOG from cylc.flow.id import Tokens @@ -44,188 +45,201 @@ from cylc.flow.task_proxy import TaskProxy -class TestSubProcPool(unittest.TestCase): - - def test_get_temporary_file(self): - """Test SubProcPool.get_temporary_file.""" - self.assertIsInstance( - SubProcPool.get_temporary_file(), SpooledTemporaryFile) - - def test_run_command_returns_0(self): - """Test basic usage, command returns 0""" - ctx = SubProcContext('truth', ['true']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, '') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_returns_1(self): - """Test basic usage, command returns 1""" - ctx = SubProcContext('lies', ['false']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, '') - self.assertEqual(ctx.ret_code, 1) - - def test_run_command_writes_to_out(self): - """Test basic usage, command writes to STDOUT""" - ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'pirate urrrr\n') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_writes_to_err(self): - """Test basic usage, command writes to STDERR""" - ctx = SubProcContext( - 'parrot2', ['bash', '-c', 'echo pirate errrr >&2']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, 'pirate errrr\n') - self.assertEqual(ctx.out, '') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_with_stdin_from_str(self): - """Test STDIN from string""" - ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\n') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_with_stdin_from_unicode(self): - """Test STDIN from string with Unicode""" - ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, '喵\n') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_with_stdin_from_handle(self): - """Test STDIN from a single opened file handle""" +def test_get_temporary_file(): + """Test SubProcPool.get_temporary_file.""" + assert isinstance(SubProcPool.get_temporary_file(), SpooledTemporaryFile) + + +def test_run_command_returns_0(): + """Test basic usage, command returns 0""" + ctx = SubProcContext('truth', ['true']) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == '' + assert ctx.ret_code == 0 + + +def test_run_command_returns_1(): + """Test basic usage, command returns 1""" + ctx = SubProcContext('lies', ['false']) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == '' + assert ctx.ret_code == 1 + + +def test_run_command_writes_to_out(): + """Test basic usage, command writes to STDOUT""" + ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'pirate urrrr\n' + assert ctx.ret_code == 0 + + +def test_run_command_writes_to_err(): + """Test basic usage, command writes to STDERR""" + ctx = SubProcContext( + 'parrot2', + ['bash', '--noprofile', '--norc', '-c', 'echo pirate errrr >&2'] + ) + SubProcPool.run_command(ctx) + assert 'pirate errrr\n' + assert ctx.out == '' + assert ctx.ret_code == 0 + + +def test_run_command_with_stdin_from_str(): + """Test STDIN from string""" + ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\n' + assert ctx.ret_code == 0 + + +def test_run_command_with_stdin_from_unicode(): + """Test STDIN from string with Unicode""" + ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == '喵\n' + assert ctx.ret_code == 0 + + +def test_run_command_with_stdin_from_handle(): + """Test STDIN from a single opened file handle""" + handle = TemporaryFile() + handle.write('catches mice.\n'.encode('UTF-8')) + handle.seek(0) + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\n' + assert ctx.ret_code == 0 + handle.close() + + +def test_run_command_with_stdin_from_path(): + """Test STDIN from a single file path""" + handle = NamedTemporaryFile() + handle.write('catches mice.\n'.encode('UTF-8')) + handle.seek(0) + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\n' + assert ctx.ret_code == 0 + handle.close() + + +def test_run_command_with_stdin_from_handles(): + """Test STDIN from multiple file handles""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: handle = TemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) + handle.write(txt.encode('UTF-8')) handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\n') - self.assertEqual(ctx.ret_code, 0) + handles.append(handle) + ctx = SubProcContext('meow', ['cat'], stdin_files=handles) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\neat fish.\n' + assert ctx.ret_code == 0 + for handle in handles: handle.close() - def test_run_command_with_stdin_from_path(self): - """Test STDIN from a single file path""" + +def test_run_command_with_stdin_from_paths(): + """Test STDIN from multiple file paths""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: handle = NamedTemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) + handle.write(txt.encode('UTF-8')) handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\n') - self.assertEqual(ctx.ret_code, 0) + handles.append(handle) + ctx = SubProcContext( + 'meow', ['cat'], stdin_files=[handle.name for handle in handles] + ) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\neat fish.\n' + assert ctx.ret_code == 0 + for handle in handles: handle.close() - def test_run_command_with_stdin_from_handles(self): - """Test STDIN from multiple file handles""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: - handle = TemporaryFile() - handle.write(txt.encode('UTF-8')) - handle.seek(0) - handles.append(handle) - ctx = SubProcContext('meow', ['cat'], stdin_files=handles) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') - self.assertEqual(ctx.ret_code, 0) - for handle in handles: - handle.close() - - def test_run_command_with_stdin_from_paths(self): - """Test STDIN from multiple file paths""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: - handle = NamedTemporaryFile() - handle.write(txt.encode('UTF-8')) - handle.seek(0) - handles.append(handle) - ctx = SubProcContext( - 'meow', ['cat'], stdin_files=[handle.name for handle in handles]) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') - self.assertEqual(ctx.ret_code, 0) - for handle in handles: - handle.close() - - def test_xfunction(self): - """Test xtrigger function import.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_answer.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_answer = lambda: 42""") - f.flush() - f_name = "the_answer" - fn = get_xtrig_func(f_name, f_name, temp_dir) - result = fn() - self.assertEqual(42, result) - - def test_xfunction_cache(self): - """Test xtrigger function import cache.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - amandita_file = python_dir / "amandita.py" - with amandita_file.open(mode="w") as f: - f.write("""choco = lambda: 'chocolate'""") - f.flush() - m_name = "amandita" # module - f_name = "choco" # function - fn = get_xtrig_func(m_name, f_name, temp_dir) - result = fn() - self.assertEqual('chocolate', result) - - # is in the cache - self.assertTrue((m_name, f_name) in _XTRIG_FUNC_CACHE) - # returned from cache - self.assertEqual(fn, get_xtrig_func(m_name, f_name, temp_dir)) - - def test_xfunction_import_error(self): - """Test for error on importing a xtrigger function. - - To prevent the test eventually failing if the test function is added - and successfully imported, we use an invalid module name as per Python - spec. - """ - with TemporaryDirectory() as temp_dir, self.assertRaises( - ModuleNotFoundError - ): - get_xtrig_func("invalid-module-name", "func-name", temp_dir) - - def test_xfunction_attribute_error(self): - """Test for error on looking for an attribute in a xtrigger script.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_sword.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_droid = lambda: 'excalibur'""") - f.flush() - f_name = "the_sword" - with self.assertRaises(AttributeError): - get_xtrig_func(f_name, f_name, temp_dir) + +def test_xfunction(): + """Test xtrigger function import.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_answer.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_answer = lambda: 42""") + f.flush() + f_name = "the_answer" + fn = get_xtrig_func(f_name, f_name, temp_dir) + result = fn() + assert 42 == result + + +def test_xfunction_cache(): + """Test xtrigger function import cache.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + amandita_file = python_dir / "amandita.py" + with amandita_file.open(mode="w") as f: + f.write("""choco = lambda: 'chocolate'""") + f.flush() + m_name = "amandita" # module + f_name = "choco" # function + fn = get_xtrig_func(m_name, f_name, temp_dir) + result = fn() + assert 'chocolate' == result + + # is in the cache + assert (m_name, f_name) in _XTRIG_FUNC_CACHE + # returned from cache + assert fn, get_xtrig_func(m_name, f_name == temp_dir) + + +def test_xfunction_import_error(): + """Test for error on importing a xtrigger function. + + To prevent the test eventually failing if the test function is added + and successfully imported, we use an invalid module name as per Python + spec. + """ + with TemporaryDirectory() as temp_dir, pytest.raises(ModuleNotFoundError): + get_xtrig_func("invalid-module-name", "func-name", temp_dir) + + +def test_xfunction_attribute_error(): + """Test for error on looking for an attribute in a xtrigger script.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_sword.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_droid = lambda: 'excalibur'""") + f.flush() + f_name = "the_sword" + with pytest.raises(AttributeError): + get_xtrig_func(f_name, f_name, temp_dir) @pytest.fixture def mock_ctx(): def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): - """Provide a SimpleNamespace which looks like a ctx object. - """ + """Provide a SimpleNamespace which looks like a ctx object.""" inputs = locals() defaults = { - 'ret_code': 255, 'host': 'mouse', 'cmd_key': 'my-command', - 'cmd': ['bistromathic', 'take-off'] + 'ret_code': 255, + 'host': 'mouse', + 'cmd_key': 'my-command', + 'cmd': ['bistromathic', 'take-off'], } for key in inputs: if inputs[key] is None: @@ -235,9 +249,10 @@ def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): timestamp=None, ret_code=inputs['ret_code'], host=inputs['host'], - cmd_key=inputs['cmd_key'] + cmd_key=inputs['cmd_key'], ) return ctx + yield inner_ @@ -260,21 +275,18 @@ def _test_callback_255(ctx, foo=''): 'platform: None - Could not connect to mouse.', 255, 'ssh', - id="return 255" + id="return 255", ), pytest.param( 'platform: localhost - Could not connect to mouse.', 255, TaskJobLogsRetrieveContext(['ssh', 'something'], None, None), - id="return 255 (log-ret)" - ) - ] + id="return 255 (log-ret)", + ), + ], ) -def test__run_command_exit( - caplog, mock_ctx, expect, ret_code, cmd_key -): - """It runs a callback - """ +def test__run_command_exit(caplog, mock_ctx, expect, ret_code, cmd_key): + """It runs a callback""" ctx = mock_ctx(ret_code=ret_code, cmd_key=cmd_key, cmd=['ssh']) SubProcPool._run_command_exit( ctx, callback=_test_callback, callback_255=_test_callback_255 @@ -293,9 +305,7 @@ def test__run_command_exit_no_255_callback(caplog, mock_ctx): def test__run_command_exit_no_gettable_platform(caplog, mock_ctx): """It logs being unable to select a platform""" ret_ctx = TaskJobLogsRetrieveContext( - platform_name='rhenas', - max_size=256, - key='rhenas' + platform_name='rhenas', max_size=256, key='rhenas' ) ctx = mock_ctx(cmd_key=ret_ctx, cmd=['ssh'], ret_code=255) SubProcPool._run_command_exit(ctx, callback=_test_callback) @@ -310,20 +320,19 @@ def test__run_command_exit_no_255_args(caplog, mock_ctx): mock_ctx(cmd=['ssh', 'Zaphod']), callback=_test_callback, callback_args=['Zaphod'], - callback_255=_test_callback_255 + callback_255=_test_callback_255, ) assert '255' in caplog.records[1].msg def test__run_command_exit_add_to_badhosts(mock_ctx): - """It updates the list of badhosts - """ + """It updates the list of badhosts""" badhosts = {'foo', 'bar'} SubProcPool._run_command_exit( mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=print, - callback_args=['Welcome to Magrathea'] + callback_args=['Welcome to Magrathea'], ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -335,32 +344,36 @@ def test__run_command_exit_add_to_badhosts_log(caplog, mock_ctx): mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=lambda x, t: print(str(x)), - callback_args=[TaskProxy( - Tokens('~u/w//c/t/2'), - SimpleNamespace( - name='t', dependencies={}, sequential='', - external_triggers=[], xtrig_labels={}, - expiration_offset=None, - outputs={ - TASK_OUTPUT_SUBMITTED: [None, None], - TASK_OUTPUT_SUBMIT_FAILED: [None, None], - TASK_OUTPUT_SUCCEEDED: [None, None], - TASK_OUTPUT_FAILED: [None, None], - TASK_OUTPUT_EXPIRED: [None, None], - }, - graph_children={}, rtconfig={'platform': 'foo'} - - ), - ISO8601Point('1990') - )] + callback_args=[ + TaskProxy( + Tokens('~u/w//c/t/2'), + SimpleNamespace( + name='t', + dependencies={}, + sequential='', + external_triggers=[], + xtrig_labels={}, + expiration_offset=None, + outputs={ + TASK_OUTPUT_SUBMITTED: [None, None], + TASK_OUTPUT_SUBMIT_FAILED: [None, None], + TASK_OUTPUT_SUCCEEDED: [None, None], + TASK_OUTPUT_FAILED: [None, None], + TASK_OUTPUT_EXPIRED: [None, None], + }, + graph_children={}, + rtconfig={'platform': 'foo'}, + ), + ISO8601Point('1990'), + ) + ], ) assert 'platform: foo' in caplog.records[0].message assert badhosts == {'foo', 'bar', 'mouse'} def test__run_command_exit_rsync_fails(mock_ctx): - """It updates the list of badhosts - """ + """It updates the list of badhosts""" badhosts = {'foo', 'bar'} ctx = mock_ctx(cmd=['rsync'], ret_code=42, cmd_key='file-install') SubProcPool._run_command_exit( @@ -371,10 +384,10 @@ def test__run_command_exit_rsync_fails(mock_ctx): { 'name': 'Magrathea', 'ssh command': 'ssh', - 'rsync command': 'rsync command' + 'rsync command': 'rsync command', }, 'Welcome to Magrathea', - ] + ], ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -384,12 +397,11 @@ def test__run_command_exit_rsync_fails(mock_ctx): [ (True, {'cmd': ['ssh'], 'ret_code': 255}), (False, {'cmd': ['foo'], 'ret_code': 255}), - (False, {'cmd': ['ssh'], 'ret_code': 42}) - ] + (False, {'cmd': ['ssh'], 'ret_code': 42}), + ], ) def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed - """ + """It knows when a ctx has failed""" output = SubProcPool.ssh_255_fail(mock_ctx(**ctx_kwargs)) assert output == expect @@ -401,11 +413,10 @@ def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): (True, {'cmd': ['rsync'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['make it-so'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['rsync'], 'ret_code': 125, 'host': 'localhost'}), - ] + ], ) def test_rsync_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed - """ + """It knows when a ctx has failed""" output = SubProcPool.rsync_255_fail( mock_ctx(**ctx_kwargs), {'ssh command': 'ssh', 'rsync command': 'rsync command'}, From 12c347d66778c9fd5cf703cbeb9bc64d2f481f9c Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 7 Apr 2025 13:54:58 +0100 Subject: [PATCH 04/47] Code review changes --- .mailmap | 1 + cylc/flow/cfgspec/globalcfg.py | 17 +- cylc/flow/etc/job.sh | 24 +- cylc/flow/job_file.py | 6 + cylc/flow/scripts/profiler.py | 249 ++++++++++----- tests/functional/jobscript/02-profiler.t | 45 --- .../jobscript/02-profiler/bin/foo.sh | 2 - .../jobscript/02-profiler/flow.cylc | 77 ----- .../jobscript/02-profiler/reference.log | 3 - tests/functional/jobscript/03-profiler.t | 82 +++++ .../jobscript/03-profiler/flow.cylc | 0 tests/functional/jobscript/04-profiler-e2e.t | 88 ++++++ .../jobscript/04-profiler-e2e/flow.cylc | 0 tests/unit/scripts/test_profiler.py | 293 ++++++++++++------ tests/unit/test_job_file.py | 4 + 15 files changed, 563 insertions(+), 328 deletions(-) delete mode 100644 tests/functional/jobscript/02-profiler.t delete mode 100755 tests/functional/jobscript/02-profiler/bin/foo.sh delete mode 100644 tests/functional/jobscript/02-profiler/flow.cylc delete mode 100644 tests/functional/jobscript/02-profiler/reference.log create mode 100644 tests/functional/jobscript/03-profiler.t create mode 100644 tests/functional/jobscript/03-profiler/flow.cylc create mode 100644 tests/functional/jobscript/04-profiler-e2e.t create mode 100644 tests/functional/jobscript/04-profiler-e2e/flow.cylc diff --git a/.mailmap b/.mailmap index aca6b1acd25..5f1a688518b 100644 --- a/.mailmap +++ b/.mailmap @@ -58,4 +58,5 @@ github-actions[bot] github-actions[bot] GitHub Action Diquan Jabbour <165976689+Diquan-BOM@users.noreply.github.com> Maxime Rio +Christopher Bennett ChrisPaulBennett Christopher Bennett christopher.bennett \ No newline at end of file diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 064e72da1b5..1565f986d87 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1479,12 +1479,25 @@ def default_for( ''') with Conf('profile'): - Conf('activate', VDR.V_BOOLEAN, True, desc=''' + Conf('activate', VDR.V_BOOLEAN, False, desc=''' A Boolean that sets if the cylc profiler will be used .. versionadded:: 8.0.0 ''') - Conf('cgroups path', VDR.V_STRING, '/sys/fs/cgroup') + Conf('cgroups path', VDR.V_STRING, + default='/sys/fs/cgroup', + desc=''' + The path to the cgroups filesystem. The default value + (/sys/fs/cgroup) is the standard location for cgroups on + linux and should work in most circumstances''') + Conf('polling interval', VDR.V_INTEGER, + default=10, + desc=''' + The interval (in seconds) at which the profiler will + poll the cgroups filesystem for resource usage data. + The default value of 10 seconds should be sufficient for + most use cases, but can be adjusted as needed. + ''') Conf('job runner', VDR.V_STRING, 'background', desc=f''' The system used to run jobs on the platform. diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index a4747579538..16849e9f4b2 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -141,7 +141,7 @@ cylc__job__main() { cd "${CYLC_TASK_WORK_DIR}" if [[ "${CYLC_PROFILE}" == "True" ]] ; then - cylc profile & + cylc profile -m "${CYLC_CGROUP}" -i "${CYLC_POLLING_INTERVAL}" & export profiler_pid="$!" fi @@ -163,20 +163,12 @@ cylc__job__main() { cylc__set_return "$ret_code" fi } - # Grab the max rss and cpu_time value before moving directory - if [[ -f "max_rss" ]]; then - max_rss=$(sed -n '1p' max_rss) - rm max_rss - fi - if [[ -f "cpu_time" ]]; then - cpu_time=$(sed -n '1p' cpu_time) - rm cpu_time - fi + # Grab the max rss and cpu_time and clean up before changing directory + cylc__kill_profiler # Empty work directory remove cd rmdir "${CYLC_TASK_WORK_DIR}" 2>'/dev/null' || true # Send task succeeded message - cylc__kill_profiler wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true @@ -208,13 +200,7 @@ cylc__set_return() { ############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { - if [[ -n "${cpu_time:-}" ]]; then - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time" || true - fi - if [[ -n "${max_rss:-}" ]]; then - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: max_rss $max_rss" || true - fi - if [[ -f "proc/${profiler_pid}" ]]; then + if [[ -n "${profiler_pid:-}" ]] && ps -p "$profiler_pid" > /dev/null; then kill -s SIGINT "${profiler_pid}" || true fi } @@ -300,7 +286,9 @@ cylc__job_finish_err() { # (Ignore shellcheck "globbing and word splitting" warning here). # shellcheck disable=SC2086 trap '' ${CYLC_VACATION_SIGNALS:-} ${CYLC_FAIL_SIGNALS} + cylc__kill_profiler + if [[ -n "${CYLC_TASK_MESSAGE_STARTED_PID:-}" ]]; then wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true fi diff --git a/cylc/flow/job_file.py b/cylc/flow/job_file.py index 489b3f99c47..cc44aa42e03 100644 --- a/cylc/flow/job_file.py +++ b/cylc/flow/job_file.py @@ -228,6 +228,12 @@ def _write_task_environment(self, handle, job_conf): handle.write( "\n export CYLC_PROFILE=" f"{job_conf['platform']['profile']['activate']}") + handle.write( + "\n export CYLC_CGROUP=" + f"{job_conf['platform']['profile']['cgroups path']}") + handle.write( + "\n export CYLC_POLLING_INTERVAL=" + f"{job_conf['platform']['profile']['polling interval']}") # Standard parameter environment variables for var, val in job_conf['param_var'].items(): handle.write('\n export CYLC_TASK_PARAM_%s="%s"' % (var, val)) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index b31c564638f..5e3b4b1e89b 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -25,98 +25,151 @@ import sys import time import signal +import asyncio + from pathlib import Path +from functools import partial from dataclasses import dataclass -from cylc.flow.terminal import cli_function + +from cylc.flow.network.client_factory import get_client from cylc.flow.option_parsers import CylcOptionParser as COP +from cylc.flow.terminal import cli_function + -INTERNAL = True PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") RE_INT = re.compile(r'\d+') -def get_option_parser() -> COP: - parser = COP( - __doc__, - argdoc=[ - ], - ) - parser.add_option( - "-i", type=int, help="interval between query cycles in seconds", - default=10, dest="delay") - parser.add_option( - "-m", type=str, help="Location of cgroups directory", - default="/sys/fs/cgroup", - dest="cgroup_location") - - return parser - - -@cli_function(get_option_parser) -def main(parser: COP, options) -> None: - """CLI main.""" - # Register the stop_profiler function with the signal library - signal.signal(signal.SIGINT, stop_profiler) - signal.signal(signal.SIGHUP, stop_profiler) - signal.signal(signal.SIGTERM, stop_profiler) - - get_config(options) - - @dataclass class Process: """Class for representing CPU and Memory usage of a process""" cgroup_memory_path: str cgroup_cpu_path: str + memory_allocated_path: str + cgroup_version: int -def stop_profiler(*args): +def stop_profiler(process, comms_timeout, *args): """This function will be executed when the SIGINT signal is sent to this process""" - print('profiler exited') + + max_rss, cpu_time, memory_allocated = get_resource_usage(process) + + graphql_mutation = """ + mutation($WORKFLOWS: [WorkflowID]!, + $MESSAGES: [[String]], $JOB: String!, $TIME: String) { + message(workflows: $WORKFLOWS, messages:$MESSAGES, + taskJob:$JOB, eventTime:$TIME) { + result + } + } + """ + + graphql_request_variables = { + "WORKFLOWS": [os.environ.get('CYLC_WORKFLOW_ID')], + "MESSAGES": [[ + "DEBUG", + f"cpu_time {cpu_time} " + f"max_rss {max_rss} " + f"mem_alloc {memory_allocated}"]], + "JOB": os.environ.get('CYLC_TASK_JOB'), + "TIME": "now" + } + + pclient = get_client(os.environ.get('CYLC_WORKFLOW_ID'), + timeout=comms_timeout) + + async def send_cylc_message(): + await pclient.async_request( + 'graphql', + {'request_string': graphql_mutation, + 'variables': graphql_request_variables}, + ) + + asyncio.run(send_cylc_message()) sys.exit(0) -def parse_memory_file(cgroup_memory_path): - """Open the memory stat file and copy the appropriate data""" +def get_resource_usage(process): + # If a task fails instantly, or finishes very quickly (< 1 second), + # the get config function doesn't have time to run + if (process.cgroup_memory_path is None + or process.cgroup_cpu_path is None + or process.memory_allocated_path is None): + return 0, 0, 0 + max_rss = parse_memory_file(process) + cpu_time = parse_cpu_file(process) + memory_allocated = parse_memory_allocated(process) + return max_rss, cpu_time, memory_allocated + - with open(cgroup_memory_path, 'r') as f: - for line in f: - return int(line) // 1024 +def parse_memory_file(process: Process): + """Open the memory stat file and copy the appropriate data""" + if process.cgroup_version == 2: + with open(process.cgroup_memory_path, 'r') as f: + for line in f: + if "anon" in line: + return int(''.join(filter(str.isdigit, line))) + else: + with open(process.cgroup_memory_path, 'r') as f: + for line in f: + if "total_rss" in line: + return int(''.join(filter(str.isdigit, line))) -def parse_cpu_file(cgroup_cpu_path, cgroup_version): - """Open the memory stat file and return the appropriate data""" - if cgroup_version == 1: - with open(cgroup_cpu_path, 'r') as f: +def parse_memory_allocated(process: Process) -> int: + """Open the memory stat file and copy the appropriate data""" + if process.cgroup_version == 2: + cgroup_memory_path = Path(process.memory_allocated_path) + + for i in range(5): + with open(cgroup_memory_path / "memory.max", 'r') as f: + line = f.readline() + if "max" not in line: + return int(line) + cgroup_memory_path = cgroup_memory_path.parent + if i == 5: + break + elif process.cgroup_version == 1: + return 0 # Memory limit not tracked for cgroups v1 + + raise FileNotFoundError("Could not find memory.max file") + + +def parse_cpu_file(process: Process) -> int: + """Open the CPU stat file and return the appropriate data""" + if process.cgroup_version == 2: + with open(process.cgroup_cpu_path, 'r') as f: for line in f: if "usage_usec" in line: return int(RE_INT.findall(line)[0]) // 1000 - elif cgroup_version == 2: - with open(cgroup_cpu_path, 'r') as f: + raise ValueError("Unable to find cpu usage data") + else: + with open(process.cgroup_cpu_path, 'r') as f: for line in f: - # Cgroups v2 uses nanoseconds - return int(line) / 1000000 - - -def write_data(data, filename): - with open(filename, 'w') as f: - f.write(data + "\n") + # Cgroups v1 uses nanoseconds + return int(line) // 1000000 + raise ValueError("Unable to find cpu usage data") def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: - # HPC uses cgroups v2 and SPICE uses cgroups v1 if Path.exists(Path(cgroup_location + cgroup_name)): - return 1 - elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): return 2 + elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): + return 1 else: - raise FileNotFoundError("Cgroup not found") + raise FileNotFoundError("Cgroup not found at " + + cgroup_location + cgroup_name) def get_cgroup_name(): """Get the cgroup directory for the current process""" + + # fugly hack to allow functional tests to use test data + if 'profiler_test_env_var' in os.environ: + return os.getenv('profiler_test_env_var') + # Get the PID of the current process pid = os.getpid() try: @@ -133,55 +186,79 @@ def get_cgroup_name(): raise AttributeError("No cgroup found for process:", pid) from err -def get_cgroup_paths(version, location, name): - - if version == 1: +def get_cgroup_paths(location) -> Process: + cgroup_name = get_cgroup_name() + cgroup_version = get_cgroup_version(location, cgroup_name) + if cgroup_version == 2: return Process( cgroup_memory_path=location + - name + "/" + "memory.peak", + cgroup_name + "/" + "memory.stat", cgroup_cpu_path=location + - name + "/" + "cpu.stat") + cgroup_name + "/" + "cpu.stat", + memory_allocated_path=location + cgroup_name, + cgroup_version=cgroup_version, + ) - elif version == 2: + elif cgroup_version == 1: return Process( - cgroup_memory_path=location + "/memory" + - name + "/memory.max_usage_in_bytes", - cgroup_cpu_path=location + "/cpu" + - name + "/cpuacct.usage") + cgroup_memory_path=location + "memory/" + + cgroup_name + "/memory.stat", + cgroup_cpu_path=location + "cpu/" + + cgroup_name + "/cpuacct.usage", + memory_allocated_path="", + cgroup_version=cgroup_version, + ) + + raise ValueError("Unable to determine cgroup version") -def profile(process, version, delay, keep_looping=lambda: True): +def profile(_process: Process, delay, keep_looping=lambda: True): # The infinite loop that will constantly poll the cgroup # The lambda function is used to allow the loop to be stopped in unit tests - peak_memory = 0 + while keep_looping(): # Write cpu / memory usage data to disk - cpu_time = parse_cpu_file(process.cgroup_cpu_path, version) - write_data(str(cpu_time), "cpu_time") + # CPU_TIME = parse_cpu_file(process.cgroup_cpu_path, version) + time.sleep(delay) - memory = parse_memory_file(process.cgroup_memory_path) - # Only save Max RSS to disk if it is above the previous value - if memory > peak_memory: - peak_memory = memory - write_data(str(peak_memory), "max_rss") - time.sleep(delay) +def get_option_parser() -> COP: + parser = COP( + __doc__, + comms=True, + argdoc=[ + ], + ) + parser.add_option( + "-i", type=int, + help="interval between query cycles in seconds", dest="delay") + parser.add_option( + "-m", type=str, help="Location of cgroups directory", + dest="cgroup_location") + return parser -def get_config(args): - # Find the cgroup that this process is running in. - # Cylc will put this profiler in the same cgroup - # as the job it is profiling - cgroup_name = get_cgroup_name() - cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) - process = get_cgroup_paths(cgroup_version, - args.cgroups_location, - cgroup_name) - profile(process, cgroup_version, args.delay) +@cli_function(get_option_parser) +def main(_parser: COP, options) -> None: + """CLI main.""" + _main(options) -if __name__ == "__main__": +def _main(options) -> None: + # get cgroup information + process = get_cgroup_paths(options.cgroup_location) + + # Register the stop_profiler function with the signal library + _stop_profiler = partial(stop_profiler, process, options.comms_timeout) + signal.signal(signal.SIGINT, _stop_profiler) + signal.signal(signal.SIGHUP, _stop_profiler) + signal.signal(signal.SIGTERM, _stop_profiler) + + # run profiler run + profile(process, options.delay) + +if __name__ == "__main__": arg_parser = get_option_parser() - get_config(arg_parser.parse_args([])) + _main(arg_parser.parse_args([])) diff --git a/tests/functional/jobscript/02-profiler.t b/tests/functional/jobscript/02-profiler.t deleted file mode 100644 index e43705d4720..00000000000 --- a/tests/functional/jobscript/02-profiler.t +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash -# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. -# Copyright (C) NIWA & British Crown (Met Office) & Contributors. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -#------------------------------------------------------------------------------- -# cylc profile test -. "$(dirname "$0")/test_header" -#------------------------------------------------------------------------------- -set_test_number 2 -#------------------------------------------------------------------------------- -install_workflow "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" -#------------------------------------------------------------------------------- -TEST_NAME="${TEST_NAME_BASE}-validate" -run_ok "${TEST_NAME}" cylc validate "${WORKFLOW_NAME}" - -if [[ -n "${PYTHONPATH:-}" ]]; then - export PYTHONPATH="${PWD}/lib:${PYTHONPATH}" -else - export PYTHONPATH="${PWD}/lib" -fi - -export PATH_TO_CYLC_BIN="/path/to/cylc/bin" -create_test_global_config ' -[platforms] - [[localhost]] - [[[profile]]] - activate = true -' -#------------------------------------------------------------------------------- -TEST_NAME="${TEST_NAME_BASE}-run" -workflow_run_ok "${TEST_NAME}" cylc play --reference-test --debug --no-detach "${WORKFLOW_NAME}" - -purge diff --git a/tests/functional/jobscript/02-profiler/bin/foo.sh b/tests/functional/jobscript/02-profiler/bin/foo.sh deleted file mode 100755 index 4b20577c0d0..00000000000 --- a/tests/functional/jobscript/02-profiler/bin/foo.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -echo "Hello from $0" diff --git a/tests/functional/jobscript/02-profiler/flow.cylc b/tests/functional/jobscript/02-profiler/flow.cylc deleted file mode 100644 index 951a5b94d20..00000000000 --- a/tests/functional/jobscript/02-profiler/flow.cylc +++ /dev/null @@ -1,77 +0,0 @@ -[meta] - title = "job script torture test" - - description = """Any task job script may fail regardless of user runtime -settings if changes to cylc re-order the job script sections badly: e.g. -"cylc task started" must be called after the CYLC_ environment variables -are exported. Additionally, users may rely on the order of variable -definition in each environment and script section: e.g. workflow -bin directory must go in PATH before the task runtime environment is -defined because workflow bin commands could be used in variable assignment -expressions.""" - -[scheduling] - [[graph]] - R1 = "foo" -[runtime] - [[foo]] - platform = localhost - init-script = """ -echo "HELLO FROM INIT-SCRIPT" -# define a variable -export VAR_IS=is""" - pre-script = """ -echo "HELLO FROM PRE-SCRIPT" -# init-script must be done: -echo VAR_IS is $VAR_IS -# user environment must be done: -echo E_ONE is $E_ONE -echo E_TWO is $E_TWO -echo E_THR is $E_THR -echo E_FOU is $E_FOU -echo E_FIV is $E_FIV -# define a variable -export VAR_PreCS=precs""" - script = """ -echo "HELLO FROM SCRIPT" -# init-script must be done: -echo VAR_IS is $VAR_IS -# pre-script must be done: -echo VAR_PreCS is $VAR_PreCS -# environment must be done: -echo E_ONE is $E_ONE -echo E_TWO is $E_TWO -echo E_THR is $E_THR -echo E_FOU is $E_FOU -echo E_FIV is $E_FIV -# define a variable -export VAR_CS=var_cs""" - post-script = """ -echo "HELLO FROM POST-SCRIPT" -# init-script must be done: -echo VAR_IS is $VAR_IS -# pre-script must be done: -echo VAR_PreCS is $VAR_PreCS -# script must be done: -echo VAR_CS is $VAR_CS -# environment must be done: -echo E_ONE is $E_ONE -echo E_TWO is $E_TWO -echo E_THR is $E_THR -echo E_FOU is $E_FOU -echo E_FIV is $E_FIV -echo VAR_IS is $VAR_IS -echo VAR_PreCS is $VAR_PreCS -echo VAR_CS is $VAR_CS -# define a variable -export VAR_PostCS=postcs""" - [[[environment]]] - # path to cylc must be available: - E_ONE = $(( RANDOM % 10 )) - # init-script must be done: - E_TWO = $VAR_IS - # cylc-defined variables must be done: - E_THR = $CYLC_WORKFLOW_SHARE_DIR - E_FOU = $CYLC_TASK_NAME - # the workflow bin must be in $PATH already: - E_FIV = $( foo.sh ) diff --git a/tests/functional/jobscript/02-profiler/reference.log b/tests/functional/jobscript/02-profiler/reference.log deleted file mode 100644 index 08fe5d5558a..00000000000 --- a/tests/functional/jobscript/02-profiler/reference.log +++ /dev/null @@ -1,3 +0,0 @@ -Initial point: 1 -Final point: 1 -1/foo -triggered off [] diff --git a/tests/functional/jobscript/03-profiler.t b/tests/functional/jobscript/03-profiler.t new file mode 100644 index 00000000000..2a910e6705c --- /dev/null +++ b/tests/functional/jobscript/03-profiler.t @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +#------------------------------------------------------------------------------- +# Cylc profile test +# NOTE: This test will run the Cylc profiler on the given test platform. +# The test platform may need to be configured for this to work (e.g. +# "cgroups path" may need to be set). +export REQUIRE_PLATFORM='runner:?(pbs|slurm) comms:tcp' +. "$(dirname "$0")/test_header" +set_test_number 8 + +create_test_global_config " +[platforms] + [[${CYLC_TEST_PLATFORM}]] + [[[profile]]] + activate = True + polling interval = 10 + [[localhost]] + [[[profile]]] + activate = True + polling interval = 10 + cgroups path = the/thing/that/should/not/be +" + +init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' +#!Jinja2 + +[scheduling] + [[graph]] + R1 = the_good & the_bad? & the_ugly + +[runtime] + [[the_good]] + # this task should succeeded normally + platform = {{ environ['CYLC_TEST_PLATFORM'] }} + script = sleep 1 + [[the_bad]] + # this task should fail (it should still send profiling info) + platform = {{ environ['CYLC_TEST_PLATFORM'] }} + script = sleep 5; false + [[the_ugly]] + # this task should succeed despite the broken profiler configuration + platform = localhost + script = sleep 5 +__FLOW_CONFIG__ + +run_ok "${TEST_NAME_BASE}-validate" cylc validate "${WORKFLOW_NAME}" +workflow_run_ok "${TEST_NAME_BASE}-run" cylc play --debug --no-detach "${WORKFLOW_NAME}" + +# ensure the cpu and memory messages were received and that these messages +# were received before the succeeded message +log_scan "${TEST_NAME_BASE}-task-succeeded" \ + "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ + '1/the_good.*(received)cpu_time.*max_rss*' \ + '1/the_good.*(received)succeeded' + +# ensure the cpu and memory messages were received and that these messages +# were received before the failed message +log_scan "${TEST_NAME_BASE}-task-succeeded" \ + "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ + '1/the_bad.*(received)cpu_time.*max_rss*' \ + '1/the_bad.*(received)failed' + +# ensure this task succeeded despite the broken profiler configuration +grep_workflow_log_ok "${TEST_NAME_BASE}-broken" '1/the_ugly.*(received)succeeded' +grep_ok 'FileNotFoundError: Cgroup not found' "$(cylc cat-log "${WORKFLOW_NAME}//1/the_ugly" -f e -m p)" + +purge diff --git a/tests/functional/jobscript/03-profiler/flow.cylc b/tests/functional/jobscript/03-profiler/flow.cylc new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t new file mode 100644 index 00000000000..82436a8e214 --- /dev/null +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +#------------------------------------------------------------------------------- +# Cylc profile test +# NOTE: This test will run the Cylc profiler on the given test platform. +# The test platform may need to be configured for this to work (e.g. +# "cgroups path" may need to be set). + +. "$(dirname "$0")/test_header" + +if [[ "$OSTYPE" != "linux-gnu"* ]]; then + skip_all "Tests not compatibile with $OSTYPE" +fi + +set_test_number 7 + +mkdir -p "${PWD}/cgroups_test_data" + +echo '12345678' > cgroups_test_data/memory.stat +echo '123456789' > cgroups_test_data/memory.max +printf "blah blah 123456\nusage_usec 56781234" > cgroups_test_data/cpu.stat + +export profiler_test_env_var='/cgroups_test_data' +create_test_global_config " +[platforms] + [[localhost]] + [[[profile]]] + activate = True + cgroups path = ${PWD} +" + +init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' +#!Jinja2 + +[scheduling] + [[graph]] + R1 = the_good & the_bad? & the_ugly + +[runtime] + [[the_good]] + # this task should succeeded normally + platform = localhost + script = sleep 5 + [[the_bad]] + # this task should fail (it should still send profiling info) + platform = localhost + script = sleep 5; false + [[the_ugly]] + # this task should succeed despite the broken profiler configuration + platform = localhost + script = sleep 1 +__FLOW_CONFIG__ + +run_ok "${TEST_NAME_BASE}-validate" cylc validate "${WORKFLOW_NAME}" +workflow_run_ok "${TEST_NAME_BASE}-run" cylc play --debug --no-detach "${WORKFLOW_NAME}" + +# ensure the cpu and memory messages were received and that these messages +# were received before the succeeded message +log_scan "${TEST_NAME_BASE}-task-succeeded" \ + "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ + '1/the_good.*(received)cpu_time.*max_rss*' \ + '1/the_good.*(received)succeeded' + +# ensure the cpu and memory messages were received and that these messages +# were received before the failed message +log_scan "${TEST_NAME_BASE}-task-succeeded" \ + "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ + '1/the_bad.*(received)cpu_time.*max_rss*' \ + '1/the_bad.*failed' + +# ensure this task succeeded despite the broken profiler configuration +grep_workflow_log_ok "${TEST_NAME_BASE}-broken" '1/the_ugly.*(received)succeeded' + +purge diff --git a/tests/functional/jobscript/04-profiler-e2e/flow.cylc b/tests/functional/jobscript/04-profiler-e2e/flow.cylc new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index e0df1c9f881..10e6f7df3fb 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -17,56 +17,145 @@ # Tests for functions contained in cylc.flow.scripts.profiler from cylc.flow.scripts.profiler import (parse_memory_file, parse_cpu_file, - write_data, + parse_memory_allocated, get_cgroup_name, get_cgroup_version, get_cgroup_paths, + get_resource_usage, stop_profiler, - profile) + profile, + Process) import pytest from unittest import mock -def test_parse_memory_file(mocker): +def test_stop_profiler(mocker, monkeypatch, tmpdir): + monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") - with pytest.raises(FileNotFoundError): - parse_memory_file("non_existent_file.txt") + def mock_get_client(env_var, timeout=None): + return True - # Mock the 'open' function call to return a file object. - mock_file = mocker.mock_open(read_data="1024") - mocker.patch("builtins.open", mock_file) + class MockedClient(): + def __init__(self, *a, **k): + pass - # Test the parse_memory_file function - assert parse_memory_file("mocked_file.txt") == 1 + async def async_request(self, *a, **k): + pass - # Assert that the 'open' function was called with the expected arguments. - mock_file.assert_called_once_with("mocked_file.txt", "r") + mocker.patch("cylc.flow.scripts.profiler.get_client", MockedClient) + mem_file = tmpdir.join("memory_file.txt") + mem_file.write('1234') + cpu_file = tmpdir.join("cpu_file.txt") + cpu_file.write('5678') + mem_allocated_file = tmpdir.join("memory_allocated.txt") + mem_allocated_file.write('99999') -def test_parse_cpu_file(mocker): - with pytest.raises(FileNotFoundError): - parse_cpu_file("non_existent_file.txt", 1) + process_object = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file, + memory_allocated_path=mem_allocated_file, + cgroup_version=1) + with pytest.raises(SystemExit) as excinfo: + stop_profiler(process_object, 1) - # Mock the 'open' function call to return a file object. - mock_file = mocker.mock_open(read_data="usage_usec 1000000") - mocker.patch("builtins.open", mock_file) + assert excinfo.type == SystemExit + assert excinfo.value.code == 0 - assert parse_cpu_file( - "mocked_file.txt", 1) == 1000 - mock_file.assert_called_once_with("mocked_file.txt", "r") - mock_file = mocker.mock_open(read_data="1000000") - mocker.patch("builtins.open", mock_file) - assert parse_cpu_file("mocked_file.txt", 2) == 1 - mock_file.assert_called_once_with("mocked_file.txt", "r") +def test_get_resource_usage(mocker, monkeypatch, tmpdir): + monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") + + process_object = Process( + cgroup_memory_path=None, + cgroup_cpu_path=None, + memory_allocated_path=None, + cgroup_version=1) + + max_rss, cpu_time, memory_allocated = get_resource_usage(process_object) + + assert max_rss == 0 + assert cpu_time == 0 + assert memory_allocated == 0 + + +def test_parse_memory_file(mocker, tmpdir): + mem_file_v1 = tmpdir.join("memory_file_v1.txt") + mem_file_v1.write('total_rss=1024') + mem_file_v2 = tmpdir.join("memory_file_v2.txt") + mem_file_v2.write('anon=666') + cpu_file = tmpdir.join("cpu_file.txt") + cpu_file.write('5678') + mem_allocated_file = tmpdir.join("memory_allocated.txt") + mem_allocated_file.write('99999') -def test_write_data(tmpdir): - # Create tmp file - file = tmpdir.join('output.txt') + good_process_object_v1 = Process( + cgroup_memory_path=mem_file_v1, + cgroup_cpu_path=cpu_file, + memory_allocated_path=mem_allocated_file, + cgroup_version=1) + good_process_object_v2 = Process( + cgroup_memory_path=mem_file_v2, + cgroup_cpu_path=cpu_file, + memory_allocated_path=mem_allocated_file, + cgroup_version=2) + bad_process_object = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path='', + cgroup_version=1) - write_data('test_data', file.strpath) - assert file.read() == 'test_data\n' + with pytest.raises(FileNotFoundError): + parse_memory_file(bad_process_object) + + # Test the parse_memory_file function + assert parse_memory_file(good_process_object_v1) == 1024 + assert parse_memory_file(good_process_object_v2) == 666 + + +def test_parse_cpu_file(mocker, tmpdir): + + mem_file = tmpdir.join("memory_file.txt") + mem_file.write('1024') + cpu_file_v1 = tmpdir.join("cpu_file_v1.txt") + cpu_file_v1.write('1234567890') + cpu_file_v2 = tmpdir.join("cpu_file_v2.txt") + cpu_file_v2.write('usage_usec=1234567890') + cpu_file_v2_bad = tmpdir.join("cpu_file_v2_bad.txt") + cpu_file_v2_bad.write('Give me fuel, give me fire, ' + 'give me that which I desire') + mem_allocated_file = tmpdir.join("memory_allocated.txt") + mem_allocated_file.write('99999') + + good_process_object_v1 = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file_v1, + memory_allocated_path=mem_allocated_file, + cgroup_version=1) + good_process_object_v2 = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file_v2, + memory_allocated_path=mem_allocated_file, + cgroup_version=2) + bad_process_object = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path='', + cgroup_version=1) + bad_process_object_v2 = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file_v2_bad, + memory_allocated_path=mem_allocated_file, + cgroup_version=2) + + assert parse_cpu_file(good_process_object_v1) == 1234 + assert parse_cpu_file(good_process_object_v2) == 1234567 + + with pytest.raises(FileNotFoundError): + parse_cpu_file(bad_process_object) + with pytest.raises(ValueError): + parse_cpu_file(bad_process_object_v2) def test_get_cgroup_name(mocker): @@ -81,6 +170,35 @@ def test_get_cgroup_name(mocker): assert get_cgroup_name() == "good/cgroup/place/2222222" +def test_parse_memory_allocated(mocker, tmpdir): + mem_allocated_file = tmpdir.join("memory.max") + mem_allocated_file.write('99999') + + # We currently do not track memory allocated for cgroups v1 + good_process_object_v1 = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path=tmpdir, + cgroup_version=1) + + good_process_object_v2 = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path=tmpdir, + cgroup_version=2) + + bad_process_object_v2 = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path='/', + cgroup_version=2) + + assert parse_memory_allocated(good_process_object_v1) == 0 + assert parse_memory_allocated(good_process_object_v2) == 99999 + with pytest.raises(FileNotFoundError): + parse_memory_file(bad_process_object_v2) + + def test_get_cgroup_name_file_not_found(mocker): def mock_os_pid(): @@ -96,11 +214,11 @@ def test_get_cgroup_version(mocker): # Mock the Path.exists function call to return True mocker.patch("pathlib.Path.exists", return_value=True) assert get_cgroup_version('stuff/in/place', - 'more_stuff') == 1 + 'more_stuff') == 2 with mock.patch('pathlib.Path.exists', side_effect=[False, True]): assert get_cgroup_version('stuff/in/place', - 'more_stuff') == 2 + 'more_stuff') == 1 # Mock the Path.exists function call to return False mocker.patch("pathlib.Path.exists", return_value=False) @@ -109,88 +227,73 @@ def test_get_cgroup_version(mocker): 'things') -def test_get_cgroup_paths(): - - process = get_cgroup_paths(1, "test_location/", - "test_name") - assert process.cgroup_memory_path == "test_location/test_name/memory.peak" +def test_get_cgroup_paths(mocker): + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", + return_value='test_name') + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", + return_value=2) + process = get_cgroup_paths("test_location/") + assert process.cgroup_memory_path == "test_location/test_name/memory.stat" assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" - process = get_cgroup_paths(2, "test_location", - "/test_name") + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", + return_value='test_name') + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", + return_value=1) + + process = get_cgroup_paths("test_location/") assert (process.cgroup_memory_path == - "test_location/memory/test_name/memory.max_usage_in_bytes") + "test_location/memory/test_name/memory.stat") assert (process.cgroup_cpu_path == "test_location/cpu/test_name/cpuacct.usage") - -def test_profile_cpu(mocker): - process = get_cgroup_paths(1, "test_location/", - "test_name") - - mock_file = mocker.mock_open(read_data="") - mocker.patch("builtins.open", mock_file) - mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", - return_value=0) - mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", - return_value=2048) - run_once = mock.Mock(side_effect=[True, False]) - profile(process, 1, 1, run_once) - mock_file.assert_called_with("cpu_time", "w") - - -def test_stop_profiler(): - with pytest.raises(SystemExit) as pytest_wrapped_e: - stop_profiler() - assert pytest_wrapped_e.type == SystemExit - assert pytest_wrapped_e.value.code == 0 + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", + return_value='test_name') + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", + return_value=3) + with pytest.raises(ValueError): + get_cgroup_paths("test_location/") -def test_profile_max_rss(mocker): - process = get_cgroup_paths(1, - "test_location/", - "test_name") +def test_profile_data(mocker): + # This test should run without error + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", + return_value='test_name') + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", + return_value=2) + process = get_cgroup_paths("test_location/") mock_file = mocker.mock_open(read_data="") mocker.patch("builtins.open", mock_file) mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", - return_value=1024) + return_value=0) mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) run_once = mock.Mock(side_effect=[True, False]) - profile(process, 1, 1, run_once) - mock_file.assert_called_with("max_rss", "w") + profile(process, 1, run_once) -def test_profile_1(mocker): - process = get_cgroup_paths( - 1, "test_location/", "test_name") +@pytest.fixture +def options(mocker): + opts = mocker.Mock() + opts.cgroup_location = "/fake/path" + opts.comms_timeout = 10 + opts.delay = 1 + return opts - mock_file = mocker.mock_open(read_data="") - mocker.patch("builtins.open", mock_file) - mocker.patch( - "cylc.flow.scripts.profiler.parse_memory_file", return_value=1024) - mocker.patch( - "cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) - run_once = mock.Mock(side_effect=[True, False]) - - profile(process, 1, 1, run_once) - mock_file.assert_called_with("max_rss", "w") +def test_main(mocker, options): + mock_get_cgroup_paths = mocker.patch( + "cylc.flow.scripts.profiler.get_cgroup_paths" + ) + mock_signal = mocker.patch("cylc.flow.scripts.profiler.signal.signal") + mock_profile = mocker.patch("cylc.flow.scripts.profiler.profile") -def test_profile_2(mocker): - # assert_called_with only shows the last call to open(). - # Setting peak memory to zero stops the memory call to open - process = get_cgroup_paths( - 1, "test_location/", "test_name") + mock_get_cgroup_paths.return_value = mocker.Mock() - mock_file = mocker.mock_open(read_data="") - mocker.patch("builtins.open", mock_file) - mocker.patch( - "cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) - mocker.patch( - "cylc.flow.scripts.profiler.parse_memory_file", return_value=0) - run_once = mock.Mock(side_effect=[True, False]) + from cylc.flow.scripts.profiler import _main + _main(options) - profile(process, 1, 1, run_once) - mock_file.assert_called_with("cpu_time", "w") + mock_get_cgroup_paths.assert_called_once_with("/fake/path") + assert mock_signal.call_count == 3 + mock_profile.assert_called_once() diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 617d02fc1e1..3ccd8d2923c 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -400,6 +400,8 @@ def test_write_task_environment(): 'CYLC_TASK_TRY_NUMBER=1\n export ' 'CYLC_TASK_FLOW_NUMBERS=1\n export ' 'CYLC_PROFILE=true\n export ' + 'CYLC_CGROUP=exit_light\n export ' + 'CYLC_POLLING_INTERVAL=1\n export ' 'CYLC_TASK_PARAM_duck="quack"\n export ' 'CYLC_TASK_PARAM_mouse="squeak"\n ' 'CYLC_TASK_WORK_DIR_BASE=\'farm_noises/work_d\'\n}') @@ -408,6 +410,8 @@ def test_write_task_environment(): 'communication method': 'ssh', 'profile': { "activate": "true", + "cgroups path": 'exit_light', + "polling interval": 1 } }, "job_d": "1/moo/01", From e793c002e1eeb5147350646d55e2df03b60bf198 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 19 Nov 2025 14:41:11 +0000 Subject: [PATCH 05/47] Adding test coverage --- cylc/flow/scripts/profiler.py | 21 ++++---- tests/unit/scripts/test_profiler.py | 76 +++++++++++++++++++++++------ 2 files changed, 70 insertions(+), 27 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 5e3b4b1e89b..8e2bc5c52d6 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -122,20 +122,15 @@ def parse_memory_allocated(process: Process) -> int: """Open the memory stat file and copy the appropriate data""" if process.cgroup_version == 2: cgroup_memory_path = Path(process.memory_allocated_path) - for i in range(5): with open(cgroup_memory_path / "memory.max", 'r') as f: line = f.readline() if "max" not in line: return int(line) cgroup_memory_path = cgroup_memory_path.parent - if i == 5: - break - elif process.cgroup_version == 1: - return 0 # Memory limit not tracked for cgroups v1 - - raise FileNotFoundError("Could not find memory.max file") - + return 0 + else : # Memory limit not tracked for cgroups v1 + return 0 def parse_cpu_file(process: Process) -> int: """Open the CPU stat file and return the appropriate data""" @@ -147,10 +142,12 @@ def parse_cpu_file(process: Process) -> int: raise ValueError("Unable to find cpu usage data") else: with open(process.cgroup_cpu_path, 'r') as f: - for line in f: - # Cgroups v1 uses nanoseconds - return int(line) // 1000000 - raise ValueError("Unable to find cpu usage data") + try: + for line in f: + # Cgroups v1 uses nanoseconds + return int(line) // 1000000 + except ValueError: + raise ValueError("Unable to find cpu usage data") def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 10e6f7df3fb..6ab52a33143 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -118,10 +118,12 @@ def test_parse_cpu_file(mocker, tmpdir): mem_file = tmpdir.join("memory_file.txt") mem_file.write('1024') - cpu_file_v1 = tmpdir.join("cpu_file_v1.txt") - cpu_file_v1.write('1234567890') - cpu_file_v2 = tmpdir.join("cpu_file_v2.txt") - cpu_file_v2.write('usage_usec=1234567890') + cpu_file_v1_good = tmpdir.join("cpu_file_v1_good.txt") + cpu_file_v1_good.write('1234567890') + cpu_file_v1_bad = tmpdir.join("cpu_file_v1_bad.txt") + cpu_file_v1_bad.write("I'm your dream, mind ashtray") + cpu_file_v2_good = tmpdir.join("cpu_file_v2_good.txt") + cpu_file_v2_good.write('usage_usec=1234567890') cpu_file_v2_bad = tmpdir.join("cpu_file_v2_bad.txt") cpu_file_v2_bad.write('Give me fuel, give me fire, ' 'give me that which I desire') @@ -130,19 +132,24 @@ def test_parse_cpu_file(mocker, tmpdir): good_process_object_v1 = Process( cgroup_memory_path=mem_file, - cgroup_cpu_path=cpu_file_v1, + cgroup_cpu_path=cpu_file_v1_good, memory_allocated_path=mem_allocated_file, cgroup_version=1) good_process_object_v2 = Process( cgroup_memory_path=mem_file, - cgroup_cpu_path=cpu_file_v2, + cgroup_cpu_path=cpu_file_v2_good, memory_allocated_path=mem_allocated_file, cgroup_version=2) - bad_process_object = Process( + bad_process_object_v1_1 = Process( cgroup_memory_path='', cgroup_cpu_path='', memory_allocated_path='', cgroup_version=1) + bad_process_object_v1_2 = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file_v1_bad, + memory_allocated_path=mem_allocated_file, + cgroup_version=1) bad_process_object_v2 = Process( cgroup_memory_path=mem_file, cgroup_cpu_path=cpu_file_v2_bad, @@ -153,7 +160,9 @@ def test_parse_cpu_file(mocker, tmpdir): assert parse_cpu_file(good_process_object_v2) == 1234567 with pytest.raises(FileNotFoundError): - parse_cpu_file(bad_process_object) + parse_cpu_file(bad_process_object_v1_1) + with pytest.raises(ValueError): + parse_cpu_file(bad_process_object_v1_2) with pytest.raises(ValueError): parse_cpu_file(bad_process_object_v2) @@ -170,24 +179,25 @@ def test_get_cgroup_name(mocker): assert get_cgroup_name() == "good/cgroup/place/2222222" -def test_parse_memory_allocated(mocker, tmpdir): - mem_allocated_file = tmpdir.join("memory.max") - mem_allocated_file.write('99999') +def test_parse_memory_allocated(tmp_path_factory): + good_mem_dir = tmp_path_factory.mktemp("mem_dir") + mem_allocated_file = good_mem_dir / "memory.max" + mem_allocated_file.write_text('99999') # We currently do not track memory allocated for cgroups v1 good_process_object_v1 = Process( cgroup_memory_path='', cgroup_cpu_path='', - memory_allocated_path=tmpdir, + memory_allocated_path=str(good_mem_dir), cgroup_version=1) good_process_object_v2 = Process( cgroup_memory_path='', cgroup_cpu_path='', - memory_allocated_path=tmpdir, + memory_allocated_path=str(good_mem_dir), cgroup_version=2) - bad_process_object_v2 = Process( + bad_process_object_v2_1 = Process( cgroup_memory_path='', cgroup_cpu_path='', memory_allocated_path='/', @@ -196,8 +206,44 @@ def test_parse_memory_allocated(mocker, tmpdir): assert parse_memory_allocated(good_process_object_v1) == 0 assert parse_memory_allocated(good_process_object_v2) == 99999 with pytest.raises(FileNotFoundError): - parse_memory_file(bad_process_object_v2) + parse_memory_file(bad_process_object_v2_1) + + # Nested directories with 'max' value + base_dir = tmp_path_factory.mktemp("base") + + dir_1 = base_dir / "dir_1" + dir_1.mkdir() + mem_file_1 = dir_1 / "memory.max" + mem_file_1.write_text("max") + + dir_2 = dir_1 / "dir_2" + dir_2.mkdir() + mem_file_2 = dir_2 / "memory.max" + mem_file_2.write_text("max") + + dir_3 = dir_2 / "dir_3" + dir_3.mkdir() + mem_file_3 = dir_3 / "memory.max" + mem_file_3.write_text("max") + + dir_4 = dir_3 / "dir_4" + dir_4.mkdir() + mem_file_4 = dir_4 / "memory.max" + mem_file_4.write_text("max") + + dir_5 = dir_4 / "dir_5" + dir_5.mkdir() + mem_file_5 = dir_5 / "memory.max" + mem_file_5.write_text("max") + + + bad_process_object_v2_2 = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path=str(dir_5), + cgroup_version=2) + assert parse_memory_allocated(bad_process_object_v2_2) == 0 def test_get_cgroup_name_file_not_found(mocker): From b4a32cd1e256ab43827d38870d22f761822317a5 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 20 Nov 2025 13:38:07 +0000 Subject: [PATCH 06/47] Added custom exception for cylc profiler --- cylc/flow/exceptions.py | 13 ++ cylc/flow/scripts/profiler.py | 133 +++++++++++-------- tests/functional/jobscript/04-profiler-e2e.t | 2 +- tests/unit/scripts/test_profiler.py | 39 +++--- 4 files changed, 110 insertions(+), 77 deletions(-) diff --git a/cylc/flow/exceptions.py b/cylc/flow/exceptions.py index 7e55c014756..6faab0e25b9 100644 --- a/cylc/flow/exceptions.py +++ b/cylc/flow/exceptions.py @@ -529,3 +529,16 @@ def __init__(self, message, expr=None): def __str__(self): return self.message + + +class CylcProfilerError(CylcError): + """Exception for errors raised from the cylc profiler. These errors do not + affect workflows functionally, just stats gathering. We don't want to + panic users.""" + def __init__(self, exc: Exception, error_msg: str) -> None: + CylcError.__init__( + self, + f"{exc}. {error_msg}. This error came from the Cylc profiler" + f" and is not a problem with your workflow. Statistics gathering " + f"for the analysis view may be incomplete." + ) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 8e2bc5c52d6..0a9667c7bc2 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -31,9 +31,10 @@ from functools import partial from dataclasses import dataclass -from cylc.flow.network.client_factory import get_client -from cylc.flow.option_parsers import CylcOptionParser as COP +from cylc.flow.exceptions import CylcProfilerError from cylc.flow.terminal import cli_function +from cylc.flow.option_parsers import CylcOptionParser as COP +from cylc.flow.network.client_factory import get_client PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") @@ -106,58 +107,70 @@ def get_resource_usage(process): def parse_memory_file(process: Process): """Open the memory stat file and copy the appropriate data""" - if process.cgroup_version == 2: - with open(process.cgroup_memory_path, 'r') as f: - for line in f: - if "anon" in line: - return int(''.join(filter(str.isdigit, line))) - else: - with open(process.cgroup_memory_path, 'r') as f: - for line in f: - if "total_rss" in line: - return int(''.join(filter(str.isdigit, line))) + try: + if process.cgroup_version == 2: + with open(process.cgroup_memory_path, 'r') as f: + for line in f: + if "anon" in line: + return int(''.join(filter(str.isdigit, line))) + else: + with open(process.cgroup_memory_path, 'r') as f: + for line in f: + if "total_rss" in line: + return int(''.join(filter(str.isdigit, line))) + except Exception as err: + raise CylcProfilerError( + err, "Unable to find memory usage data") from err def parse_memory_allocated(process: Process) -> int: """Open the memory stat file and copy the appropriate data""" if process.cgroup_version == 2: cgroup_memory_path = Path(process.memory_allocated_path) - for i in range(5): + for _ in range(5): with open(cgroup_memory_path / "memory.max", 'r') as f: line = f.readline() if "max" not in line: return int(line) cgroup_memory_path = cgroup_memory_path.parent return 0 - else : # Memory limit not tracked for cgroups v1 + else: # Memory limit not tracked for cgroups v1 return 0 + def parse_cpu_file(process: Process) -> int: """Open the CPU stat file and return the appropriate data""" - if process.cgroup_version == 2: - with open(process.cgroup_cpu_path, 'r') as f: - for line in f: - if "usage_usec" in line: - return int(RE_INT.findall(line)[0]) // 1000 - raise ValueError("Unable to find cpu usage data") - else: - with open(process.cgroup_cpu_path, 'r') as f: - try: + try: + if process.cgroup_version == 2: + with open(process.cgroup_cpu_path, 'r') as f: + for line in f: + if "usage_usec" in line: + return int(RE_INT.findall(line)[0]) // 1000 + raise FileNotFoundError(process.cgroup_cpu_path) + + elif process.cgroup_version == 1: + with open(process.cgroup_cpu_path, 'r') as f: for line in f: # Cgroups v1 uses nanoseconds return int(line) // 1000000 - except ValueError: - raise ValueError("Unable to find cpu usage data") + raise FileNotFoundError(process.cgroup_cpu_path) + + except Exception as err: + raise CylcProfilerError( + err, "Unable to find cpu usage data") from err def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: - if Path.exists(Path(cgroup_location + cgroup_name)): - return 2 - elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): - return 1 - else: - raise FileNotFoundError("Cgroup not found at " + - cgroup_location + cgroup_name) + try: + if Path.exists(Path(cgroup_location + cgroup_name)): + return 2 + elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): + return 1 + raise FileNotFoundError(cgroup_location + cgroup_name) + except Exception as err: + raise CylcProfilerError( + err, "Cgroup not found at " + cgroup_location + + cgroup_name) from err def get_cgroup_name(): @@ -175,38 +188,40 @@ def get_cgroup_name(): result = f.read() result = PID_REGEX.search(result).group() return result - except FileNotFoundError as err: - raise FileNotFoundError( - '/proc/' + str(pid) + '/cgroup not found') from err - except AttributeError as err: - raise AttributeError("No cgroup found for process:", pid) from err + except Exception as err: + raise CylcProfilerError( + err, '/proc/' + str(pid) + '/cgroup not found') from err def get_cgroup_paths(location) -> Process: - cgroup_name = get_cgroup_name() - cgroup_version = get_cgroup_version(location, cgroup_name) - if cgroup_version == 2: - return Process( - cgroup_memory_path=location + - cgroup_name + "/" + "memory.stat", - cgroup_cpu_path=location + - cgroup_name + "/" + "cpu.stat", - memory_allocated_path=location + cgroup_name, - cgroup_version=cgroup_version, - ) - - elif cgroup_version == 1: - return Process( - cgroup_memory_path=location + "memory/" + - cgroup_name + "/memory.stat", - cgroup_cpu_path=location + "cpu/" + - cgroup_name + "/cpuacct.usage", - memory_allocated_path="", - cgroup_version=cgroup_version, - ) - raise ValueError("Unable to determine cgroup version") + try: + cgroup_name = get_cgroup_name() + cgroup_version = get_cgroup_version(location, cgroup_name) + if cgroup_version == 2: + return Process( + cgroup_memory_path=location + + cgroup_name + "/" + "memory.stat", + cgroup_cpu_path=location + + cgroup_name + "/" + "cpu.stat", + memory_allocated_path=location + cgroup_name, + cgroup_version=cgroup_version, + ) + + elif cgroup_version == 1: + return Process( + cgroup_memory_path=location + "memory/" + + cgroup_name + "/memory.stat", + cgroup_cpu_path=location + "cpu/" + + cgroup_name + "/cpuacct.usage", + memory_allocated_path="", + cgroup_version=cgroup_version, + ) + raise Exception + except Exception as err: + raise CylcProfilerError( + err, "Unable to determine cgroup version") from err def profile(_process: Process, delay, keep_looping=lambda: True): diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index 82436a8e214..8e960647233 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -30,7 +30,7 @@ set_test_number 7 mkdir -p "${PWD}/cgroups_test_data" -echo '12345678' > cgroups_test_data/memory.stat +echo 'anon 12345678' > cgroups_test_data/memory.stat echo '123456789' > cgroups_test_data/memory.max printf "blah blah 123456\nusage_usec 56781234" > cgroups_test_data/cpu.stat diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 6ab52a33143..fa76b1e5639 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -27,15 +27,13 @@ Process) import pytest from unittest import mock +from cylc.flow.exceptions import CylcProfilerError def test_stop_profiler(mocker, monkeypatch, tmpdir): monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") - def mock_get_client(env_var, timeout=None): - return True - - class MockedClient(): + class MockedClient: def __init__(self, *a, **k): pass @@ -45,7 +43,7 @@ async def async_request(self, *a, **k): mocker.patch("cylc.flow.scripts.profiler.get_client", MockedClient) mem_file = tmpdir.join("memory_file.txt") - mem_file.write('1234') + mem_file.write('total_rss 1234') cpu_file = tmpdir.join("cpu_file.txt") cpu_file.write('5678') mem_allocated_file = tmpdir.join("memory_allocated.txt") @@ -106,15 +104,16 @@ def test_parse_memory_file(mocker, tmpdir): memory_allocated_path='', cgroup_version=1) - with pytest.raises(FileNotFoundError): + with pytest.raises(CylcProfilerError) as excinfo: parse_memory_file(bad_process_object) + assert "Unable to find memory usage data" in str(excinfo.value) # Test the parse_memory_file function assert parse_memory_file(good_process_object_v1) == 1024 assert parse_memory_file(good_process_object_v2) == 666 -def test_parse_cpu_file(mocker, tmpdir): +def test_parse_cpu_file(tmpdir): mem_file = tmpdir.join("memory_file.txt") mem_file.write('1024') @@ -159,19 +158,22 @@ def test_parse_cpu_file(mocker, tmpdir): assert parse_cpu_file(good_process_object_v1) == 1234 assert parse_cpu_file(good_process_object_v2) == 1234567 - with pytest.raises(FileNotFoundError): + with pytest.raises(CylcProfilerError) as excinfo: parse_cpu_file(bad_process_object_v1_1) - with pytest.raises(ValueError): + assert "Unable to find cpu usage data" in str(excinfo.value) + with pytest.raises(CylcProfilerError) as excinfo: parse_cpu_file(bad_process_object_v1_2) - with pytest.raises(ValueError): + assert "Unable to find cpu usage data" in str(excinfo.value) + with pytest.raises(CylcProfilerError) as excinfo: parse_cpu_file(bad_process_object_v2) + assert "Unable to find cpu usage data" in str(excinfo.value) def test_get_cgroup_name(mocker): mock_file = mocker.mock_open(read_data="0::bad/test/cgroup/place") mocker.patch("builtins.open", mock_file) - with pytest.raises(AttributeError): + with pytest.raises(CylcProfilerError): get_cgroup_name() mock_file = mocker.mock_open(read_data="0::good/cgroup/place/2222222") @@ -205,9 +207,9 @@ def test_parse_memory_allocated(tmp_path_factory): assert parse_memory_allocated(good_process_object_v1) == 0 assert parse_memory_allocated(good_process_object_v2) == 99999 - with pytest.raises(FileNotFoundError): + with pytest.raises(CylcProfilerError) as excinfo: parse_memory_file(bad_process_object_v2_1) - + assert "Unable to find memory usage data" in str(excinfo.value) # Nested directories with 'max' value base_dir = tmp_path_factory.mktemp("base") @@ -236,7 +238,6 @@ def test_parse_memory_allocated(tmp_path_factory): mem_file_5 = dir_5 / "memory.max" mem_file_5.write_text("max") - bad_process_object_v2_2 = Process( cgroup_memory_path='', cgroup_cpu_path='', @@ -245,14 +246,16 @@ def test_parse_memory_allocated(tmp_path_factory): assert parse_memory_allocated(bad_process_object_v2_2) == 0 + def test_get_cgroup_name_file_not_found(mocker): def mock_os_pid(): return 'The Thing That Should Not Be' mocker.patch("os.getpid", mock_os_pid) - with pytest.raises(FileNotFoundError): + with pytest.raises(CylcProfilerError) as excinfo: get_cgroup_name() + assert "/cgroup not found" in str(excinfo.value) def test_get_cgroup_version(mocker): @@ -268,9 +271,10 @@ def test_get_cgroup_version(mocker): # Mock the Path.exists function call to return False mocker.patch("pathlib.Path.exists", return_value=False) - with pytest.raises(FileNotFoundError): + with pytest.raises(CylcProfilerError) as excinfo: get_cgroup_version('stuff/in/other/place', 'things') + assert "Cgroup not found" in str(excinfo.value) def test_get_cgroup_paths(mocker): @@ -297,8 +301,9 @@ def test_get_cgroup_paths(mocker): return_value='test_name') mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", return_value=3) - with pytest.raises(ValueError): + with pytest.raises(CylcProfilerError) as excinfo: get_cgroup_paths("test_location/") + assert "Unable to determine cgroup version" in str(excinfo.value) def test_profile_data(mocker): From d310c128757ad67e3b64908086b45fd4de554a31 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 4 Dec 2025 10:20:33 +0000 Subject: [PATCH 07/47] Linting --- cylc/flow/scripts/profiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 0a9667c7bc2..002ee0652c9 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -158,6 +158,7 @@ def parse_cpu_file(process: Process) -> int: except Exception as err: raise CylcProfilerError( err, "Unable to find cpu usage data") from err + return 0 def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: From 749f4568d9a9188fa5bc9c52e5e97f66df71a865 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 13 Jan 2026 15:51:43 +0000 Subject: [PATCH 08/47] Removing spurious version control text --- .github/workflows/build.yml | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 567b05aee2e..f9151fff5a0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -32,43 +32,8 @@ jobs: strategy: fail-fast: false matrix: -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD os: ['ubuntu-latest', 'macos-latest'] python: ['3.12', '3'] -======= -======= ->>>>>>> 38c31f56d (Fail gracefully if cgroups cannot be found) - os: ['ubuntu-latest'] - python: ['3.8', '3.9', '3.10', '3.11'] - include: - - os: 'ubuntu-22.04' -<<<<<<< HEAD - python: '3.7' - - os: 'macos-latest' - python: '3.8' ->>>>>>> 9727be9e5 (CPU/Memory Logging working) -======= - os: ['ubuntu-latest', 'macos-latest'] - python: ['3.7', '3.8', '3.9', '3.10', '3'] - exclude: - - os: 'macos-latest' - python: '3.7' ->>>>>>> 171f7ee1d (GH Actions: use explicit `bash` shell & other defaults) -======= - python: '3.7' - - os: 'macos-latest' - python: '3.8' ->>>>>>> 38c31f56d (Fail gracefully if cgroups cannot be found) -======= - os: ['ubuntu-latest', 'macos-latest'] - python: ['3.7', '3.8', '3.9', '3.10', '3'] - exclude: - - os: 'macos-latest' - python: '3.7' ->>>>>>> ad6b3a133 (Fixing my terrible rebasing) steps: - name: Checkout uses: actions/checkout@v6 From c5b175c97c224b1d56323f155161d71700c5a583 Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Wed, 4 Feb 2026 12:59:10 +0000 Subject: [PATCH 09/47] profiler: change message format to JSON --- .../etc/examples/error-handling/bin/my-model | 18 ++++++++ .../etc/examples/error-handling/flow.cylc | 9 ++++ .../etc/examples/error-handling/index.rst | 2 + cylc/flow/scripts/profiler.py | 41 +++++++++++-------- tests/unit/scripts/test_profiler.py | 13 +++--- 5 files changed, 60 insertions(+), 23 deletions(-) create mode 100644 cylc/flow/etc/examples/error-handling/bin/my-model create mode 100644 cylc/flow/etc/examples/error-handling/flow.cylc create mode 100644 cylc/flow/etc/examples/error-handling/index.rst diff --git a/cylc/flow/etc/examples/error-handling/bin/my-model b/cylc/flow/etc/examples/error-handling/bin/my-model new file mode 100644 index 00000000000..a8f642a63f6 --- /dev/null +++ b/cylc/flow/etc/examples/error-handling/bin/my-model @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +echo 'running my-model' +sleep 2 + +case $(( RANDOM % 3 )) in; + 0) + echo 'succeeded' + exit 0 + ;; + 1) + echo 'ERROR: NaN' + exit 1 + ;; + 2) + echo 'ERROR: Configuration issue' + exit 2 + ;; diff --git a/cylc/flow/etc/examples/error-handling/flow.cylc b/cylc/flow/etc/examples/error-handling/flow.cylc new file mode 100644 index 00000000000..be445ebb223 --- /dev/null +++ b/cylc/flow/etc/examples/error-handling/flow.cylc @@ -0,0 +1,9 @@ +[scheduling] + [[graph]] + R1 = """ + foo => bar + """ + +[runtime] + [[foo]] + script = my-model diff --git a/cylc/flow/etc/examples/error-handling/index.rst b/cylc/flow/etc/examples/error-handling/index.rst new file mode 100644 index 00000000000..92a9708c8ae --- /dev/null +++ b/cylc/flow/etc/examples/error-handling/index.rst @@ -0,0 +1,2 @@ +Error Handling +-------------- diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 002ee0652c9..732703f5eb6 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -20,6 +20,7 @@ the resource usage of jobs running on the node. """ +import json import os import re import sys @@ -50,11 +51,13 @@ class Process: cgroup_version: int -def stop_profiler(process, comms_timeout, *args): - """This function will be executed when the SIGINT signal is sent - to this process""" +def stop_profiler(process, comms_timeout, *_args): + """Stop the profiler and return its data to the sceduler. - max_rss, cpu_time, memory_allocated = get_resource_usage(process) + This function will be executed when the SIGINT signal is sent to this + process. + """ + profiler_data = get_profiler_data(process) graphql_mutation = """ mutation($WORKFLOWS: [WorkflowID]!, @@ -70,9 +73,8 @@ def stop_profiler(process, comms_timeout, *args): "WORKFLOWS": [os.environ.get('CYLC_WORKFLOW_ID')], "MESSAGES": [[ "DEBUG", - f"cpu_time {cpu_time} " - f"max_rss {max_rss} " - f"mem_alloc {memory_allocated}"]], + f'_cylc_profiler: {json.dumps(profiler_data)}', + ]], "JOB": os.environ.get('CYLC_TASK_JOB'), "TIME": "now" } @@ -91,17 +93,24 @@ async def send_cylc_message(): sys.exit(0) -def get_resource_usage(process): +def get_profiler_data(process): # If a task fails instantly, or finishes very quickly (< 1 second), # the get config function doesn't have time to run - if (process.cgroup_memory_path is None - or process.cgroup_cpu_path is None - or process.memory_allocated_path is None): - return 0, 0, 0 - max_rss = parse_memory_file(process) - cpu_time = parse_cpu_file(process) - memory_allocated = parse_memory_allocated(process) - return max_rss, cpu_time, memory_allocated + if ( + process.cgroup_memory_path is None + or process.cgroup_cpu_path is None + or process.memory_allocated_path is None + ): + max_rss = cpu_time = memory_allocated = 0 + else: + max_rss = parse_memory_file(process) + cpu_time = parse_cpu_file(process) + memory_allocated = parse_memory_allocated(process) + return { + 'max_rss': max_rss, + 'cpu_time': cpu_time, + 'memory_allocated': memory_allocated, + } def parse_memory_file(process: Process): diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index fa76b1e5639..a7c455e7050 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -21,7 +21,7 @@ get_cgroup_name, get_cgroup_version, get_cgroup_paths, - get_resource_usage, + get_profiler_data, stop_profiler, profile, Process) @@ -57,7 +57,6 @@ async def async_request(self, *a, **k): with pytest.raises(SystemExit) as excinfo: stop_profiler(process_object, 1) - assert excinfo.type == SystemExit assert excinfo.value.code == 0 @@ -70,11 +69,11 @@ def test_get_resource_usage(mocker, monkeypatch, tmpdir): memory_allocated_path=None, cgroup_version=1) - max_rss, cpu_time, memory_allocated = get_resource_usage(process_object) - - assert max_rss == 0 - assert cpu_time == 0 - assert memory_allocated == 0 + assert get_profiler_data(process_object) == { + 'max_rss': 0, + 'cpu_time': 0, + 'memory_allocated': 0, + } def test_parse_memory_file(mocker, tmpdir): From 19ad0d01ee17c6b5e67560af3a3f27f2a22a72fb Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 2 Mar 2026 14:08:28 +0000 Subject: [PATCH 10/47] Removing accidentally added files --- .../etc/examples/error-handling/bin/my-model | 18 ------------------ .../flow/etc/examples/error-handling/flow.cylc | 9 --------- .../flow/etc/examples/error-handling/index.rst | 2 -- 3 files changed, 29 deletions(-) delete mode 100644 cylc/flow/etc/examples/error-handling/bin/my-model delete mode 100644 cylc/flow/etc/examples/error-handling/flow.cylc delete mode 100644 cylc/flow/etc/examples/error-handling/index.rst diff --git a/cylc/flow/etc/examples/error-handling/bin/my-model b/cylc/flow/etc/examples/error-handling/bin/my-model deleted file mode 100644 index a8f642a63f6..00000000000 --- a/cylc/flow/etc/examples/error-handling/bin/my-model +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash - -echo 'running my-model' -sleep 2 - -case $(( RANDOM % 3 )) in; - 0) - echo 'succeeded' - exit 0 - ;; - 1) - echo 'ERROR: NaN' - exit 1 - ;; - 2) - echo 'ERROR: Configuration issue' - exit 2 - ;; diff --git a/cylc/flow/etc/examples/error-handling/flow.cylc b/cylc/flow/etc/examples/error-handling/flow.cylc deleted file mode 100644 index be445ebb223..00000000000 --- a/cylc/flow/etc/examples/error-handling/flow.cylc +++ /dev/null @@ -1,9 +0,0 @@ -[scheduling] - [[graph]] - R1 = """ - foo => bar - """ - -[runtime] - [[foo]] - script = my-model diff --git a/cylc/flow/etc/examples/error-handling/index.rst b/cylc/flow/etc/examples/error-handling/index.rst deleted file mode 100644 index 92a9708c8ae..00000000000 --- a/cylc/flow/etc/examples/error-handling/index.rst +++ /dev/null @@ -1,2 +0,0 @@ -Error Handling --------------- From ef9e59960c98068889a4147e441f844f88707db5 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 2 Mar 2026 15:35:37 +0000 Subject: [PATCH 11/47] change how the profiler sends its data --- cylc/flow/scripts/profiler.py | 58 ++++++++--------------------------- cylc/flow/task_message.py | 24 ++++++++++----- 2 files changed, 30 insertions(+), 52 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 732703f5eb6..ea1e44f1a13 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -26,16 +26,15 @@ import sys import time import signal -import asyncio from pathlib import Path from functools import partial from dataclasses import dataclass from cylc.flow.exceptions import CylcProfilerError -from cylc.flow.terminal import cli_function from cylc.flow.option_parsers import CylcOptionParser as COP -from cylc.flow.network.client_factory import get_client +from cylc.flow.task_message import record_messages +from cylc.flow.terminal import cli_function PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") @@ -52,55 +51,29 @@ class Process: def stop_profiler(process, comms_timeout, *_args): - """Stop the profiler and return its data to the sceduler. + """Stop the profiler and return its data to the scheduler. - This function will be executed when the SIGINT signal is sent to this - process. + This function will be executed when the profiler receives a stop signal. """ profiler_data = get_profiler_data(process) - graphql_mutation = """ - mutation($WORKFLOWS: [WorkflowID]!, - $MESSAGES: [[String]], $JOB: String!, $TIME: String) { - message(workflows: $WORKFLOWS, messages:$MESSAGES, - taskJob:$JOB, eventTime:$TIME) { - result - } - } - """ - - graphql_request_variables = { - "WORKFLOWS": [os.environ.get('CYLC_WORKFLOW_ID')], - "MESSAGES": [[ - "DEBUG", - f'_cylc_profiler: {json.dumps(profiler_data)}', - ]], - "JOB": os.environ.get('CYLC_TASK_JOB'), - "TIME": "now" - } - - pclient = get_client(os.environ.get('CYLC_WORKFLOW_ID'), - timeout=comms_timeout) - - async def send_cylc_message(): - await pclient.async_request( - 'graphql', - {'request_string': graphql_mutation, - 'variables': graphql_request_variables}, - ) - - asyncio.run(send_cylc_message()) + record_messages( + os.environ['CYLC_WORKFLOW_ID'], + os.environ['CYLC_TASK_JOB'], + [['DEBUG', f'_cylc_profiler: {json.dumps(profiler_data)}']], + comms_timeout = comms_timeout, + ) sys.exit(0) def get_profiler_data(process): - # If a task fails instantly, or finishes very quickly (< 1 second), - # the get config function doesn't have time to run if ( process.cgroup_memory_path is None or process.cgroup_cpu_path is None or process.memory_allocated_path is None ): + # If a task fails instantly, or finishes very quickly (< 1 second), + # the get config function doesn't have time to run max_rss = cpu_time = memory_allocated = 0 else: max_rss = parse_memory_file(process) @@ -188,7 +161,7 @@ def get_cgroup_name(): # fugly hack to allow functional tests to use test data if 'profiler_test_env_var' in os.environ: - return os.getenv('profiler_test_env_var') + return os.environ['profiler_test_env_var'] # Get the PID of the current process pid = os.getpid() @@ -279,8 +252,3 @@ def _main(options) -> None: # run profiler run profile(process, options.delay) - - -if __name__ == "__main__": - arg_parser = get_option_parser() - _main(arg_parser.parse_args([])) diff --git a/cylc/flow/task_message.py b/cylc/flow/task_message.py index 6f4f41bd5ad..5d287c5a530 100644 --- a/cylc/flow/task_message.py +++ b/cylc/flow/task_message.py @@ -92,24 +92,30 @@ def split_run_signal(message: str) -> tuple[str, str | None]: return prefix, signal[0] if signal else None -def record_messages(workflow: str, job_id: str, messages: List[list]) -> None: +def record_messages( + workflow: str, + job_id: str, + messages: List[list], + comms_timeout: float | None = None, +) -> None: """Record task job messages. - Print the messages according to their severity. - Write the messages in the job status file. - Send the messages to the workflow, if possible. + * Print the messages according to their severity. + * Write the messages in the job status file. + * Send the messages to the workflow, if possible. Arguments: workflow: Workflow ID. job_id: Job identifier "CYCLE/TASK_NAME/SUBMIT_NUM". messages: List of messages "[[severity, message], ...]". + comms_timeout: Used for sending messages if appropriate. """ # Record the event time, in case the message is delayed in some way. event_time = get_current_time_string( override_use_utc=(os.getenv('CYLC_UTC') == 'True')) write_messages(workflow, job_id, messages, event_time) if get_comms_method() != CommsMeth.POLL: - send_messages(workflow, job_id, messages, event_time) + send_messages(workflow, job_id, messages, event_time, comms_timeout) def write_messages(workflow, job_id, messages, event_time): @@ -126,11 +132,15 @@ def write_messages(workflow, job_id, messages, event_time): def send_messages( - workflow: str, job_id: str, messages: List[list], event_time: str + workflow: str, + job_id: str, + messages: List[list], + event_time: str, + comms_timeout: float | None = None, ) -> None: workflow = os.path.normpath(workflow) try: - pclient = get_client(workflow) + pclient = get_client(workflow, timeout=comms_timeout) except WorkflowStopped: # on a remote host this means the contact file is not present # either the workflow is stopped or the contact file is not present From ea0b764723d907c8298d9e9a86dd9d8cab8dbf3a Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 2 Mar 2026 16:20:50 +0000 Subject: [PATCH 12/47] Updating unit tests --- cylc/flow/scripts/profiler.py | 2 +- tests/unit/scripts/test_profiler.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index ea1e44f1a13..13de2f200d4 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -61,7 +61,7 @@ def stop_profiler(process, comms_timeout, *_args): os.environ['CYLC_WORKFLOW_ID'], os.environ['CYLC_TASK_JOB'], [['DEBUG', f'_cylc_profiler: {json.dumps(profiler_data)}']], - comms_timeout = comms_timeout, + comms_timeout=comms_timeout, ) sys.exit(0) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index a7c455e7050..c31cf33bd80 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -30,7 +30,7 @@ from cylc.flow.exceptions import CylcProfilerError -def test_stop_profiler(mocker, monkeypatch, tmpdir): +def test_stop_profiler(monkeypatch, tmpdir): monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") class MockedClient: @@ -40,8 +40,6 @@ def __init__(self, *a, **k): async def async_request(self, *a, **k): pass - mocker.patch("cylc.flow.scripts.profiler.get_client", MockedClient) - mem_file = tmpdir.join("memory_file.txt") mem_file.write('total_rss 1234') cpu_file = tmpdir.join("cpu_file.txt") @@ -60,7 +58,7 @@ async def async_request(self, *a, **k): assert excinfo.value.code == 0 -def test_get_resource_usage(mocker, monkeypatch, tmpdir): +def test_get_resource_usage(monkeypatch, tmpdir): monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") process_object = Process( @@ -76,7 +74,7 @@ def test_get_resource_usage(mocker, monkeypatch, tmpdir): } -def test_parse_memory_file(mocker, tmpdir): +def test_parse_memory_file(tmpdir): mem_file_v1 = tmpdir.join("memory_file_v1.txt") mem_file_v1.write('total_rss=1024') From e880ab49eeeba00084a672c13fc8dd80569d7a09 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 3 Mar 2026 16:33:44 +0000 Subject: [PATCH 13/47] Updating unit tests --- tests/unit/scripts/test_profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index c31cf33bd80..6367e76c290 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -32,7 +32,7 @@ def test_stop_profiler(monkeypatch, tmpdir): monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") - + monkeypatch.setenv('CYLC_TASK_JOB', "test_task_job") class MockedClient: def __init__(self, *a, **k): pass From 430372684315b7012086509f066c805e413673b9 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 3 Mar 2026 16:40:16 +0000 Subject: [PATCH 14/47] Updating unit tests --- tests/unit/scripts/test_profiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 6367e76c290..bf80b2f5b1b 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -33,6 +33,7 @@ def test_stop_profiler(monkeypatch, tmpdir): monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") monkeypatch.setenv('CYLC_TASK_JOB', "test_task_job") + class MockedClient: def __init__(self, *a, **k): pass From 7fd5e4b6de5ea26e438cde6f536fd883941a5bcb Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 5 Mar 2026 10:17:22 +0000 Subject: [PATCH 15/47] Updating unit tests --- .../jobscript/{03-profiler.t => 03-profiler-e2e.t} | 0 .../jobscript/{04-profiler-e2e.t => 04-profiler.t} | 6 +++--- 2 files changed, 3 insertions(+), 3 deletions(-) rename tests/functional/jobscript/{03-profiler.t => 03-profiler-e2e.t} (100%) rename tests/functional/jobscript/{04-profiler-e2e.t => 04-profiler.t} (95%) diff --git a/tests/functional/jobscript/03-profiler.t b/tests/functional/jobscript/03-profiler-e2e.t similarity index 100% rename from tests/functional/jobscript/03-profiler.t rename to tests/functional/jobscript/03-profiler-e2e.t diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler.t similarity index 95% rename from tests/functional/jobscript/04-profiler-e2e.t rename to tests/functional/jobscript/04-profiler.t index 8e960647233..84a30485d19 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler.t @@ -72,14 +72,14 @@ workflow_run_ok "${TEST_NAME_BASE}-run" cylc play --debug --no-detach "${WORKFLO # were received before the succeeded message log_scan "${TEST_NAME_BASE}-task-succeeded" \ "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ - '1/the_good.*(received)cpu_time.*max_rss*' \ - '1/the_good.*(received)succeeded' + '1/the_good.*(received)_cylc_profiler.*cpu_time' \ + '1/the_good.*(received)succeeded' # ensure the cpu and memory messages were received and that these messages # were received before the failed message log_scan "${TEST_NAME_BASE}-task-succeeded" \ "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ - '1/the_bad.*(received)cpu_time.*max_rss*' \ + '1/the_bad.*(received)_cylc_profiler.*cpu_time.*' \ '1/the_bad.*failed' # ensure this task succeeded despite the broken profiler configuration From 91f73bb87233a08614c853adc0d2af0edf30cf09 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 5 Mar 2026 14:27:18 +0000 Subject: [PATCH 16/47] Adding mechanism to ensure profiler is killed --- cylc/flow/remote.py | 32 +++++++++++++++++---- cylc/flow/scripts/profiler.py | 28 +++++++++++++++---- tests/unit/scripts/test_profiler.py | 43 +++++++++++++++++------------ 3 files changed, 75 insertions(+), 28 deletions(-) diff --git a/cylc/flow/remote.py b/cylc/flow/remote.py index f59a2a3cd26..cf53ae95883 100644 --- a/cylc/flow/remote.py +++ b/cylc/flow/remote.py @@ -15,6 +15,7 @@ # along with this program. If not, see . """Run command on a remote, (i.e. a remote [user@]host).""" +import asyncio import os from pathlib import Path from posix import WIFSIGNALED @@ -76,17 +77,38 @@ def get_proc_ancestors(): pid = ppid -def watch_and_kill(proc): - """Kill proc if my PPID (etc.) changed - e.g. ssh connection dropped.""" +async def watch_and_kill(proc, interval=None): + """Watch a process and kill it if any of its parent processes change. + + Processes exist in a tree which inherits from the process with PID 1. + + If a parent process dies, the child will be re-assigned a new parent. + This can happen: + * If the parent is killed but the signal is not propagated to its children + (or is caught and swallowed). + * If an SSH connection drops. + + This coroutine monitors the parents of a process and will kill the child + if any of its parents change. + + Args: + proc: + The process to monitor and kill if needed. + interval: + The polling interval to check the parent process tree in seconds. + If not provided, this will default to the environment variable + "CYLC_PROC_POLL_INTERVAL" if set, else 60. + + """ gpa = get_proc_ancestors() # Allow customising the interval to allow tests to run faster: - interval = float(os.getenv('CYLC_PROC_POLL_INTERVAL', 60)) + interval = interval or float(os.getenv('CYLC_PROC_POLL_INTERVAL', 60)) while True: - sleep(interval) + await asyncio.sleep(interval) if proc.poll() is not None: break if get_proc_ancestors() != gpa: - sleep(1) + await asyncio.sleep(1) os.kill(proc.pid, signal.SIGTERM) break diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 13de2f200d4..ca5413c1bea 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -20,6 +20,7 @@ the resource usage of jobs running on the node. """ +import asyncio import json import os import re @@ -30,9 +31,11 @@ from pathlib import Path from functools import partial from dataclasses import dataclass +from subprocess import Popen, PIPE from cylc.flow.exceptions import CylcProfilerError from cylc.flow.option_parsers import CylcOptionParser as COP +from cylc.flow.remote import watch_and_kill from cylc.flow.task_message import record_messages from cylc.flow.terminal import cli_function @@ -207,14 +210,14 @@ def get_cgroup_paths(location) -> Process: err, "Unable to determine cgroup version") from err -def profile(_process: Process, delay, keep_looping=lambda: True): +async def profile(_process: Process, delay, keep_looping=lambda: True): # The infinite loop that will constantly poll the cgroup # The lambda function is used to allow the loop to be stopped in unit tests while keep_looping(): # Write cpu / memory usage data to disk # CPU_TIME = parse_cpu_file(process.cgroup_cpu_path, version) - time.sleep(delay) + await asyncio.sleep(delay) def get_option_parser() -> COP: @@ -237,10 +240,10 @@ def get_option_parser() -> COP: @cli_function(get_option_parser) def main(_parser: COP, options) -> None: """CLI main.""" - _main(options) + asyncio.run(_main(options)) -def _main(options) -> None: +async def _main(options) -> None: # get cgroup information process = get_cgroup_paths(options.cgroup_location) @@ -250,5 +253,18 @@ def _main(options) -> None: signal.signal(signal.SIGHUP, _stop_profiler) signal.signal(signal.SIGTERM, _stop_profiler) - # run profiler run - profile(process, options.delay) + proc = Popen( # nosec + ["ps", "-p", str(os.getpid()), "-oppid="], + stdout=PIPE, + stderr=PIPE, + text=True + ) + # the profiler will run until one of these coroutines calls `sys.exit`: + await asyncio.gather( + # run the profiler itself + profile(process, options.delay), + + # kill the profiler if its PPID changes + # (i.e, if the job exits before the profiler does) + watch_and_kill(proc), + ) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index bf80b2f5b1b..a39790e3eed 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -15,19 +15,25 @@ # along with this program. If not, see . # # Tests for functions contained in cylc.flow.scripts.profiler -from cylc.flow.scripts.profiler import (parse_memory_file, - parse_cpu_file, - parse_memory_allocated, - get_cgroup_name, - get_cgroup_version, - get_cgroup_paths, - get_profiler_data, - stop_profiler, - profile, - Process) -import pytest +import os from unittest import mock + +import pytest + from cylc.flow.exceptions import CylcProfilerError +from cylc.flow.scripts.profiler import ( + Process, + _main, + get_cgroup_name, + get_cgroup_paths, + get_cgroup_version, + get_profiler_data, + parse_cpu_file, + parse_memory_allocated, + parse_memory_file, + profile, + stop_profiler, +) def test_stop_profiler(monkeypatch, tmpdir): @@ -303,8 +309,8 @@ def test_get_cgroup_paths(mocker): get_cgroup_paths("test_location/") assert "Unable to determine cgroup version" in str(excinfo.value) - -def test_profile_data(mocker): +@pytest.mark.asyncio +async def test_profile_data(mocker): # This test should run without error mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", return_value='test_name') @@ -319,7 +325,7 @@ def test_profile_data(mocker): mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) run_once = mock.Mock(side_effect=[True, False]) - profile(process, 1, run_once) + await profile(process, 1, run_once) @pytest.fixture @@ -330,8 +336,12 @@ def options(mocker): opts.delay = 1 return opts +@pytest.mark.asyncio +async def test_main(mocker, options): + + # Speed up the test by reducing the poll interval + os.environ['CYLC_PROC_POLL_INTERVAL'] = "5" -def test_main(mocker, options): mock_get_cgroup_paths = mocker.patch( "cylc.flow.scripts.profiler.get_cgroup_paths" ) @@ -340,8 +350,7 @@ def test_main(mocker, options): mock_get_cgroup_paths.return_value = mocker.Mock() - from cylc.flow.scripts.profiler import _main - _main(options) + await _main(options) mock_get_cgroup_paths.assert_called_once_with("/fake/path") assert mock_signal.call_count == 3 From 7afd6a6e6ae276ddfb41fe8ae9beed8e0a25b7ea Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 5 Mar 2026 14:31:00 +0000 Subject: [PATCH 17/47] Linting --- cylc/flow/scripts/profiler.py | 1 - tests/unit/scripts/test_profiler.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index ca5413c1bea..aab4a3e8785 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -25,7 +25,6 @@ import os import re import sys -import time import signal from pathlib import Path diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index a39790e3eed..888a13997bb 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -309,6 +309,7 @@ def test_get_cgroup_paths(mocker): get_cgroup_paths("test_location/") assert "Unable to determine cgroup version" in str(excinfo.value) + @pytest.mark.asyncio async def test_profile_data(mocker): # This test should run without error @@ -336,6 +337,7 @@ def options(mocker): opts.delay = 1 return opts + @pytest.mark.asyncio async def test_main(mocker, options): From 6ec836a37d96c44343d130e45a07b05b0e0ccb4c Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 6 Mar 2026 13:40:50 +0000 Subject: [PATCH 18/47] Updating testing to cope with asyncio --- cylc/flow/scripts/profiler.py | 25 +++++++-------- tests/functional/jobscript/03-profiler-e2e.t | 8 ++--- .../flow.cylc | 0 tests/functional/jobscript/04-profiler.t | 2 +- .../flow.cylc | 0 tests/unit/scripts/test_profiler.py | 32 ++++++++++++------- 6 files changed, 38 insertions(+), 29 deletions(-) rename tests/functional/jobscript/{03-profiler => 03-profiler-e2e}/flow.cylc (100%) rename tests/functional/jobscript/{04-profiler-e2e => 04-profiler}/flow.cylc (100%) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index aab4a3e8785..a480fa98ecf 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -52,14 +52,14 @@ class Process: cgroup_version: int -def stop_profiler(process, comms_timeout, *_args): +async def stop_profiler(process, comms_timeout, *_args): """Stop the profiler and return its data to the scheduler. This function will be executed when the profiler receives a stop signal. """ profiler_data = get_profiler_data(process) - record_messages( + await record_messages( os.environ['CYLC_WORKFLOW_ID'], os.environ['CYLC_TASK_JOB'], [['DEBUG', f'_cylc_profiler: {json.dumps(profiler_data)}']], @@ -247,17 +247,16 @@ async def _main(options) -> None: process = get_cgroup_paths(options.cgroup_location) # Register the stop_profiler function with the signal library - _stop_profiler = partial(stop_profiler, process, options.comms_timeout) - signal.signal(signal.SIGINT, _stop_profiler) - signal.signal(signal.SIGHUP, _stop_profiler) - signal.signal(signal.SIGTERM, _stop_profiler) - - proc = Popen( # nosec - ["ps", "-p", str(os.getpid()), "-oppid="], - stdout=PIPE, - stderr=PIPE, - text=True - ) + loop = asyncio.get_running_loop() + for sig in (signal.SIGINT, signal.SIGHUP, signal.SIGTERM): + loop.add_signal_handler( + sig, + lambda: asyncio.create_task( + stop_profiler(process, options.comms_timeout) + ), + ) + + proc = Popen(["ps", "-p", str(os.getpid())]) # nosec # the profiler will run until one of these coroutines calls `sys.exit`: await asyncio.gather( # run the profiler itself diff --git a/tests/functional/jobscript/03-profiler-e2e.t b/tests/functional/jobscript/03-profiler-e2e.t index 2a910e6705c..a20237c3176 100644 --- a/tests/functional/jobscript/03-profiler-e2e.t +++ b/tests/functional/jobscript/03-profiler-e2e.t @@ -65,18 +65,18 @@ workflow_run_ok "${TEST_NAME_BASE}-run" cylc play --debug --no-detach "${WORKFLO # were received before the succeeded message log_scan "${TEST_NAME_BASE}-task-succeeded" \ "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ - '1/the_good.*(received)cpu_time.*max_rss*' \ + '1/the_good.*(received)_cylc_profiler.*cpu_time' \ '1/the_good.*(received)succeeded' # ensure the cpu and memory messages were received and that these messages # were received before the failed message log_scan "${TEST_NAME_BASE}-task-succeeded" \ "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ - '1/the_bad.*(received)cpu_time.*max_rss*' \ - '1/the_bad.*(received)failed' + '1/the_bad.*(received)_cylc_profiler.*cpu_time.*' \ + '1/the_bad.*failed' # ensure this task succeeded despite the broken profiler configuration grep_workflow_log_ok "${TEST_NAME_BASE}-broken" '1/the_ugly.*(received)succeeded' -grep_ok 'FileNotFoundError: Cgroup not found' "$(cylc cat-log "${WORKFLOW_NAME}//1/the_ugly" -f e -m p)" +grep_ok 'Unable to determine cgroup version' "$(cylc cat-log "${WORKFLOW_NAME}//1/the_ugly" -f e -m p)" purge diff --git a/tests/functional/jobscript/03-profiler/flow.cylc b/tests/functional/jobscript/03-profiler-e2e/flow.cylc similarity index 100% rename from tests/functional/jobscript/03-profiler/flow.cylc rename to tests/functional/jobscript/03-profiler-e2e/flow.cylc diff --git a/tests/functional/jobscript/04-profiler.t b/tests/functional/jobscript/04-profiler.t index 84a30485d19..f45592f2db0 100644 --- a/tests/functional/jobscript/04-profiler.t +++ b/tests/functional/jobscript/04-profiler.t @@ -73,7 +73,7 @@ workflow_run_ok "${TEST_NAME_BASE}-run" cylc play --debug --no-detach "${WORKFLO log_scan "${TEST_NAME_BASE}-task-succeeded" \ "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ '1/the_good.*(received)_cylc_profiler.*cpu_time' \ - '1/the_good.*(received)succeeded' + '1/the_good.*(received)succeeded' # ensure the cpu and memory messages were received and that these messages # were received before the failed message diff --git a/tests/functional/jobscript/04-profiler-e2e/flow.cylc b/tests/functional/jobscript/04-profiler/flow.cylc similarity index 100% rename from tests/functional/jobscript/04-profiler-e2e/flow.cylc rename to tests/functional/jobscript/04-profiler/flow.cylc diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 888a13997bb..82570efc2a6 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -16,6 +16,7 @@ # # Tests for functions contained in cylc.flow.scripts.profiler import os +import asyncio from unittest import mock import pytest @@ -36,16 +37,15 @@ ) -def test_stop_profiler(monkeypatch, tmpdir): +@pytest.mark.asyncio +async def test_stop_profiler(monkeypatch, tmpdir): monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") monkeypatch.setenv('CYLC_TASK_JOB', "test_task_job") - class MockedClient: - def __init__(self, *a, **k): - pass - - async def async_request(self, *a, **k): - pass + # Mock the async record_messages function with AsyncMock + monkeypatch.setattr( + 'cylc.flow.scripts.profiler.record_messages', mock.AsyncMock() + ) mem_file = tmpdir.join("memory_file.txt") mem_file.write('total_rss 1234') @@ -60,7 +60,7 @@ async def async_request(self, *a, **k): memory_allocated_path=mem_allocated_file, cgroup_version=1) with pytest.raises(SystemExit) as excinfo: - stop_profiler(process_object, 1) + await stop_profiler(process_object, 1) assert excinfo.value.code == 0 @@ -347,13 +347,23 @@ async def test_main(mocker, options): mock_get_cgroup_paths = mocker.patch( "cylc.flow.scripts.profiler.get_cgroup_paths" ) - mock_signal = mocker.patch("cylc.flow.scripts.profiler.signal.signal") + # Mock the loop and its add_signal_handler method + mock_loop = mocker.patch('asyncio.get_running_loop').return_value mock_profile = mocker.patch("cylc.flow.scripts.profiler.profile") + mock_watch_and_kill = mocker.patch( + "cylc.flow.scripts.profiler.watch_and_kill" + ) mock_get_cgroup_paths.return_value = mocker.Mock() - await _main(options) + # Mock asyncio.gather to raise an exception to break the loop + mocker.patch( + "asyncio.gather", side_effect=asyncio.CancelledError + ) + with pytest.raises(asyncio.CancelledError): + await _main(options) mock_get_cgroup_paths.assert_called_once_with("/fake/path") - assert mock_signal.call_count == 3 + assert mock_loop.add_signal_handler.call_count == 3 mock_profile.assert_called_once() + mock_watch_and_kill.assert_called_once() From 9956de76b48a809f62b4cf7784004ebec3df044c Mon Sep 17 00:00:00 2001 From: Christopher Bennett Date: Mon, 9 Mar 2026 08:39:32 +0000 Subject: [PATCH 19/47] Update changes.d/6663.feat.md Co-authored-by: Oliver Sanders --- changes.d/6663.feat.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes.d/6663.feat.md b/changes.d/6663.feat.md index 9a40cc43d59..affb22fc75a 100644 --- a/changes.d/6663.feat.md +++ b/changes.d/6663.feat.md @@ -1 +1 @@ -Adding CPU time and Max RSS to Analysis Tools +Added a Cylc job profiler which captures CPU and memory information from job runners which use cGroups. This information can be reviewed in the Analysis view in the GUI. From 5790c386c0c6c1a30b4357d119d782e769ebe39c Mon Sep 17 00:00:00 2001 From: Christopher Bennett Date: Mon, 9 Mar 2026 08:41:10 +0000 Subject: [PATCH 20/47] Update cylc/flow/cfgspec/globalcfg.py Co-authored-by: Oliver Sanders --- cylc/flow/cfgspec/globalcfg.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 1565f986d87..24e1338f474 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1478,21 +1478,31 @@ def default_for( .. versionadded:: 8.0.0 ''') - with Conf('profile'): - Conf('activate', VDR.V_BOOLEAN, False, desc=''' - A Boolean that sets if the cylc profiler will be used + with Conf('profile', desc=''' + Configure the Cylc job profiler. - .. versionadded:: 8.0.0 + This tool can capture CPU and memory information from + job runners which use cGroups such as PBS and Slurm. + + .. versionadded:: 8.7.0 + '''): + Conf('activate', VDR.V_BOOLEAN, False, desc=''' + Enable the Cylc profiler for this platform. ''') Conf('cgroups path', VDR.V_STRING, default='/sys/fs/cgroup', desc=''' - The path to the cgroups filesystem. The default value - (/sys/fs/cgroup) is the standard location for cgroups on - linux and should work in most circumstances''') + Configure the path to the cgroups filesystem. + + The default value (``/sys/fs/cgroup``) is the standard + location for cgroups on linux and should work in + most circumstances + ''') Conf('polling interval', VDR.V_INTEGER, default=10, desc=''' + Configure the profiler polling interval. + The interval (in seconds) at which the profiler will poll the cgroups filesystem for resource usage data. The default value of 10 seconds should be sufficient for From 5c0375f490d61a4f88638ee2db46b08a6945c93f Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 9 Mar 2026 16:29:44 +0000 Subject: [PATCH 21/47] Hopefully working asyncio --- cylc/flow/remote.py | 8 ++++---- cylc/flow/scripts/profiler.py | 10 ++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cylc/flow/remote.py b/cylc/flow/remote.py index a401d8e822e..e0fe8106601 100644 --- a/cylc/flow/remote.py +++ b/cylc/flow/remote.py @@ -105,13 +105,13 @@ async def watch_and_kill(proc, interval=None): interval = interval or float(os.getenv('CYLC_PROC_POLL_INTERVAL', 60)) while True: await asyncio.sleep(interval) - if proc.poll() is not None: + if proc.is_running() is False: break - if get_proc_ancestors() != gpa: + new_gpa = get_proc_ancestors() + if new_gpa != gpa: await asyncio.sleep(1) os.kill(proc.pid, signal.SIGTERM) - break - sleep(interval) + return True def run_cmd( diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index a480fa98ecf..1f5c5ac5096 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -26,6 +26,7 @@ import re import sys import signal +import psutil from pathlib import Path from functools import partial @@ -52,14 +53,14 @@ class Process: cgroup_version: int -async def stop_profiler(process, comms_timeout, *_args): +def stop_profiler(process, comms_timeout, *_args): """Stop the profiler and return its data to the scheduler. This function will be executed when the profiler receives a stop signal. """ profiler_data = get_profiler_data(process) - await record_messages( + record_messages( os.environ['CYLC_WORKFLOW_ID'], os.environ['CYLC_TASK_JOB'], [['DEBUG', f'_cylc_profiler: {json.dumps(profiler_data)}']], @@ -247,6 +248,8 @@ async def _main(options) -> None: process = get_cgroup_paths(options.cgroup_location) # Register the stop_profiler function with the signal library + # The signal library doesn't work with asyncio, so we have to use the + # loop's add_signal_handler function instead loop = asyncio.get_running_loop() for sig in (signal.SIGINT, signal.SIGHUP, signal.SIGTERM): loop.add_signal_handler( @@ -256,7 +259,6 @@ async def _main(options) -> None: ), ) - proc = Popen(["ps", "-p", str(os.getpid())]) # nosec # the profiler will run until one of these coroutines calls `sys.exit`: await asyncio.gather( # run the profiler itself @@ -264,5 +266,5 @@ async def _main(options) -> None: # kill the profiler if its PPID changes # (i.e, if the job exits before the profiler does) - watch_and_kill(proc), + watch_and_kill(psutil.Process(os.getpid())), ) From 9ed2fc41ee850678a4170913a99bcc981f48e230 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 10 Mar 2026 09:03:21 +0000 Subject: [PATCH 22/47] Linting --- cylc/flow/remote.py | 1 - cylc/flow/scripts/profiler.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/cylc/flow/remote.py b/cylc/flow/remote.py index e0fe8106601..bdec9ce7e23 100644 --- a/cylc/flow/remote.py +++ b/cylc/flow/remote.py @@ -33,7 +33,6 @@ Popen, ) import sys -from time import sleep from typing import ( Any, Dict, diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 1f5c5ac5096..0942bac16cc 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -29,9 +29,7 @@ import psutil from pathlib import Path -from functools import partial from dataclasses import dataclass -from subprocess import Popen, PIPE from cylc.flow.exceptions import CylcProfilerError from cylc.flow.option_parsers import CylcOptionParser as COP From 3520e8c958996bd7ad204a0c00cc2b18269f722f Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 17 Mar 2026 11:26:07 +0000 Subject: [PATCH 23/47] Removing redundant asyncio --- cylc/flow/scripts/profiler.py | 5 ++- tests/unit/scripts/test_profiler.py | 50 ++++++++++++++++------------- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 0942bac16cc..30866daf9d2 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -27,6 +27,7 @@ import sys import signal import psutil +import functools from pathlib import Path from dataclasses import dataclass @@ -252,9 +253,7 @@ async def _main(options) -> None: for sig in (signal.SIGINT, signal.SIGHUP, signal.SIGTERM): loop.add_signal_handler( sig, - lambda: asyncio.create_task( - stop_profiler(process, options.comms_timeout) - ), + functools.partial(stop_profiler, process, options.comms_timeout) ) # the profiler will run until one of these coroutines calls `sys.exit`: diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 82570efc2a6..14ae54d93ba 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -37,8 +37,7 @@ ) -@pytest.mark.asyncio -async def test_stop_profiler(monkeypatch, tmpdir): +def test_stop_profiler(monkeypatch, tmpdir): monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") monkeypatch.setenv('CYLC_TASK_JOB', "test_task_job") @@ -60,7 +59,7 @@ async def test_stop_profiler(monkeypatch, tmpdir): memory_allocated_path=mem_allocated_file, cgroup_version=1) with pytest.raises(SystemExit) as excinfo: - await stop_profiler(process_object, 1) + stop_profiler(process_object, 1) assert excinfo.value.code == 0 @@ -310,7 +309,6 @@ def test_get_cgroup_paths(mocker): assert "Unable to determine cgroup version" in str(excinfo.value) -@pytest.mark.asyncio async def test_profile_data(mocker): # This test should run without error mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", @@ -333,37 +331,45 @@ async def test_profile_data(mocker): def options(mocker): opts = mocker.Mock() opts.cgroup_location = "/fake/path" + opts.cgroup_memory_path = "/another/fake/path" opts.comms_timeout = 10 opts.delay = 1 return opts -@pytest.mark.asyncio async def test_main(mocker, options): + # Mock Cylc env vars + os.environ['CYLC_WORKFLOW_ID'] = "Exit Light" + os.environ['CYLC_TASK_JOB'] = "Enter Night" + + # Mock the gets and parse functions to return something sensible + # without needing actual files + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_paths", + return_value=Process( + cgroup_memory_path="/some/place/memory.stat", + cgroup_cpu_path="/some/place/cpu.stat", + memory_allocated_path="/some/place", + cgroup_version=2, + )) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", + return_value=1234) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", + return_value=5678) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_allocated", + return_value=90) - # Speed up the test by reducing the poll interval - os.environ['CYLC_PROC_POLL_INTERVAL'] = "5" - - mock_get_cgroup_paths = mocker.patch( - "cylc.flow.scripts.profiler.get_cgroup_paths" - ) - # Mock the loop and its add_signal_handler method - mock_loop = mocker.patch('asyncio.get_running_loop').return_value + mock_signal = mocker.patch("cylc.flow.scripts.profiler.signal.signal") mock_profile = mocker.patch("cylc.flow.scripts.profiler.profile") mock_watch_and_kill = mocker.patch( "cylc.flow.scripts.profiler.watch_and_kill" ) - mock_get_cgroup_paths.return_value = mocker.Mock() + await _main(options) - # Mock asyncio.gather to raise an exception to break the loop - mocker.patch( - "asyncio.gather", side_effect=asyncio.CancelledError - ) - with pytest.raises(asyncio.CancelledError): - await _main(options) + # Make sure the 3 types of kill signal are registered. + assert mock_signal.call_count == 3 - mock_get_cgroup_paths.assert_called_once_with("/fake/path") - assert mock_loop.add_signal_handler.call_count == 3 + # Ensure the profiler and watch and kill functions are called by + # asyncio.gather mock_profile.assert_called_once() mock_watch_and_kill.assert_called_once() From 4071d8e35fe7929af1ed411402b6fda0919855c8 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 17 Mar 2026 12:03:20 +0000 Subject: [PATCH 24/47] Linting --- tests/unit/scripts/test_profiler.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 14ae54d93ba..995f8e05a19 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -16,7 +16,6 @@ # # Tests for functions contained in cylc.flow.scripts.profiler import os -import asyncio from unittest import mock import pytest @@ -345,12 +344,11 @@ async def test_main(mocker, options): # Mock the gets and parse functions to return something sensible # without needing actual files mocker.patch("cylc.flow.scripts.profiler.get_cgroup_paths", - return_value=Process( - cgroup_memory_path="/some/place/memory.stat", - cgroup_cpu_path="/some/place/cpu.stat", - memory_allocated_path="/some/place", - cgroup_version=2, - )) + return_value=Process( + cgroup_memory_path="/some/place/memory.stat", + cgroup_cpu_path="/some/place/cpu.stat", + memory_allocated_path="/some/place", + cgroup_version=2)) mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=1234) mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", From 6c3d108f1cf6c162591e281ddad9b02fda399a17 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 31 Mar 2026 15:34:04 +0100 Subject: [PATCH 25/47] Reverted back to constant polling of the memory statistic --- cylc/flow/scripts/profiler.py | 11 +++++-- tests/unit/scripts/test_profiler.py | 45 +++++++++++++++++++---------- 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 30866daf9d2..b1854d67cc7 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -47,6 +47,7 @@ class Process: """Class for representing CPU and Memory usage of a process""" cgroup_memory_path: str + max_rss: int cgroup_cpu_path: str memory_allocated_path: str cgroup_version: int @@ -78,7 +79,7 @@ def get_profiler_data(process): # the get config function doesn't have time to run max_rss = cpu_time = memory_allocated = 0 else: - max_rss = parse_memory_file(process) + max_rss = process.max_rss cpu_time = parse_cpu_file(process) memory_allocated = parse_memory_allocated(process) return { @@ -192,6 +193,7 @@ def get_cgroup_paths(location) -> Process: cgroup_name + "/" + "cpu.stat", memory_allocated_path=location + cgroup_name, cgroup_version=cgroup_version, + max_rss=0, ) elif cgroup_version == 1: @@ -202,6 +204,7 @@ def get_cgroup_paths(location) -> Process: cgroup_name + "/cpuacct.usage", memory_allocated_path="", cgroup_version=cgroup_version, + max_rss=0, ) raise Exception except Exception as err: @@ -214,8 +217,10 @@ async def profile(_process: Process, delay, keep_looping=lambda: True): # The lambda function is used to allow the loop to be stopped in unit tests while keep_looping(): - # Write cpu / memory usage data to disk - # CPU_TIME = parse_cpu_file(process.cgroup_cpu_path, version) + # Polling the cgroup for memory and keeping track of the max rss value + max_rss = parse_memory_file(_process) + if max_rss > _process.max_rss: + _process.max_rss = max_rss await asyncio.sleep(delay) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 995f8e05a19..b82a669e4fe 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -56,7 +56,8 @@ def test_stop_profiler(monkeypatch, tmpdir): cgroup_memory_path=mem_file, cgroup_cpu_path=cpu_file, memory_allocated_path=mem_allocated_file, - cgroup_version=1) + cgroup_version=1, + max_rss=0) with pytest.raises(SystemExit) as excinfo: stop_profiler(process_object, 1) @@ -70,7 +71,8 @@ def test_get_resource_usage(monkeypatch, tmpdir): cgroup_memory_path=None, cgroup_cpu_path=None, memory_allocated_path=None, - cgroup_version=1) + cgroup_version=1, + max_rss=0) assert get_profiler_data(process_object) == { 'max_rss': 0, @@ -94,17 +96,20 @@ def test_parse_memory_file(tmpdir): cgroup_memory_path=mem_file_v1, cgroup_cpu_path=cpu_file, memory_allocated_path=mem_allocated_file, - cgroup_version=1) + cgroup_version=1, + max_rss=0) good_process_object_v2 = Process( cgroup_memory_path=mem_file_v2, cgroup_cpu_path=cpu_file, memory_allocated_path=mem_allocated_file, - cgroup_version=2) + cgroup_version=2, + max_rss=0) bad_process_object = Process( cgroup_memory_path='', cgroup_cpu_path='', memory_allocated_path='', - cgroup_version=1) + cgroup_version=1, + max_rss=0) with pytest.raises(CylcProfilerError) as excinfo: parse_memory_file(bad_process_object) @@ -135,27 +140,32 @@ def test_parse_cpu_file(tmpdir): cgroup_memory_path=mem_file, cgroup_cpu_path=cpu_file_v1_good, memory_allocated_path=mem_allocated_file, - cgroup_version=1) + cgroup_version=1, + max_rss=0) good_process_object_v2 = Process( cgroup_memory_path=mem_file, cgroup_cpu_path=cpu_file_v2_good, memory_allocated_path=mem_allocated_file, - cgroup_version=2) + cgroup_version=2, + max_rss=0) bad_process_object_v1_1 = Process( cgroup_memory_path='', cgroup_cpu_path='', memory_allocated_path='', - cgroup_version=1) + cgroup_version=1, + max_rss = 0) bad_process_object_v1_2 = Process( cgroup_memory_path=mem_file, cgroup_cpu_path=cpu_file_v1_bad, memory_allocated_path=mem_allocated_file, - cgroup_version=1) + cgroup_version=1, + max_rss=0) bad_process_object_v2 = Process( cgroup_memory_path=mem_file, cgroup_cpu_path=cpu_file_v2_bad, memory_allocated_path=mem_allocated_file, - cgroup_version=2) + cgroup_version=2, + max_rss=0) assert parse_cpu_file(good_process_object_v1) == 1234 assert parse_cpu_file(good_process_object_v2) == 1234567 @@ -193,19 +203,22 @@ def test_parse_memory_allocated(tmp_path_factory): cgroup_memory_path='', cgroup_cpu_path='', memory_allocated_path=str(good_mem_dir), - cgroup_version=1) + cgroup_version=1, + max_rss=0) good_process_object_v2 = Process( cgroup_memory_path='', cgroup_cpu_path='', memory_allocated_path=str(good_mem_dir), - cgroup_version=2) + cgroup_version=2, + max_rss=0) bad_process_object_v2_1 = Process( cgroup_memory_path='', cgroup_cpu_path='', memory_allocated_path='/', - cgroup_version=2) + cgroup_version=2, + max_rss=0) assert parse_memory_allocated(good_process_object_v1) == 0 assert parse_memory_allocated(good_process_object_v2) == 99999 @@ -244,7 +257,8 @@ def test_parse_memory_allocated(tmp_path_factory): cgroup_memory_path='', cgroup_cpu_path='', memory_allocated_path=str(dir_5), - cgroup_version=2) + cgroup_version=2, + max_rss=0) assert parse_memory_allocated(bad_process_object_v2_2) == 0 @@ -348,7 +362,8 @@ async def test_main(mocker, options): cgroup_memory_path="/some/place/memory.stat", cgroup_cpu_path="/some/place/cpu.stat", memory_allocated_path="/some/place", - cgroup_version=2)) + cgroup_version=2, + max_rss=0,)) mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=1234) mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", From e42cc583c955f8618371cf41691680d6eee68c09 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 31 Mar 2026 15:51:21 +0100 Subject: [PATCH 26/47] Linting --- tests/unit/scripts/test_profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index b82a669e4fe..fc1f0c81ddf 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -153,7 +153,7 @@ def test_parse_cpu_file(tmpdir): cgroup_cpu_path='', memory_allocated_path='', cgroup_version=1, - max_rss = 0) + max_rss=0) bad_process_object_v1_2 = Process( cgroup_memory_path=mem_file, cgroup_cpu_path=cpu_file_v1_bad, From c2a40999b7963af94d97d22ee7dd8a47e01c5483 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 31 Mar 2026 16:24:55 +0100 Subject: [PATCH 27/47] Code review changes --- cylc/flow/remote.py | 9 ++++- cylc/flow/scripts/cat_log.py | 21 ++++++----- cylc/flow/scripts/cylc.py | 2 +- cylc/flow/scripts/message.py | 11 ++++-- cylc/flow/scripts/profiler.py | 35 ++++++++++++------- cylc/flow/task_message.py | 9 ++--- .../functional/cylc-cat-log/12-delete-kill.t | 4 ++- tests/integration/scripts/test_cat_log.py | 27 ++++++++------ tests/unit/scripts/test_profiler.py | 7 ++-- tests/unit/test_task_message.py | 4 +-- 10 files changed, 81 insertions(+), 48 deletions(-) diff --git a/cylc/flow/remote.py b/cylc/flow/remote.py index bdec9ce7e23..4c36905a771 100644 --- a/cylc/flow/remote.py +++ b/cylc/flow/remote.py @@ -34,6 +34,7 @@ ) import sys from typing import ( + TYPE_CHECKING, Any, Dict, List, @@ -56,6 +57,9 @@ ) from cylc.flow.util import format_cmd +if TYPE_CHECKING: + import psutil + def get_proc_ancestors(): """Return list of parent PIDs back to init.""" @@ -76,7 +80,10 @@ def get_proc_ancestors(): pid = ppid -async def watch_and_kill(proc, interval=None): +async def watch_and_kill( + proc: 'psutil.Process', + interval: float | None = None, +): """Watch a process and kill it if any of its parent processes change. Processes exist in a tree which inherits from the process with PID 1. diff --git a/cylc/flow/scripts/cat_log.py b/cylc/flow/scripts/cat_log.py index 53650e8f606..9e948279cde 100755 --- a/cylc/flow/scripts/cat_log.py +++ b/cylc/flow/scripts/cat_log.py @@ -61,6 +61,7 @@ $ cylc cat-log foo//2020/bar -m f """ +import asyncio import os from contextlib import suppress from glob import glob @@ -70,10 +71,12 @@ import sys from typing import TYPE_CHECKING +from psutil import Process + from cylc.flow.exceptions import InputError import cylc.flow.flags from cylc.flow.hostuserutil import is_remote_platform -from cylc.flow.id_cli import parse_id +from cylc.flow.id_cli import parse_id_async from cylc.flow.log_level import verbosity_to_opts from cylc.flow.option_parsers import ( ID_MULTI_ARG_DOC, @@ -244,7 +247,7 @@ def _check_fs_path(path): ) -def view_log( +async def view_log( logpath, mode, tailer_tmpl, @@ -308,7 +311,7 @@ def view_log( proc = Popen(shlex.split(cmd), stdin=DEVNULL) # nosec # * batchview command is user configurable with suppress(KeyboardInterrupt): - watch_and_kill(proc) + await watch_and_kill(Process(proc.pid)) return proc.wait() @@ -414,10 +417,10 @@ def main( ): """Wrapper around the main script for simpler testing. """ - _main(parser, options, *ids, color=color) + asyncio.run(_main(parser, options, *ids, color=color)) -def _main( +async def _main( parser: COP, options: 'Values', *ids, @@ -445,7 +448,7 @@ def _main( batchview_cmd = options.remote_args[3] except IndexError: batchview_cmd = None - res = view_log( + res = await view_log( logpath, mode, tail_tmpl, @@ -458,7 +461,7 @@ def _main( sys.exit(res) return - workflow_id, tokens, _ = parse_id(*ids, constraint='mixed') + workflow_id, tokens, _ = await parse_id_async(*ids, constraint='mixed') # Get long-format mode. try: @@ -522,7 +525,7 @@ def _main( tail_tmpl = os.path.expandvars( get_platform()["tail command template"] ) - out = view_log( + out = await view_log( log_file_path, mode, tail_tmpl, @@ -679,7 +682,7 @@ def _main( # Local task job or local job log. logpath = os.path.join(local_log_dir, options.filename) tail_tmpl = os.path.expandvars(platform["tail command template"]) - out = view_log( + out = await view_log( logpath, mode, tail_tmpl, diff --git a/cylc/flow/scripts/cylc.py b/cylc/flow/scripts/cylc.py index 48ff21e2bd5..bd2096b2083 100644 --- a/cylc/flow/scripts/cylc.py +++ b/cylc/flow/scripts/cylc.py @@ -653,7 +653,7 @@ def main() -> None: # pragma: no cover def _main(opts, cmd_args) -> int: - """Implemnent the Cylc CLI. + """Implement the Cylc CLI. Returns the exit code as an integer. """ diff --git a/cylc/flow/scripts/message.py b/cylc/flow/scripts/message.py index 03a4eed3bf8..fe601de9c16 100755 --- a/cylc/flow/scripts/message.py +++ b/cylc/flow/scripts/message.py @@ -93,12 +93,13 @@ """ +import asyncio from logging import getLevelName, INFO import os import sys from typing import TYPE_CHECKING -from cylc.flow.id_cli import parse_id +from cylc.flow.id_cli import parse_id_async from cylc.flow.option_parsers import ( WORKFLOW_ID_ARG_DOC, CylcOptionParser as COP @@ -142,6 +143,10 @@ def main(parser: COP, options: 'Values', *args: str) -> None: if not args: parser.error('No message supplied') return + asyncio.run(_main(options, args)) + + +async def _main(options, args): if len(args) <= 2: # BACK COMPAT: args <= 2 # from: @@ -160,7 +165,7 @@ def main(parser: COP, options: 'Values', *args: str) -> None: message_strs = list(args) else: workflow_id, job_id, *message_strs = args - workflow_id, *_ = parse_id( + workflow_id, *_ = await parse_id_async( workflow_id, constraint='workflows', ) @@ -198,4 +203,4 @@ def main(parser: COP, options: 'Values', *args: str) -> None: messages.append([options.severity, message_str.strip()]) else: messages.append([getLevelName(INFO), message_str.strip()]) - record_messages(workflow_id, job_id, messages) + await record_messages(workflow_id, job_id, messages) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index b1854d67cc7..c9927fd3748 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -21,13 +21,12 @@ """ import asyncio +from contextlib import suppress import json import os import re -import sys import signal import psutil -import functools from pathlib import Path from dataclasses import dataclass @@ -53,20 +52,25 @@ class Process: cgroup_version: int -def stop_profiler(process, comms_timeout, *_args): +async def stop_profiler(process, comms_timeout, tasks, *_args): """Stop the profiler and return its data to the scheduler. This function will be executed when the profiler receives a stop signal. """ + # stop the profiler + for task in tasks: + task.cancel() + + # extract the stats profiler_data = get_profiler_data(process) - record_messages( + # send a task message to the scheduler / write message to job.status file + await record_messages( os.environ['CYLC_WORKFLOW_ID'], os.environ['CYLC_TASK_JOB'], [['DEBUG', f'_cylc_profiler: {json.dumps(profiler_data)}']], comms_timeout=comms_timeout, ) - sys.exit(0) def get_profiler_data(process): @@ -219,7 +223,7 @@ async def profile(_process: Process, delay, keep_looping=lambda: True): while keep_looping(): # Polling the cgroup for memory and keeping track of the max rss value max_rss = parse_memory_file(_process) - if max_rss > _process.max_rss: + if max_rss is not None and max_rss > _process.max_rss: _process.max_rss = max_rss await asyncio.sleep(delay) @@ -244,13 +248,17 @@ def get_option_parser() -> COP: @cli_function(get_option_parser) def main(_parser: COP, options) -> None: """CLI main.""" - asyncio.run(_main(options)) + with suppress(SystemExit, asyncio.exceptions.CancelledError, Exception): + asyncio.run(_main(options)) async def _main(options) -> None: # get cgroup information process = get_cgroup_paths(options.cgroup_location) + # list of asyncio tasks + tasks = [] + # Register the stop_profiler function with the signal library # The signal library doesn't work with asyncio, so we have to use the # loop's add_signal_handler function instead @@ -258,15 +266,18 @@ async def _main(options) -> None: for sig in (signal.SIGINT, signal.SIGHUP, signal.SIGTERM): loop.add_signal_handler( sig, - functools.partial(stop_profiler, process, options.comms_timeout) + lambda: asyncio.create_task( + stop_profiler(process, options.comms_timeout, tasks) + ), ) # the profiler will run until one of these coroutines calls `sys.exit`: - await asyncio.gather( + tasks.extend([ # run the profiler itself - profile(process, options.delay), + asyncio.create_task(profile(process, options.delay)), # kill the profiler if its PPID changes # (i.e, if the job exits before the profiler does) - watch_and_kill(psutil.Process(os.getpid())), - ) + asyncio.create_task(watch_and_kill(psutil.Process(os.getpid()))), + ]) + await asyncio.gather(*tasks) diff --git a/cylc/flow/task_message.py b/cylc/flow/task_message.py index 5d287c5a530..85067ca27d5 100644 --- a/cylc/flow/task_message.py +++ b/cylc/flow/task_message.py @@ -92,7 +92,7 @@ def split_run_signal(message: str) -> tuple[str, str | None]: return prefix, signal[0] if signal else None -def record_messages( +async def record_messages( workflow: str, job_id: str, messages: List[list], @@ -115,7 +115,8 @@ def record_messages( override_use_utc=(os.getenv('CYLC_UTC') == 'True')) write_messages(workflow, job_id, messages, event_time) if get_comms_method() != CommsMeth.POLL: - send_messages(workflow, job_id, messages, event_time, comms_timeout) + await send_messages(workflow, job_id, messages, + event_time, comms_timeout) def write_messages(workflow, job_id, messages, event_time): @@ -131,7 +132,7 @@ def write_messages(workflow, job_id, messages, event_time): _append_job_status_file(workflow, job_id, event_time, messages) -def send_messages( +async def send_messages( workflow: str, job_id: str, messages: List[list], @@ -163,7 +164,7 @@ def send_messages( 'messages': messages, } } - pclient('graphql', mutation_kwargs) + await pclient.async_request('graphql', mutation_kwargs) def _append_job_status_file(workflow, job_id, event_time, messages): diff --git a/tests/functional/cylc-cat-log/12-delete-kill.t b/tests/functional/cylc-cat-log/12-delete-kill.t index c642d4995ec..935194ccf0d 100644 --- a/tests/functional/cylc-cat-log/12-delete-kill.t +++ b/tests/functional/cylc-cat-log/12-delete-kill.t @@ -19,7 +19,7 @@ # or when tail is killed. . "$(dirname "$0")/test_header" -set_test_number 2 +set_test_number 1 # Get PID of tail cmd given the parent cat-log PPID get_tail_pid() { @@ -33,6 +33,8 @@ __EOF__ log_file="${WORKFLOW_RUN_DIR}/log/foo.log" echo "Hello, Mr. Thompson" > "$log_file" +export CYLC_PROC_POLL_INTERVAL=0.5 + TEST_NAME="${TEST_NAME_BASE}-delete" cylc cat-log --mode=tail "$WORKFLOW_NAME" -f foo.log 2>&1 & cat_log_pid="$!" diff --git a/tests/integration/scripts/test_cat_log.py b/tests/integration/scripts/test_cat_log.py index bb92ad5965e..36d29163b2f 100644 --- a/tests/integration/scripts/test_cat_log.py +++ b/tests/integration/scripts/test_cat_log.py @@ -39,16 +39,18 @@ def brokendir(run_dir): shutil.rmtree(brokendir) -def test_fail_no_file(flow): +@pytest.mark.asyncio +async def test_fail_no_file(flow): """It produces a helpful error if there is no workflow log file. """ parser = cat_log_gop() id_ = flow({}) with pytest.raises(InputError, match='Log file not found.'): - cat_log(parser, Options(parser)(), id_) + await cat_log(parser, Options(parser)(), id_) -def test_fail_rotation_out_of_range(flow): +@pytest.mark.asyncio +async def test_fail_rotation_out_of_range(flow): """It produces a helpful error if rotation number > number of log files. """ parser = cat_log_gop() @@ -60,15 +62,16 @@ def test_fail_rotation_out_of_range(flow): (logpath / '01-start-01.log').touch() with pytest.raises(SystemExit): - cat_log(parser, Options(parser)(rotation_num=0), id_) + await cat_log(parser, Options(parser)(rotation_num=0), id_) msg = r'--rotation 1 invalid \(max value is 0\)' with pytest.raises(InputError, match=msg): - cat_log(parser, Options(parser)(rotation_num=1), id_) + await cat_log(parser, Options(parser)(rotation_num=1), id_) -def test_bad_workflow(run_dir): +@pytest.mark.asyncio +async def test_bad_workflow(run_dir): """Test "cylc cat-log" with bad workflow name.""" parser = cat_log_gop() msg = re.compile( @@ -77,15 +80,16 @@ def test_bad_workflow(run_dir): re.MULTILINE ) with pytest.raises(InputError, match=msg): - cat_log(parser, Options(parser)(filename='l'), BAD_NAME) + await cat_log(parser, Options(parser)(filename='l'), BAD_NAME) -def test_bad_workflow2(run_dir, brokendir, capsys): +@pytest.mark.asyncio +async def test_bad_workflow2(run_dir, brokendir, capsys): """Check a non existent file in a valid workflow results in error. """ parser = cat_log_gop() with pytest.raises(SystemExit, match='1'): - cat_log( + await cat_log( parser, Options(parser)(filename='j'), BAD_NAME @@ -96,12 +100,13 @@ def test_bad_workflow2(run_dir, brokendir, capsys): assert capsys.readouterr().err == msg -def test_bad_task_dir(run_dir, brokendir, capsys): +@pytest.mark.asyncio +async def test_bad_task_dir(run_dir, brokendir, capsys): """Check a non-existent job log dir in a valid workflow results in error. """ parser = cat_log_gop() with pytest.raises(SystemExit, match='1'): - cat_log( + await cat_log( parser, Options(parser)(mode='list-dir'), BAD_NAME + "//1/foo" diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index fc1f0c81ddf..12bf717db47 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -36,7 +36,8 @@ ) -def test_stop_profiler(monkeypatch, tmpdir): +@pytest.mark.asyncio +async def test_stop_profiler(monkeypatch, tmpdir): monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") monkeypatch.setenv('CYLC_TASK_JOB', "test_task_job") @@ -58,10 +59,8 @@ def test_stop_profiler(monkeypatch, tmpdir): memory_allocated_path=mem_allocated_file, cgroup_version=1, max_rss=0) - with pytest.raises(SystemExit) as excinfo: - stop_profiler(process_object, 1) - assert excinfo.value.code == 0 + await stop_profiler(process_object, 1, []) def test_get_resource_usage(monkeypatch, tmpdir): diff --git a/tests/unit/test_task_message.py b/tests/unit/test_task_message.py index 6894631a1ac..1a6e50e222d 100644 --- a/tests/unit/test_task_message.py +++ b/tests/unit/test_task_message.py @@ -21,7 +21,7 @@ from cylc.flow.task_message import send_messages -def test_send_messages_err( +async def test_send_messages_err( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture ): """If an error occurs while initializing the client, it should be printed. @@ -32,7 +32,7 @@ def mock_get_client(*a, **k): raise gaierror(-2, exc_msg) monkeypatch.setattr('cylc.flow.task_message.get_client', mock_get_client) - send_messages( + await send_messages( 'arasaka', '1/v/01', [['INFO', 'silverhand']], '2077-01-01T00:00:00Z' ) assert f"gaierror: [Errno -2] {exc_msg}" in capsys.readouterr().err From a79a3ceace0c4250fb26723b9a5818c8111164b5 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 8 Apr 2026 13:38:30 +0100 Subject: [PATCH 28/47] Fix unit tests --- cylc/flow/scripts/cat_log.py | 9 ++++++--- tests/functional/cylc-cat-log/12-delete-kill.t | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cylc/flow/scripts/cat_log.py b/cylc/flow/scripts/cat_log.py index 9e948279cde..d2a41e7900c 100755 --- a/cylc/flow/scripts/cat_log.py +++ b/cylc/flow/scripts/cat_log.py @@ -310,9 +310,12 @@ async def view_log( cmd = tailer_tmpl % {"filename": shlex.quote(str(logpath))} proc = Popen(shlex.split(cmd), stdin=DEVNULL) # nosec # * batchview command is user configurable - with suppress(KeyboardInterrupt): - await watch_and_kill(Process(proc.pid)) - return proc.wait() + watcher = asyncio.create_task(watch_and_kill(Process(proc.pid))) + try: + ret = proc.wait() + finally: + watcher.cancel() + return ret def get_option_parser() -> COP: diff --git a/tests/functional/cylc-cat-log/12-delete-kill.t b/tests/functional/cylc-cat-log/12-delete-kill.t index 935194ccf0d..d52d92c9e38 100644 --- a/tests/functional/cylc-cat-log/12-delete-kill.t +++ b/tests/functional/cylc-cat-log/12-delete-kill.t @@ -19,7 +19,7 @@ # or when tail is killed. . "$(dirname "$0")/test_header" -set_test_number 1 +set_test_number 2 # Get PID of tail cmd given the parent cat-log PPID get_tail_pid() { @@ -33,7 +33,7 @@ __EOF__ log_file="${WORKFLOW_RUN_DIR}/log/foo.log" echo "Hello, Mr. Thompson" > "$log_file" -export CYLC_PROC_POLL_INTERVAL=0.5 + TEST_NAME="${TEST_NAME_BASE}-delete" cylc cat-log --mode=tail "$WORKFLOW_NAME" -f foo.log 2>&1 & From 9e8f426b79054b27062b2b97831b8b1859ff0b6f Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 8 Apr 2026 13:48:26 +0100 Subject: [PATCH 29/47] Type hinting --- cylc/flow/scripts/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index c9927fd3748..59ced21739a 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -257,7 +257,7 @@ async def _main(options) -> None: process = get_cgroup_paths(options.cgroup_location) # list of asyncio tasks - tasks = [] + tasks: list[asyncio.Task] = [] # Register the stop_profiler function with the signal library # The signal library doesn't work with asyncio, so we have to use the From d6557158d9e31531929cb7385801438a450f4a9a Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 13 Apr 2026 11:16:11 +0100 Subject: [PATCH 30/47] Code Review changes --- tests/functional/cylc-cat-log/12-delete-kill.t | 2 ++ tests/unit/scripts/test_profiler.py | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/functional/cylc-cat-log/12-delete-kill.t b/tests/functional/cylc-cat-log/12-delete-kill.t index d52d92c9e38..8aef45ea143 100644 --- a/tests/functional/cylc-cat-log/12-delete-kill.t +++ b/tests/functional/cylc-cat-log/12-delete-kill.t @@ -35,6 +35,8 @@ echo "Hello, Mr. Thompson" > "$log_file" +export CYLC_PROC_POLL_INTERVAL=0.5 + TEST_NAME="${TEST_NAME_BASE}-delete" cylc cat-log --mode=tail "$WORKFLOW_NAME" -f foo.log 2>&1 & cat_log_pid="$!" diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 12bf717db47..eb2ca19b0fc 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -38,6 +38,8 @@ @pytest.mark.asyncio async def test_stop_profiler(monkeypatch, tmpdir): + """It should stop the profiler and send the data to the scheduler when + it receives a stop signal.""" monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") monkeypatch.setenv('CYLC_TASK_JOB', "test_task_job") @@ -64,6 +66,7 @@ async def test_stop_profiler(monkeypatch, tmpdir): def test_get_resource_usage(monkeypatch, tmpdir): + """It should return the resource usage data of the process.""" monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") process_object = Process( @@ -81,7 +84,7 @@ def test_get_resource_usage(monkeypatch, tmpdir): def test_parse_memory_file(tmpdir): - + """It should return the memory usage of the process.""" mem_file_v1 = tmpdir.join("memory_file_v1.txt") mem_file_v1.write('total_rss=1024') mem_file_v2 = tmpdir.join("memory_file_v2.txt") @@ -120,7 +123,7 @@ def test_parse_memory_file(tmpdir): def test_parse_cpu_file(tmpdir): - + """It should return the cpu usage of the process.""" mem_file = tmpdir.join("memory_file.txt") mem_file.write('1024') cpu_file_v1_good = tmpdir.join("cpu_file_v1_good.txt") @@ -181,7 +184,7 @@ def test_parse_cpu_file(tmpdir): def test_get_cgroup_name(mocker): - + """It should return the cgroup name of the process.""" mock_file = mocker.mock_open(read_data="0::bad/test/cgroup/place") mocker.patch("builtins.open", mock_file) with pytest.raises(CylcProfilerError): @@ -193,6 +196,7 @@ def test_get_cgroup_name(mocker): def test_parse_memory_allocated(tmp_path_factory): + """It should return the memory allocated to the process.""" good_mem_dir = tmp_path_factory.mktemp("mem_dir") mem_allocated_file = good_mem_dir / "memory.max" mem_allocated_file.write_text('99999') @@ -263,7 +267,7 @@ def test_parse_memory_allocated(tmp_path_factory): def test_get_cgroup_name_file_not_found(mocker): - + """It should raise an error if the cgroup file is not found.""" def mock_os_pid(): return 'The Thing That Should Not Be' @@ -274,7 +278,7 @@ def mock_os_pid(): def test_get_cgroup_version(mocker): - + """It should return the cgroup version of the process.""" # Mock the Path.exists function call to return True mocker.patch("pathlib.Path.exists", return_value=True) assert get_cgroup_version('stuff/in/place', @@ -293,6 +297,7 @@ def test_get_cgroup_version(mocker): def test_get_cgroup_paths(mocker): + """It should return the cgroup paths of the process.""" mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", return_value='test_name') mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", @@ -322,7 +327,7 @@ def test_get_cgroup_paths(mocker): async def test_profile_data(mocker): - # This test should run without error + """""" mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", return_value='test_name') mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", @@ -350,6 +355,7 @@ def options(mocker): async def test_main(mocker, options): + """It should run the profiler and watch and kill functions concurrently.""" # Mock Cylc env vars os.environ['CYLC_WORKFLOW_ID'] = "Exit Light" os.environ['CYLC_TASK_JOB'] = "Enter Night" From 8b5d59252a0a24ff93c97a4959402a2a114b5f43 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 13 Apr 2026 13:07:23 +0100 Subject: [PATCH 31/47] Code Review changes --- tests/functional/jobscript/04-profiler.t | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/functional/jobscript/04-profiler.t b/tests/functional/jobscript/04-profiler.t index f45592f2db0..93e2697841e 100644 --- a/tests/functional/jobscript/04-profiler.t +++ b/tests/functional/jobscript/04-profiler.t @@ -16,18 +16,16 @@ # along with this program. If not, see . #------------------------------------------------------------------------------- # Cylc profile test -# NOTE: This test will run the Cylc profiler on the given test platform. -# The test platform may need to be configured for this to work (e.g. -# "cgroups path" may need to be set). . "$(dirname "$0")/test_header" if [[ "$OSTYPE" != "linux-gnu"* ]]; then - skip_all "Tests not compatibile with $OSTYPE" + skip_all "Tests not compatible with $OSTYPE" fi set_test_number 7 +# Set up test data mkdir -p "${PWD}/cgroups_test_data" echo 'anon 12345678' > cgroups_test_data/memory.stat From 8aadd13d1bf3e68e52a14636dff1bb6c609d08e6 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 13 Apr 2026 14:35:37 +0100 Subject: [PATCH 32/47] Code Review changes --- tests/integration/scripts/test_cat_log.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/integration/scripts/test_cat_log.py b/tests/integration/scripts/test_cat_log.py index 36d29163b2f..8566cce4a51 100644 --- a/tests/integration/scripts/test_cat_log.py +++ b/tests/integration/scripts/test_cat_log.py @@ -39,7 +39,6 @@ def brokendir(run_dir): shutil.rmtree(brokendir) -@pytest.mark.asyncio async def test_fail_no_file(flow): """It produces a helpful error if there is no workflow log file. """ @@ -49,7 +48,6 @@ async def test_fail_no_file(flow): await cat_log(parser, Options(parser)(), id_) -@pytest.mark.asyncio async def test_fail_rotation_out_of_range(flow): """It produces a helpful error if rotation number > number of log files. """ @@ -70,7 +68,6 @@ async def test_fail_rotation_out_of_range(flow): await cat_log(parser, Options(parser)(rotation_num=1), id_) -@pytest.mark.asyncio async def test_bad_workflow(run_dir): """Test "cylc cat-log" with bad workflow name.""" parser = cat_log_gop() @@ -83,9 +80,8 @@ async def test_bad_workflow(run_dir): await cat_log(parser, Options(parser)(filename='l'), BAD_NAME) -@pytest.mark.asyncio async def test_bad_workflow2(run_dir, brokendir, capsys): - """Check a non existent file in a valid workflow results in error. + """Check a non-existent file in a valid workflow results in error. """ parser = cat_log_gop() with pytest.raises(SystemExit, match='1'): @@ -100,7 +96,6 @@ async def test_bad_workflow2(run_dir, brokendir, capsys): assert capsys.readouterr().err == msg -@pytest.mark.asyncio async def test_bad_task_dir(run_dir, brokendir, capsys): """Check a non-existent job log dir in a valid workflow results in error. """ From 712acad210794e23c23239bbf34203e256c87ad8 Mon Sep 17 00:00:00 2001 From: Christopher Bennett Date: Mon, 20 Apr 2026 14:22:31 +0100 Subject: [PATCH 33/47] Apply suggestions from code review Co-authored-by: Oliver Sanders --- tests/unit/scripts/test_profiler.py | 47 ++++++++++++++++++----------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index eb2ca19b0fc..c47a706b881 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -36,22 +36,23 @@ ) -@pytest.mark.asyncio async def test_stop_profiler(monkeypatch, tmpdir): - """It should stop the profiler and send the data to the scheduler when - it receives a stop signal.""" + """It should capture and record data profiler is stopped.""" monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") monkeypatch.setenv('CYLC_TASK_JOB', "test_task_job") - # Mock the async record_messages function with AsyncMock + # capture record_messages calls + record_messages = mock.AsyncMock() monkeypatch.setattr( - 'cylc.flow.scripts.profiler.record_messages', mock.AsyncMock() + 'cylc.flow.scripts.profiler.record_messages', record_messages ) + # mock the cGroup filesystem mem_file = tmpdir.join("memory_file.txt") - mem_file.write('total_rss 1234') + mem_file.write('total_rss=1234') cpu_file = tmpdir.join("cpu_file.txt") - cpu_file.write('5678') + cpu_file.write('5678000') + # NOTE: memory allocated is not available for cGroups v1 mem_allocated_file = tmpdir.join("memory_allocated.txt") mem_allocated_file.write('99999') @@ -60,15 +61,31 @@ async def test_stop_profiler(monkeypatch, tmpdir): cgroup_cpu_path=cpu_file, memory_allocated_path=mem_allocated_file, cgroup_version=1, - max_rss=0) + max_rss=42, + ) + # tell the profiler to stop await stop_profiler(process_object, 1, []) - -def test_get_resource_usage(monkeypatch, tmpdir): - """It should return the resource usage data of the process.""" - monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") - + # the profiler should record a "cylc message" with the profiler data + assert record_messages.call_args_list == [ + mock.call( + 'test_value', + 'test_task_job', + [ + [ + 'DEBUG', + '_cylc_profiler:' + ' {"max_rss": 42, "cpu_time": 5, "memory_allocated": 0}', + ] + ], + comms_timeout=1, + ) + ] + + +def test_get_resource_usage(): + """It should return 0 if cGroup information is not provided.""" process_object = Process( cgroup_memory_path=None, cgroup_cpu_path=None, @@ -306,8 +323,6 @@ def test_get_cgroup_paths(mocker): assert process.cgroup_memory_path == "test_location/test_name/memory.stat" assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" - mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", - return_value='test_name') mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", return_value=1) @@ -317,8 +332,6 @@ def test_get_cgroup_paths(mocker): assert (process.cgroup_cpu_path == "test_location/cpu/test_name/cpuacct.usage") - mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", - return_value='test_name') mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", return_value=3) with pytest.raises(CylcProfilerError) as excinfo: From 7ba490752f1186d5e8cbf921d08055693d0a3fb3 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 20 Apr 2026 15:34:46 +0100 Subject: [PATCH 34/47] Code review changes --- cylc/flow/scripts/profiler.py | 4 ++-- tests/unit/scripts/test_profiler.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 59ced21739a..51fdb73164d 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -134,14 +134,14 @@ def parse_cpu_file(process: Process) -> int: with open(process.cgroup_cpu_path, 'r') as f: for line in f: if "usage_usec" in line: - return int(RE_INT.findall(line)[0]) // 1000 + return round(int(RE_INT.findall(line)[0]) / 1000) raise FileNotFoundError(process.cgroup_cpu_path) elif process.cgroup_version == 1: with open(process.cgroup_cpu_path, 'r') as f: for line in f: # Cgroups v1 uses nanoseconds - return int(line) // 1000000 + return round(int(line) / 1000000) raise FileNotFoundError(process.cgroup_cpu_path) except Exception as err: diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index c47a706b881..e1eabf8ced0 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -76,7 +76,7 @@ async def test_stop_profiler(monkeypatch, tmpdir): [ 'DEBUG', '_cylc_profiler:' - ' {"max_rss": 42, "cpu_time": 5, "memory_allocated": 0}', + ' {"max_rss": 42, "cpu_time": 6, "memory_allocated": 0}', ] ], comms_timeout=1, @@ -186,8 +186,8 @@ def test_parse_cpu_file(tmpdir): cgroup_version=2, max_rss=0) - assert parse_cpu_file(good_process_object_v1) == 1234 - assert parse_cpu_file(good_process_object_v2) == 1234567 + assert parse_cpu_file(good_process_object_v1) == 1235 + assert parse_cpu_file(good_process_object_v2) == 1234568 with pytest.raises(CylcProfilerError) as excinfo: parse_cpu_file(bad_process_object_v1_1) @@ -280,8 +280,15 @@ def test_parse_memory_allocated(tmp_path_factory): cgroup_version=2, max_rss=0) + # The function should return 0 if it cannot find a memory.max file with + # a value assert parse_memory_allocated(bad_process_object_v2_2) == 0 + # Add a memory.max file with a value to the top level directory + # and check it is read + mem_file_1.write_text("99999") + assert parse_memory_allocated(bad_process_object_v2_2) == 99999 + def test_get_cgroup_name_file_not_found(mocker): """It should raise an error if the cgroup file is not found.""" @@ -340,7 +347,8 @@ def test_get_cgroup_paths(mocker): async def test_profile_data(mocker): - """""" + """Test the profile function with mocked data to ensure it calls the parse + functions and handles the data correctly.""" mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", return_value='test_name') mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", From c50db0e87236fea44ab671bd3c6803d09895bc94 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 27 Apr 2026 16:18:48 +0100 Subject: [PATCH 35/47] Linting --- cylc/flow/remote.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cylc/flow/remote.py b/cylc/flow/remote.py index 1a2d6d3b7ed..0cc112066d3 100644 --- a/cylc/flow/remote.py +++ b/cylc/flow/remote.py @@ -40,8 +40,6 @@ overload, ) -import psutil - from cylc.flow import ( LOG, __version__ as CYLC_VERSION, From 03a13e55b07c81de4b480ff8fb27f0dc27711f50 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 27 Apr 2026 16:20:26 +0100 Subject: [PATCH 36/47] Linting --- cylc/flow/remote.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cylc/flow/remote.py b/cylc/flow/remote.py index 0cc112066d3..9d148fa25c3 100644 --- a/cylc/flow/remote.py +++ b/cylc/flow/remote.py @@ -40,6 +40,8 @@ overload, ) +import psutil + from cylc.flow import ( LOG, __version__ as CYLC_VERSION, @@ -52,9 +54,6 @@ ) from cylc.flow.util import format_cmd -if TYPE_CHECKING: - import psutil - def get_proc_ancestors(proc: psutil.Process | None = None) -> list[int]: """Return list of the parent PIDs for a given process From 2e38a96fcd5d3948b6ef4f37675648ef27bb5c35 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 27 Apr 2026 16:22:23 +0100 Subject: [PATCH 37/47] Linting --- cylc/flow/remote.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cylc/flow/remote.py b/cylc/flow/remote.py index 9d148fa25c3..69a9025e628 100644 --- a/cylc/flow/remote.py +++ b/cylc/flow/remote.py @@ -31,7 +31,6 @@ import sys from time import monotonic from typing import ( - TYPE_CHECKING, Any, Dict, List, From 60012df344f67bb78d85d855da31b59c42806d7a Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 29 Apr 2026 14:44:14 +0100 Subject: [PATCH 38/47] Update unit tests --- changes.d/6663.feat.md | 2 +- cylc/flow/cfgspec/globalcfg.py | 4 ++-- cylc/flow/etc/job.sh | 4 ++-- cylc/flow/job_file.py | 10 +++++----- cylc/flow/scripts/message.py | 4 ++-- cylc/flow/scripts/profiler.py | 2 +- setup.cfg | 2 +- tests/functional/cylc-cat-log/12-delete-kill.t | 3 --- tests/functional/jobscript/03-profiler-e2e.t | 6 ++++-- tests/functional/jobscript/03-profiler-e2e/flow.cylc | 0 tests/functional/jobscript/04-profiler.t | 5 +++-- tests/functional/jobscript/04-profiler/flow.cylc | 0 tests/unit/test_job_file.py | 6 +++--- 13 files changed, 24 insertions(+), 24 deletions(-) delete mode 100644 tests/functional/jobscript/03-profiler-e2e/flow.cylc delete mode 100644 tests/functional/jobscript/04-profiler/flow.cylc diff --git a/changes.d/6663.feat.md b/changes.d/6663.feat.md index affb22fc75a..dc2112fb64f 100644 --- a/changes.d/6663.feat.md +++ b/changes.d/6663.feat.md @@ -1 +1 @@ -Added a Cylc job profiler which captures CPU and memory information from job runners which use cGroups. This information can be reviewed in the Analysis view in the GUI. +Added a Cylc job profiler which captures CPU and memory information from job runners which use cgroups. This information can be reviewed in the Analysis view in the GUI. diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 5920b23b642..f65b266bf0e 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1478,11 +1478,11 @@ def default_for( .. versionadded:: 8.0.0 ''') - with Conf('profile', desc=''' + with Conf('profiler', desc=''' Configure the Cylc job profiler. This tool can capture CPU and memory information from - job runners which use cGroups such as PBS and Slurm. + job runners which use cgroups such as PBS and Slurm. .. versionadded:: 8.7.0 '''): diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 16849e9f4b2..ad65ef27bae 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -140,8 +140,8 @@ cylc__job__main() { mkdir -p "${CYLC_TASK_WORK_DIR}" cd "${CYLC_TASK_WORK_DIR}" - if [[ "${CYLC_PROFILE}" == "True" ]] ; then - cylc profile -m "${CYLC_CGROUP}" -i "${CYLC_POLLING_INTERVAL}" & + if [[ "${CYLC_PROFILER}" == "True" ]] ; then + cylc profile -m "${CYLC_CGROUP}" -i "${CYLC_PROFILER_POLL_INTERVAL}" & export profiler_pid="$!" fi diff --git a/cylc/flow/job_file.py b/cylc/flow/job_file.py index cc44aa42e03..c2760630473 100644 --- a/cylc/flow/job_file.py +++ b/cylc/flow/job_file.py @@ -226,14 +226,14 @@ def _write_task_environment(self, handle, job_conf): "\n export CYLC_TASK_FLOW_NUMBERS=" f"{','.join(str(f) for f in job_conf['flow_nums'])}") handle.write( - "\n export CYLC_PROFILE=" - f"{job_conf['platform']['profile']['activate']}") + "\n export CYLC_PROFILER=" + f"{job_conf['platform']['profiler']['activate']}") handle.write( "\n export CYLC_CGROUP=" - f"{job_conf['platform']['profile']['cgroups path']}") + f"{job_conf['platform']['profiler']['cgroups path']}") handle.write( - "\n export CYLC_POLLING_INTERVAL=" - f"{job_conf['platform']['profile']['polling interval']}") + "\n export CYLC_PROFILER_POLL_INTERVAL=" + f"{job_conf['platform']['profiler']['polling interval']}") # Standard parameter environment variables for var, val in job_conf['param_var'].items(): handle.write('\n export CYLC_TASK_PARAM_%s="%s"' % (var, val)) diff --git a/cylc/flow/scripts/message.py b/cylc/flow/scripts/message.py index fe601de9c16..a35295d236b 100755 --- a/cylc/flow/scripts/message.py +++ b/cylc/flow/scripts/message.py @@ -97,7 +97,7 @@ from logging import getLevelName, INFO import os import sys -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Sequence from cylc.flow.id_cli import parse_id_async from cylc.flow.option_parsers import ( @@ -146,7 +146,7 @@ def main(parser: COP, options: 'Values', *args: str) -> None: asyncio.run(_main(options, args)) -async def _main(options, args): +async def _main(options: 'Values', args: Sequence[str]) -> None: if len(args) <= 2: # BACK COMPAT: args <= 2 # from: diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 51fdb73164d..b814f3171ad 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -37,7 +37,7 @@ from cylc.flow.task_message import record_messages from cylc.flow.terminal import cli_function - +INTERNAL = True PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") RE_INT = re.compile(r'\d+') diff --git a/setup.cfg b/setup.cfg index e82b9b7abcb..8c4a131a074 100644 --- a/setup.cfg +++ b/setup.cfg @@ -169,7 +169,7 @@ cylc.command = ping = cylc.flow.scripts.ping:main play = cylc.flow.scripts.play:main poll = cylc.flow.scripts.poll:main - profile = cylc.flow.scripts.profiler:main + profiler = cylc.flow.scripts.profiler:main psutils = cylc.flow.scripts.psutil:main reinstall = cylc.flow.scripts.reinstall:main release = cylc.flow.scripts.release:main diff --git a/tests/functional/cylc-cat-log/12-delete-kill.t b/tests/functional/cylc-cat-log/12-delete-kill.t index 8aef45ea143..fd43bdedd65 100644 --- a/tests/functional/cylc-cat-log/12-delete-kill.t +++ b/tests/functional/cylc-cat-log/12-delete-kill.t @@ -34,9 +34,6 @@ log_file="${WORKFLOW_RUN_DIR}/log/foo.log" echo "Hello, Mr. Thompson" > "$log_file" - -export CYLC_PROC_POLL_INTERVAL=0.5 - TEST_NAME="${TEST_NAME_BASE}-delete" cylc cat-log --mode=tail "$WORKFLOW_NAME" -f foo.log 2>&1 & cat_log_pid="$!" diff --git a/tests/functional/jobscript/03-profiler-e2e.t b/tests/functional/jobscript/03-profiler-e2e.t index a20237c3176..64c88fb82f3 100644 --- a/tests/functional/jobscript/03-profiler-e2e.t +++ b/tests/functional/jobscript/03-profiler-e2e.t @@ -16,6 +16,8 @@ # along with this program. If not, see . #------------------------------------------------------------------------------- # Cylc profile test +# # Cylc profile test. This test will run the Cylc profiler with real data +# and test to see that the profiler data is received and processed correctly. # NOTE: This test will run the Cylc profiler on the given test platform. # The test platform may need to be configured for this to work (e.g. # "cgroups path" may need to be set). @@ -26,11 +28,11 @@ set_test_number 8 create_test_global_config " [platforms] [[${CYLC_TEST_PLATFORM}]] - [[[profile]]] + [[[profiler]]] activate = True polling interval = 10 [[localhost]] - [[[profile]]] + [[[profiler]]] activate = True polling interval = 10 cgroups path = the/thing/that/should/not/be diff --git a/tests/functional/jobscript/03-profiler-e2e/flow.cylc b/tests/functional/jobscript/03-profiler-e2e/flow.cylc deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/functional/jobscript/04-profiler.t b/tests/functional/jobscript/04-profiler.t index 93e2697841e..887627f74f7 100644 --- a/tests/functional/jobscript/04-profiler.t +++ b/tests/functional/jobscript/04-profiler.t @@ -15,7 +15,8 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . #------------------------------------------------------------------------------- -# Cylc profile test +# Cylc profile test. This test will run the Cylc profiler with mocked data +# and test to see that the profiler data is received and processed correctly. . "$(dirname "$0")/test_header" @@ -36,7 +37,7 @@ export profiler_test_env_var='/cgroups_test_data' create_test_global_config " [platforms] [[localhost]] - [[[profile]]] + [[[profiler]]] activate = True cgroups path = ${PWD} " diff --git a/tests/functional/jobscript/04-profiler/flow.cylc b/tests/functional/jobscript/04-profiler/flow.cylc deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 45d4cc6668e..d55f54c027d 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -405,16 +405,16 @@ def test_write_task_environment(): 'CYLC_TASK_NAMESPACE_HIERARCHY="baa moo"\n export ' 'CYLC_TASK_TRY_NUMBER=1\n export ' 'CYLC_TASK_FLOW_NUMBERS=1\n export ' - 'CYLC_PROFILE=true\n export ' + 'CYLC_PROFILER=true\n export ' 'CYLC_CGROUP=exit_light\n export ' - 'CYLC_POLLING_INTERVAL=1\n export ' + 'CYLC_PROFILER_POLL_INTERVAL=1\n export ' 'CYLC_TASK_PARAM_duck="quack"\n export ' 'CYLC_TASK_PARAM_mouse="squeak"\n ' 'CYLC_TASK_WORK_DIR_BASE=\'farm_noises/work_d\'\n}') job_conf = { "platform": { 'communication method': 'ssh', - 'profile': { + 'profiler': { "activate": "true", "cgroups path": 'exit_light', "polling interval": 1 From f33adf8ac9a0ced97543d8b6c1dce54e5dafc0d2 Mon Sep 17 00:00:00 2001 From: Christopher Bennett Date: Thu, 14 May 2026 17:35:45 +0100 Subject: [PATCH 39/47] Update cylc/flow/cfgspec/globalcfg.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- cylc/flow/cfgspec/globalcfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index f65b266bf0e..3e04edcac56 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1494,7 +1494,7 @@ def default_for( desc=''' Configure the path to the cgroups filesystem. - The default value (``/sys/fs/cgroup``) is the standard + The default value is the standard location for cgroups on linux and should work in most circumstances ''') From 8340427774df5baeaf309bf7c3827e5db04c5d12 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 15 May 2026 13:54:55 +0100 Subject: [PATCH 40/47] Code review changes --- cylc/flow/etc/job.sh | 2 +- cylc/flow/job_file.py | 2 +- tests/unit/test_job_file.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index ad65ef27bae..263ebe489b1 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -141,7 +141,7 @@ cylc__job__main() { cd "${CYLC_TASK_WORK_DIR}" if [[ "${CYLC_PROFILER}" == "True" ]] ; then - cylc profile -m "${CYLC_CGROUP}" -i "${CYLC_PROFILER_POLL_INTERVAL}" & + cylc profile -m "${CYLC_PROFILER_CGROUPS}" -i "${CYLC_PROFILER_POLL_INTERVAL}" & export profiler_pid="$!" fi diff --git a/cylc/flow/job_file.py b/cylc/flow/job_file.py index c2760630473..100d5c0da23 100644 --- a/cylc/flow/job_file.py +++ b/cylc/flow/job_file.py @@ -229,7 +229,7 @@ def _write_task_environment(self, handle, job_conf): "\n export CYLC_PROFILER=" f"{job_conf['platform']['profiler']['activate']}") handle.write( - "\n export CYLC_CGROUP=" + "\n export CYLC_PROFILER_CGROUPS=" f"{job_conf['platform']['profiler']['cgroups path']}") handle.write( "\n export CYLC_PROFILER_POLL_INTERVAL=" diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index d55f54c027d..555afcd4955 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -406,7 +406,7 @@ def test_write_task_environment(): 'CYLC_TASK_TRY_NUMBER=1\n export ' 'CYLC_TASK_FLOW_NUMBERS=1\n export ' 'CYLC_PROFILER=true\n export ' - 'CYLC_CGROUP=exit_light\n export ' + 'CYLC_PROFILER_CGROUPs=exit_light\n export ' 'CYLC_PROFILER_POLL_INTERVAL=1\n export ' 'CYLC_TASK_PARAM_duck="quack"\n export ' 'CYLC_TASK_PARAM_mouse="squeak"\n ' From 77801d36c29bc40a1f8a7298d2a4f796e7f46f21 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 15 May 2026 13:57:57 +0100 Subject: [PATCH 41/47] Typo --- tests/unit/test_job_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 555afcd4955..08898b57361 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -406,7 +406,7 @@ def test_write_task_environment(): 'CYLC_TASK_TRY_NUMBER=1\n export ' 'CYLC_TASK_FLOW_NUMBERS=1\n export ' 'CYLC_PROFILER=true\n export ' - 'CYLC_PROFILER_CGROUPs=exit_light\n export ' + 'CYLC_PROFILER_CGROUPS=exit_light\n export ' 'CYLC_PROFILER_POLL_INTERVAL=1\n export ' 'CYLC_TASK_PARAM_duck="quack"\n export ' 'CYLC_TASK_PARAM_mouse="squeak"\n ' From 19808319c48c9d9019ec95af1e663ac96620cf08 Mon Sep 17 00:00:00 2001 From: Christopher Bennett Date: Mon, 18 May 2026 08:40:06 +0100 Subject: [PATCH 42/47] Update cylc/flow/scripts/profiler.py Co-authored-by: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> --- cylc/flow/scripts/profiler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index b814f3171ad..a3d6e2e17fc 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -176,8 +176,7 @@ def get_cgroup_name(): # Get the cgroup information for the current process with open('/proc/' + str(pid) + '/cgroup', 'r') as f: result = f.read() - result = PID_REGEX.search(result).group() - return result + return PID_REGEX.search(result).group() except Exception as err: raise CylcProfilerError( From 5506e7ed0fec51d27f22a5849ce8251e08a74ddd Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 18 May 2026 10:47:58 +0100 Subject: [PATCH 43/47] Update unit tests --- tests/unit/scripts/test_profiler.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index e1eabf8ced0..9d2622d578d 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -358,11 +358,24 @@ async def test_profile_data(mocker): mock_file = mocker.mock_open(read_data="") mocker.patch("builtins.open", mock_file) mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", - return_value=0) + side_effect=[100, 200, 150]) mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) - run_once = mock.Mock(side_effect=[True, False]) - await profile(process, 1, run_once) + + # The profile function runs until the callable returns False. + run_count = 0 + + def run_four_times(): + nonlocal run_count + run_count += 1 + return run_count < 4 + + await profile(process, 0.1, run_four_times) + + # It should have been called 4 times before run_once returned False + assert run_count == 4 + # The max_rss should be the highest value from parse_memory_file + assert process.max_rss == 200 @pytest.fixture From 3ff17b23dc8e4af28f549b0ca6d50e5e643e0ec7 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 18 May 2026 11:00:25 +0100 Subject: [PATCH 44/47] Update functional tests --- tests/functional/jobscript/04-profiler.t | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/functional/jobscript/04-profiler.t b/tests/functional/jobscript/04-profiler.t index 887627f74f7..1142687e517 100644 --- a/tests/functional/jobscript/04-profiler.t +++ b/tests/functional/jobscript/04-profiler.t @@ -24,7 +24,7 @@ if [[ "$OSTYPE" != "linux-gnu"* ]]; then skip_all "Tests not compatible with $OSTYPE" fi -set_test_number 7 +set_test_number 6 # Set up test data mkdir -p "${PWD}/cgroups_test_data" @@ -47,7 +47,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' [scheduling] [[graph]] - R1 = the_good & the_bad? & the_ugly + R1 = the_good & the_bad? [runtime] [[the_good]] @@ -58,10 +58,6 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' # this task should fail (it should still send profiling info) platform = localhost script = sleep 5; false - [[the_ugly]] - # this task should succeed despite the broken profiler configuration - platform = localhost - script = sleep 1 __FLOW_CONFIG__ run_ok "${TEST_NAME_BASE}-validate" cylc validate "${WORKFLOW_NAME}" @@ -81,7 +77,4 @@ log_scan "${TEST_NAME_BASE}-task-succeeded" \ '1/the_bad.*(received)_cylc_profiler.*cpu_time.*' \ '1/the_bad.*failed' -# ensure this task succeeded despite the broken profiler configuration -grep_workflow_log_ok "${TEST_NAME_BASE}-broken" '1/the_ugly.*(received)succeeded' - purge From 30db51ac2d6a6b2b1493efc2a425bcac209ba7a7 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 18 May 2026 15:34:04 +0100 Subject: [PATCH 45/47] Code review changes --- cylc/flow/scripts/profiler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index a3d6e2e17fc..9eee16bee4b 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -39,7 +39,7 @@ INTERNAL = True PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") -RE_INT = re.compile(r'\d+') +RE_CPU_USAGE = re.compile(r'usage_usec=(\d+)') @dataclass @@ -133,8 +133,8 @@ def parse_cpu_file(process: Process) -> int: if process.cgroup_version == 2: with open(process.cgroup_cpu_path, 'r') as f: for line in f: - if "usage_usec" in line: - return round(int(RE_INT.findall(line)[0]) / 1000) + if match := RE_CPU_USAGE.search(line): + return round(int(match.group(1)) / 1000) raise FileNotFoundError(process.cgroup_cpu_path) elif process.cgroup_version == 1: From bdb71b3fbc6ec0690de45c90dd4f9c786a6dd8bc Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 18 May 2026 15:36:01 +0100 Subject: [PATCH 46/47] Code review changes --- cylc/flow/scripts/profiler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 9eee16bee4b..aa6c9c8b1d0 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -215,15 +215,15 @@ def get_cgroup_paths(location) -> Process: err, "Unable to determine cgroup version") from err -async def profile(_process: Process, delay, keep_looping=lambda: True): +async def profile(process: Process, delay, keep_looping=lambda: True): # The infinite loop that will constantly poll the cgroup # The lambda function is used to allow the loop to be stopped in unit tests while keep_looping(): # Polling the cgroup for memory and keeping track of the max rss value - max_rss = parse_memory_file(_process) - if max_rss is not None and max_rss > _process.max_rss: - _process.max_rss = max_rss + max_rss = parse_memory_file(process) + if max_rss is not None and max_rss > process.max_rss: + process.max_rss = max_rss await asyncio.sleep(delay) From bedf56436ae896012f441d63c4b5d147667abc43 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 19 May 2026 11:06:19 +0100 Subject: [PATCH 47/47] Code review changes --- cylc/flow/scripts/profiler.py | 2 +- tests/unit/scripts/test_profiler.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index aa6c9c8b1d0..e511fdf1519 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -39,7 +39,7 @@ INTERNAL = True PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") -RE_CPU_USAGE = re.compile(r'usage_usec=(\d+)') +RE_CPU_USAGE = re.compile(r'usage_usec\s*(\d+)') @dataclass diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 9d2622d578d..3c92ad4138f 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -148,7 +148,7 @@ def test_parse_cpu_file(tmpdir): cpu_file_v1_bad = tmpdir.join("cpu_file_v1_bad.txt") cpu_file_v1_bad.write("I'm your dream, mind ashtray") cpu_file_v2_good = tmpdir.join("cpu_file_v2_good.txt") - cpu_file_v2_good.write('usage_usec=1234567890') + cpu_file_v2_good.write('usage_usec 1234567890') cpu_file_v2_bad = tmpdir.join("cpu_file_v2_bad.txt") cpu_file_v2_bad.write('Give me fuel, give me fire, ' 'give me that which I desire')