Skip to content

_pixi_slurm_ssh

_get_workdir_remote(script_paths)

Check that there is one and only one workdir, and return it.

Note: The is_absolute check is to filter out a chmod command.

Source code in fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def _get_workdir_remote(script_paths: list[str]) -> str:
    """
    Check that there is one and only one `workdir`, and return it.

    Note: The `is_absolute` check is to filter out a `chmod` command.
    """
    workdirs = [
        Path(script_path).parent.as_posix()
        for script_path in script_paths
        if Path(script_path).is_absolute()
    ]
    if not len(set(workdirs)) == 1:
        raise ValueError(f"Invalid {script_paths=}.")
    return workdirs[0]

_log_change_of_job_state(*, old_state, new_state, logger_name)

Emit a log for state changes.

Parameters:

Name Type Description Default
old_state str | None
required
new_state str
required
logger_name str
required
Source code in fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def _log_change_of_job_state(
    *,
    old_state: str | None,
    new_state: str,
    logger_name: str,
) -> None:
    """
    Emit a log for state changes.

    Args:
        old_state:
        new_state:
        logger_name:
    """
    if new_state != old_state:
        logger = get_logger(logger_name=logger_name)
        logger.debug(
            f"SLURM-job state changed from {old_state=} to {new_state=}."
        )

_read_file_if_exists(*, fractal_ssh, path)

Read a remote file if it exists, or return an empty string.

Source code in fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py
49
50
51
52
53
54
55
56
57
58
59
60
def _read_file_if_exists(
    *,
    fractal_ssh: FractalSSH,
    path: str,
) -> str:
    """
    Read a remote file if it exists, or return an empty string.

    Args:
        fractal_ssh: SSH handler used to probe and read the remote path.
        path: Remote file path.
    """
    # Guard clause: missing file maps to an empty string, never an error.
    if not fractal_ssh.remote_exists(path=path):
        return ""
    return fractal_ssh.read_remote_text_file(path)

_run_squeue(*, fractal_ssh, squeue_cmd, logger_name)

Run a squeue command and handle exceptions.

Parameters:

Name Type Description Default
fractal_ssh FractalSSH
required
logger_name str
required
squeue_cmd str
required
Return

state: The SLURM-job state.

Source code in fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def _run_squeue(
    *,
    fractal_ssh: FractalSSH,
    squeue_cmd: str,
    logger_name: str,
) -> str:
    """
    Run a `squeue` command and handle exceptions.

    Args:
        fractal_ssh:
        logger_name:
        squeue_cmd:

    Return:
        state: The SLURM-job state, or `FRACTAL_SQUEUE_ERROR_STATE` when the
            command (or the parsing of its output) fails.
    """
    try:
        squeue_stdout = fractal_ssh.run_command(cmd=squeue_cmd)
        # Expected output shape: "<job-id> <state>"; take the second token.
        return squeue_stdout.strip().split()[1]
    except Exception as e:
        # Any failure (SSH, empty output, parse error) degrades to the
        # sentinel error state rather than raising.
        get_logger(logger_name=logger_name).info(
            f"`squeue` command failed (original error: {e})"
        )
        return FRACTAL_SQUEUE_ERROR_STATE

_verify_success_file_exists(*, fractal_ssh, success_file_remote, logger_name, stderr_remote)

Fail if the success sentinel file does not exist remotely.

Note: the FractalSSH methods in this function may fail, and such failures are not handled in this function. Any such failure, however, will lead to a "failed" task-group lifecycle activity (because it will raise an exception from within run_script_on_remote_slurm, which will then be handled at the calling-function level).

Source code in fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def _verify_success_file_exists(
    *,
    fractal_ssh: FractalSSH,
    success_file_remote: str,
    logger_name: str,
    stderr_remote: str,
) -> None:
    """
    Fail if the success sentinel file does not exist remotely.

    Note: the `FractalSSH` methods in this function may fail, and such failures
    are not handled in this function. Any such failure, however, will lead to
    a "failed" task-group lifecycle activity (because it will raise an
    exception from within `run_script_on_remote_slurm`, which will then be
    handled at the calling-function level).

    Args:
        fractal_ssh: SSH handler.
        success_file_remote: Path of the remote success-sentinel file.
        logger_name: Name of the logger to emit through.
        stderr_remote: Path of the remote SLURM-job stderr file.

    Raises:
        RuntimeError: If the sentinel file is missing.
    """
    # Early return on the happy path.
    if fractal_ssh.remote_exists(path=success_file_remote):
        return

    logger = get_logger(logger_name=logger_name)
    error_msg = f"{success_file_remote=} missing."
    logger.info(error_msg)

    # Best-effort: surface the job's stderr in the logs before failing.
    job_stderr = _read_file_if_exists(
        fractal_ssh=fractal_ssh, path=stderr_remote
    )
    if job_stderr:
        logger.info(f"SLURM-job stderr:\n{job_stderr}")
    raise RuntimeError(error_msg)

run_script_on_remote_slurm(*, script_paths, slurm_config, fractal_ssh, logger_name, log_file_path, prefix, db, activity, poll_interval)

Run a pixi install script as a SLURM job.

NOTE: This is called from within a try/except, thus we can use exceptions as a mechanism to propagate failure/errors.

Source code in fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
def run_script_on_remote_slurm(
    *,
    script_paths: list[str],
    slurm_config: dict[str, Any],
    fractal_ssh: FractalSSH,
    logger_name: str,
    log_file_path: Path,
    prefix: str,
    db: Session,
    activity: TaskGroupActivityV2,
    poll_interval: int,
):
    """
    Run a `pixi install` script as a SLURM job.

    NOTE: This is called from within a try/except, thus we can use exceptions
    as a mechanism to propagate failure/errors.

    Args:
        script_paths: Remote script paths; their common parent directory
            becomes the SLURM working directory.
        slurm_config: Raw configuration, validated here as `PixiSLURMConfig`.
        fractal_ssh: SSH handler used for all remote operations.
        logger_name: Name of the logger to use.
        log_file_path: Local log file, copied into `activity.log` at each step.
        prefix: Prefix for all remote auxiliary files (submission script,
            stdout/stderr files, success sentinel).
        db: Database session used to persist `activity` updates.
        activity: Task-group activity kept up to date while the job runs.
        poll_interval: Seconds between consecutive `squeue` polls.

    Returns:
        Content of the SLURM-job stdout file (empty string if missing).
    """

    slurm_config_obj = PixiSLURMConfig(**slurm_config)

    logger = get_logger(logger_name=logger_name)

    # (1) Prepare remote submission script
    workdir_remote = _get_workdir_remote(script_paths)
    submission_script_remote = os.path.join(
        workdir_remote, f"{prefix}-submit.sh"
    )
    stderr_remote = os.path.join(workdir_remote, f"{prefix}-err.txt")
    stdout_remote = os.path.join(workdir_remote, f"{prefix}-out.txt")
    success_file_remote = os.path.join(workdir_remote, f"{prefix}-success.txt")
    script_lines = [
        "#!/bin/bash",
        f"#SBATCH --partition={slurm_config_obj.partition}",
        f"#SBATCH --cpus-per-task={slurm_config_obj.cpus}",
        f"#SBATCH --mem={slurm_config_obj.mem}",
        f"#SBATCH --time={slurm_config_obj.time}",
        f"#SBATCH --err={stderr_remote}",
        f"#SBATCH --out={stdout_remote}",
        f"#SBATCH -D {workdir_remote}",
        "",
    ]
    for script_path in script_paths:
        script_lines.append(f"bash {script_path}")
    # Sentinel: only created if every script line above exited successfully.
    script_lines.append(f"touch {success_file_remote}")

    script_contents = "\n".join(script_lines)
    fractal_ssh.write_remote_file(
        path=submission_script_remote,
        content=script_contents,
    )
    logger.debug(f"Written {submission_script_remote=}.")

    activity.log = get_current_log(log_file_path)
    activity = add_commit_refresh(obj=activity, db=db)

    # (2) Submit SLURM job
    logger.debug("Now submit SLURM job.")
    # `--parsable` makes sbatch print just the job ID on stdout.
    sbatch_cmd = f"sbatch --parsable {submission_script_remote}"
    try:
        stdout = fractal_ssh.run_command(cmd=sbatch_cmd)
        job_id = int(stdout)
        logger.debug(f"SLURM-job submission successful ({job_id=}).")
    except Exception as e:
        logger.error(
            (
                f"Submission of {submission_script_remote} failed. "
                f"Original error: {str(e)}"
            )
        )
        raise e
    finally:
        # Persist the current log both on success and on failure.
        activity.log = get_current_log(log_file_path)
        activity = add_commit_refresh(obj=activity, db=db)

    # (3) Monitor job
    squeue_cmd = (
        f"squeue --noheader --format='%i %T' --states=all --jobs={job_id}"
    )
    logger.debug(f"Start monitoring job with {squeue_cmd=}.")
    old_state = None
    while True:
        new_state = _run_squeue(
            fractal_ssh=fractal_ssh,
            squeue_cmd=squeue_cmd,
            logger_name=logger_name,
        )
        _log_change_of_job_state(
            old_state=old_state,
            new_state=new_state,
            logger_name=logger_name,
        )
        activity.log = get_current_log(log_file_path)
        activity = add_commit_refresh(obj=activity, db=db)
        if new_state in STATES_FINISHED:
            logger.debug(f"Exit retrieval loop (state={new_state}).")
            break
        old_state = new_state
        time.sleep(poll_interval)

    # A "finished" state does not imply success: require the sentinel file
    # created by the submission script (raises if it is missing).
    _verify_success_file_exists(
        fractal_ssh=fractal_ssh,
        logger_name=logger_name,
        success_file_remote=success_file_remote,
        stderr_remote=stderr_remote,
    )

    stdout = _read_file_if_exists(
        fractal_ssh=fractal_ssh,
        path=stdout_remote,
    )

    logger.info("SLURM-job execution completed successfully, continue.")
    activity.log = get_current_log(log_file_path)
    activity = add_commit_refresh(obj=activity, db=db)

    return stdout