
_pixi_slurm_ssh

_log_change_of_job_state(*, old_state, new_state, logger_name)

Emit a log for state changes.

Parameters:

    old_state (str | None, required): Previously observed SLURM-job state, or None on the first poll.
    new_state (str, required): Newly observed SLURM-job state.
    logger_name (str, required): Name of the logger used for the debug message.
Source code in fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py (lines 34-52)

def _log_change_of_job_state(
    *,
    old_state: str | None,
    new_state: str,
    logger_name: str,
) -> None:
    """
    Emit a log for state changes.

    Args:
        old_state: Previously observed SLURM-job state (`None` on the first poll).
        new_state: Newly observed SLURM-job state.
        logger_name: Name of the logger used for the debug message.
    """
    if new_state != old_state:
        logger = get_logger(logger_name=logger_name)
        logger.debug(
            f"SLURM-job state changed from {old_state=} to {new_state=}."
        )
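
As a quick illustration, here is a minimal usage sketch of this helper inside a polling loop; the state sequence and the logger name are made up for the example, and only the two actual transitions produce a debug message (whether it is shown depends on the logging configuration).

from fractal_server.tasks.v2.ssh._pixi_slurm_ssh import _log_change_of_job_state

# Hypothetical state sequence observed while polling a SLURM job.
old_state = None
for new_state in ["PENDING", "PENDING", "RUNNING", "RUNNING"]:
    _log_change_of_job_state(
        old_state=old_state,
        new_state=new_state,
        logger_name="example-logger",  # made-up logger name
    )
    old_state = new_state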

_run_squeue(*, fractal_ssh, squeue_cmd, logger_name)

Run a squeue command and handle exceptions.

Parameters:

    fractal_ssh (FractalSSH, required): SSH connection to the remote host.
    squeue_cmd (str, required): The full squeue command to run.
    logger_name (str, required): Name of the logger used for error reporting.

Returns:

    state (str): The SLURM-job state, or FRACTAL_SQUEUE_ERROR_STATE if the command failed.

Source code in fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py (lines 55-79)

def _run_squeue(
    *,
    fractal_ssh: FractalSSH,
    squeue_cmd: str,
    logger_name: str,
) -> str:
    """
    Run a `squeue` command and handle exceptions.

    Args:
        fractal_ssh: SSH connection to the remote host.
        squeue_cmd: The full `squeue` command to run.
        logger_name: Name of the logger used for error reporting.

    Returns:
        state: The SLURM-job state, or `FRACTAL_SQUEUE_ERROR_STATE` if the
            command failed.
    """
    try:
        cmd_stdout = fractal_ssh.run_command(cmd=squeue_cmd)
        state = cmd_stdout.strip().split()[1]
        return state
    except Exception as e:
        logger = get_logger(logger_name=logger_name)
        logger.info(f"`squeue` command failed (original error: {e})")
        return FRACTAL_SQUEUE_ERROR_STATE
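
For reference, the parsing step simply takes the second whitespace-separated token of the single output line produced by a squeue --format='%i %T' query. A standalone sketch with a made-up output line:

# Example stdout of `squeue --noheader --format='%i %T' --jobs=1234`
# (job ID and state are made up for illustration).
cmd_stdout = "1234 RUNNING\n"
state = cmd_stdout.strip().split()[1]
print(state)  # RUNNING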

_verify_success_file_exists(*, fractal_ssh, success_file_remote, logger_name, stderr_remote)

Fail if the success sentinel file does not exist remotely.

Note: the FractalSSH methods in this function may fail, and such failures are not handled here. Any such failure, however, will lead to a "failed" task-group lifecycle activity, because it will raise an exception from within run_script_on_remote_slurm, which will then be handled at the calling-function level.

Source code in fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py (lines 82-105)

def _verify_success_file_exists(
    *,
    fractal_ssh: FractalSSH,
    success_file_remote: str,
    logger_name: str,
    stderr_remote: str,
) -> None:
    """
    Fail if the success sentinel file does not exist remotely.

    Note: the `FractalSSH` methods in this function may fail, and such failures
    are not handled here. Any such failure, however, will lead to a "failed"
    task-group lifecycle activity, because it will raise an exception from
    within `run_script_on_remote_slurm`, which will then be handled at the
    calling-function level.
    """
    if not fractal_ssh.remote_exists(path=success_file_remote):
        logger = get_logger(logger_name=logger_name)
        error_msg = f"{success_file_remote=} missing."
        logger.info(error_msg)
        if fractal_ssh.remote_exists(stderr_remote):
            stderr = fractal_ssh.read_remote_text_file(stderr_remote)
            logger.info(f"SLURM-job stderr:\n{stderr}")
        raise RuntimeError(error_msg)
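
As a rough local analogy (no SSH involved), the check amounts to the following sketch; the file paths are made up and pathlib stands in for the FractalSSH remote-file helpers:

from pathlib import Path

# Made-up local paths standing in for the remote sentinel and stderr files.
success_file = Path("/tmp/example-success.txt")
stderr_file = Path("/tmp/example-err.txt")

if not success_file.exists():
    error_msg = f"success_file='{success_file}' missing."
    if stderr_file.exists():
        # Surface the SLURM-job stderr before failing, as the helper above does.
        print(f"SLURM-job stderr:\n{stderr_file.read_text()}")
    raise RuntimeError(error_msg)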

run_script_on_remote_slurm(*, script_path, slurm_config, fractal_ssh, logger_name, log_file_path, prefix, db, activity)

Run a pixi install script as a SLURM job.

NOTE: This function is called from within a try/except block, so exceptions can be used as the mechanism to propagate failures and errors to the caller.

Source code in fractal_server/tasks/v2/ssh/_pixi_slurm_ssh.py (lines 108-214)

def run_script_on_remote_slurm(
    *,
    script_path: str,
    slurm_config: PixiSLURMConfig,
    fractal_ssh: FractalSSH,
    logger_name: str,
    log_file_path: Path,
    prefix: str,
    db: Session,
    activity: TaskGroupActivityV2,
):
    """
    Run a `pixi install` script as a SLURM job.

    NOTE: This function is called from within a try/except block, so exceptions
    can be used as the mechanism to propagate failures and errors to the caller.
    """

    logger = get_logger(logger_name=logger_name)
    settings = Inject(get_settings)

    # (1) Prepare remote submission script
    workdir_remote = Path(script_path).parent.as_posix()
    submission_script_remote = os.path.join(
        workdir_remote, f"{prefix}-submit.sh"
    )
    stderr_remote = os.path.join(workdir_remote, f"{prefix}-err.txt")
    stdout_remote = os.path.join(workdir_remote, f"{prefix}-out.txt")
    success_file_remote = os.path.join(workdir_remote, f"{prefix}-success.txt")
    script_lines = [
        "#!/bin/bash",
        f"#SBATCH --partition={slurm_config.partition}",
        f"#SBATCH --cpus-per-task={slurm_config.cpus}",
        f"#SBATCH --mem={slurm_config.mem}",
        f"#SBATCH --time={slurm_config.time}",
        f"#SBATCH --err={stderr_remote}",
        f"#SBATCH --out={stdout_remote}",
        f"#SBATCH -D {workdir_remote}",
        "",
        f"bash {script_path}",
        f"touch {success_file_remote}",
        "",
    ]
    script_contents = "\n".join(script_lines)
    fractal_ssh.write_remote_file(
        path=submission_script_remote,
        content=script_contents,
    )
    logger.debug(f"Written {submission_script_remote=}.")

    activity.log = get_current_log(log_file_path)
    activity = add_commit_refresh(obj=activity, db=db)

    # (2) Submit SLURM job
    logger.debug("Now submit SLURM job.")
    sbatch_cmd = f"sbatch --parsable {submission_script_remote}"
    try:
        stdout = fractal_ssh.run_command(cmd=sbatch_cmd)
        job_id = int(stdout)
        logger.debug(f"SLURM-job submission successful ({job_id=}).")
    except Exception as e:
        logger.error(
            (
                f"Submission of {submission_script_remote} failed. "
                f"Original error: {str(e)}"
            )
        )
        raise e
    finally:
        activity.log = get_current_log(log_file_path)
        activity = add_commit_refresh(obj=activity, db=db)

    # (3) Monitor job
    squeue_cmd = (
        f"squeue --noheader --format='%i %T' --states=all --jobs={job_id}"
    )
    logger.debug(f"Start monitoring job with {squeue_cmd=}.")
    old_state = None
    while True:
        new_state = _run_squeue(
            fractal_ssh=fractal_ssh,
            squeue_cmd=squeue_cmd,
            logger_name=logger_name,
        )
        _log_change_of_job_state(
            old_state=old_state,
            new_state=new_state,
            logger_name=logger_name,
        )
        activity.log = get_current_log(log_file_path)
        activity = add_commit_refresh(obj=activity, db=db)
        if new_state in STATES_FINISHED:
            logger.debug(f"Exit retrieval loop (state={new_state}).")
            break
        old_state = new_state
        time.sleep(settings.FRACTAL_SLURM_POLL_INTERVAL)

    _verify_success_file_exists(
        fractal_ssh=fractal_ssh,
        logger_name=logger_name,
        success_file_remote=success_file_remote,
        stderr_remote=stderr_remote,
    )

    logger.info("SLURM-job execution completed successfully, continue.")
    activity.log = get_current_log(log_file_path)
    activity = add_commit_refresh(obj=activity, db=db)
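
To make step (1) above concrete, here is a small standalone sketch that renders the same submission-script template with made-up values; the partition, resources, paths, and prefix are all illustrative, not actual configuration:

import os
from pathlib import Path

# Made-up inputs, for illustration only.
script_path = "/remote/tasks/group-1/install-pixi.sh"
prefix = "0-pixi-install"
partition, cpus, mem, time_limit = "main", 4, "16G", "01:00:00"

workdir_remote = Path(script_path).parent.as_posix()
stderr_remote = os.path.join(workdir_remote, f"{prefix}-err.txt")
stdout_remote = os.path.join(workdir_remote, f"{prefix}-out.txt")
success_file_remote = os.path.join(workdir_remote, f"{prefix}-success.txt")

# Same template as in the function body: run the install script, then touch
# the success sentinel that _verify_success_file_exists checks for.
script_contents = "\n".join(
    [
        "#!/bin/bash",
        f"#SBATCH --partition={partition}",
        f"#SBATCH --cpus-per-task={cpus}",
        f"#SBATCH --mem={mem}",
        f"#SBATCH --time={time_limit}",
        f"#SBATCH --err={stderr_remote}",
        f"#SBATCH --out={stdout_remote}",
        f"#SBATCH -D {workdir_remote}",
        "",
        f"bash {script_path}",
        f"touch {success_file_remote}",
        "",
    ]
)
print(script_contents)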