diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..6c85929a --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,90 @@ +name: Docs + +env: + SLURM_DOCKER_IMAGE: giovtorres/slurm-docker:25.11.4-rl10 + +on: + pull_request: + branches: [main] + push: + branches: [main] + release: + types: [published] + workflow_dispatch: + +jobs: + docs-build: + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' + permissions: + contents: read + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Pull Slurm container + run: docker pull ${{ env.SLURM_DOCKER_IMAGE }} + + - name: Start Slurm container + run: docker compose up -d + + - name: Build docs + run: docker exec slurmctl bash -ec "cd /pyslurm && scripts/builddocs.sh -j4 -s" + + - name: Upload docs artifact + uses: actions/upload-artifact@v7 + with: + name: docs-site + path: site/ + retention-days: 7 + if-no-files-found: error + + docs-deploy-dev: + runs-on: ubuntu-latest + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + permissions: + contents: write + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Configure git identity + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Pull Slurm container + run: docker pull ${{ env.SLURM_DOCKER_IMAGE }} + + - name: Start Slurm container + run: docker compose up -d + + - name: Deploy dev docs + run: docker exec slurmctl bash -ec "cd /pyslurm && pip install -q '.[docs]' && scripts/build.sh -j4 -d && mike deploy dev --push --update-aliases" + + docs-deploy-release: + runs-on: ubuntu-latest + if: github.event_name == 'release' + permissions: + contents: write + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Configure git identity + run: | + git 
config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Pull Slurm container + run: docker pull ${{ env.SLURM_DOCKER_IMAGE }} + + - name: Start Slurm container + run: docker compose up -d + + - name: Deploy versioned docs + run: docker exec slurmctl bash -ec "cd /pyslurm && pip install -q '.[docs]' && scripts/build.sh -j4 -d && mike deploy ${{ github.ref_name }} latest --update-aliases --push" diff --git a/doc_requirements.txt b/doc_requirements.txt deleted file mode 100644 index 1169bcaa..00000000 --- a/doc_requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -cython>=3.0.11,<3.1 -wheel -setuptools -mkdocstrings[python] -mike -mkdocs-material -mkdocs-awesome-pages-plugin diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 00000000..169d14e4 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,86 @@ +--- +title: Getting Started +--- + +# Getting Started + +## Prerequisites + +- Slurm 25.11.x running with `slurmctld` accessible from the host +- Python 3.6+ +- PySlurm installed (`pip install pyslurm` or [built from source](index.md)) + +## List all jobs + +```python +import pyslurm + +jobs = pyslurm.Jobs.load() +for job_id, job in jobs.items(): + print(f"{job_id}: {job.name} [{job.state}] user={job.user_name}") +``` + +## Load a single job + +```python +import pyslurm + +job = pyslurm.Job.load(12345) +print(job.name, job.state, job.allocated_nodes) +``` + +## Submit a job + +```python +import pyslurm + +desc = pyslurm.JobSubmitDescription( + name="myjob", + script="#!/bin/bash\nhostname", + time_limit="01:00:00", + ntasks=4, +) +job_id = desc.submit() +print(f"Submitted job {job_id}") +``` + +## List nodes + +```python +import pyslurm + +nodes = pyslurm.Nodes.load() +for name, node in nodes.items(): + print(f"{name}: {node.state} cpus={node.total_cpus} mem={node.real_memory}MB") +``` + +## List partitions + +```python +import pyslurm + +partitions = 
pyslurm.Partitions.load() +for name, part in partitions.items(): + print(f"{name}: {part.state} nodes={part.total_nodes}") +``` + +## Error handling + +All Slurm RPC calls raise [`pyslurm.RPCError`][pyslurm.RPCError] on failure: + +```python +import pyslurm + +try: + job = pyslurm.Job.load(99999) +except pyslurm.RPCError as e: + print(f"Failed: {e}") +``` + +## Next steps + +- [Job API](reference/job.md) +- [Node API](reference/node.md) +- [Partition API](reference/partition.md) +- [Database API](reference/db/index.md) +- [slurmctld API](reference/slurmctld.md) diff --git a/docs/migration.md b/docs/migration.md new file mode 100644 index 00000000..19c6c021 --- /dev/null +++ b/docs/migration.md @@ -0,0 +1,60 @@ +--- +title: Migrating from the Old API +--- + +# Migrating from the Old API + +PySlurm 25.11 removed the long-deprecated legacy API classes. This page lists +each removed class and its replacement. + +## Removed classes and their replacements + +| Removed | Replacement | +|---------|-------------| +| `pyslurm.job` | [`pyslurm.Job`][pyslurm.Job], [`pyslurm.Jobs`][pyslurm.Jobs], [`pyslurm.JobSubmitDescription`][pyslurm.JobSubmitDescription] | +| `pyslurm.node` | [`pyslurm.Node`][pyslurm.Node], [`pyslurm.Nodes`][pyslurm.Nodes] | +| `pyslurm.jobstep` | [`pyslurm.JobStep`][pyslurm.JobStep], [`pyslurm.JobSteps`][pyslurm.JobSteps] | +| `pyslurm.partition` | [`pyslurm.Partition`][pyslurm.Partition], [`pyslurm.Partitions`][pyslurm.Partitions] | +| `pyslurm.reservation` | [`pyslurm.Reservation`][pyslurm.Reservation], [`pyslurm.Reservations`][pyslurm.Reservations] | +| `pyslurm.statistics` | `pyslurm.slurmctld.diag()` → [`pyslurm.slurmctld.Statistics`][pyslurm.slurmctld.Statistics] | +| `pyslurm.front_end` | Removed from Slurm — no replacement | +| `pyslurm.config` | [`pyslurm.slurmctld.Config`][pyslurm.slurmctld.Config] | + +## Loading data + +Old API used `get_*()` module-level functions. 
The new API uses classmethods: + +```python +# Old +import pyslurm +jobs = pyslurm.job().get() + +# New +import pyslurm +jobs = pyslurm.Jobs.load() +``` + +## Iterating + +```python +# Old +for job_id, attrs in pyslurm.job().get().items(): + print(job_id, attrs["job_state"]) + +# New +for job in pyslurm.Jobs.load().values(): + print(job.id, job.state) +``` + +## Submitting a job + +```python +# Old +import pyslurm +job_id = pyslurm.job().submit_batch_job(0, {"name": "myjob", "script": "#!/bin/bash\nhostname"}) + +# New +import pyslurm +desc = pyslurm.JobSubmitDescription(name="myjob", script="#!/bin/bash\nhostname") +job_id = desc.submit() +``` diff --git a/docs/reference/enums.md b/docs/reference/enums.md new file mode 100644 index 00000000..2fd105bb --- /dev/null +++ b/docs/reference/enums.md @@ -0,0 +1,5 @@ +--- +title: Enums +--- + +::: pyslurm.SchedulerType diff --git a/docs/reference/frontend.md b/docs/reference/frontend.md deleted file mode 100644 index 437283d4..00000000 --- a/docs/reference/frontend.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Frontend ---- - -!!! warning - This API is currently being completely reworked, and is subject to be - removed in the future when a replacement is introduced - -::: pyslurm.deprecated.front_end diff --git a/docs/reference/index.md b/docs/reference/index.md index af0ef05e..914458bd 100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -2,27 +2,9 @@ The `pyslurm` package is a wrapper around the Slurm C-API - -!!! warning - Please note that the `pyslurm` API is currently being completely reworked. - Reworked classes and functions that replace functionality of the old API - will be marked as such, with a link to the documentation of its old - counterpart. - - Old API functionality that is already replaced is marked as deprecated, - and will be removed at some point in the future. 
- - The new reworked classes will be tested thoroughly before making them - available here, although it is of course still possible that some bugs may - appear here and there, which we will try to identify as best as possible! - - In addition, since these classes are pretty new, their interface - (precisely: attribute names, return types) should not yet be considered - 100% stable, and changes may be made in rare cases if it makes sense to do - so. - - If you are using the new-style API, we would like to know your feedback on - it! +!!! note + The pyslurm API was fully reworked in v25.11.0. If you are upgrading from + an older version, see the [Migration Guide](../migration.md). ## Reworked Classes diff --git a/docs/reference/job.md b/docs/reference/job.md index cb1c19eb..521f428b 100644 --- a/docs/reference/job.md +++ b/docs/reference/job.md @@ -2,9 +2,10 @@ title: Job --- -!!! note - This supersedes the [pyslurm.job](old/job.md) class, which will be - removed in a future release +The `Job` class represents a single job in the Slurm workload manager. +Use [`Jobs.load`][pyslurm.Jobs.load] to fetch all jobs or [`Job.load`][pyslurm.Job.load] +for a single job by ID. To submit new jobs, see [`JobSubmitDescription`][pyslurm.JobSubmitDescription]. +For the steps within a running job, see [JobStep](jobstep.md). ::: pyslurm.Job ::: pyslurm.Jobs diff --git a/docs/reference/jobstep.md b/docs/reference/jobstep.md index b7b3e2b9..ca5ca05d 100644 --- a/docs/reference/jobstep.md +++ b/docs/reference/jobstep.md @@ -2,9 +2,11 @@ title: JobStep --- -!!! note - This supersedes the [pyslurm.jobstep](old/jobstep.md) class, which - will be removed in a future release +The `JobStep` class represents a single step within a Slurm job — typically +an `srun` invocation inside a batch script. Steps are automatically populated +on the parent [`Job`][pyslurm.Job] when calling [`Job.load`][pyslurm.Job.load] +for a running job, and are accessible via `job.steps`. 
See [Job](job.md) for +the parent job API. ::: pyslurm.JobStep ::: pyslurm.JobSteps diff --git a/docs/reference/node.md b/docs/reference/node.md index e8e8d619..64e49732 100644 --- a/docs/reference/node.md +++ b/docs/reference/node.md @@ -2,9 +2,11 @@ title: Node --- -!!! note - This supersedes the [pyslurm.node](old/node.md) class, which will be - removed in a future release +The `Node` class represents a compute node registered with the Slurm controller. +Use [`Nodes.load`][pyslurm.Nodes.load] to fetch all nodes or +[`Node.load`][pyslurm.Node.load] for a single node by name. +Nodes can be drained, modified, created, or deleted using the write methods. +For partition membership, see [Partition](partition.md). ::: pyslurm.Node ::: pyslurm.Nodes diff --git a/docs/reference/old/.pages b/docs/reference/old/.pages deleted file mode 100644 index ae2a9b18..00000000 --- a/docs/reference/old/.pages +++ /dev/null @@ -1,3 +0,0 @@ -hide: true -nav: - - ... diff --git a/docs/reference/old/db/.pages b/docs/reference/old/db/.pages deleted file mode 100644 index ae2a9b18..00000000 --- a/docs/reference/old/db/.pages +++ /dev/null @@ -1,3 +0,0 @@ -hide: true -nav: - - ... diff --git a/docs/reference/old/db/job.md b/docs/reference/old/db/job.md deleted file mode 100644 index 65e600db..00000000 --- a/docs/reference/old/db/job.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Job ---- - -!!! warning - This is superseded by [pyslurm.db.Job](../../db/job.md) class and will - be removed in a future release - -::: pyslurm.deprecated.slurmdb_jobs - handler: python diff --git a/docs/reference/old/job.md b/docs/reference/old/job.md deleted file mode 100644 index 7d42ad09..00000000 --- a/docs/reference/old/job.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Job ---- - -!!! warning - This class is superseded by [pyslurm.Job](../job.md) and will be removed - in a future release. 
- -::: pyslurm.deprecated.job - handler: python diff --git a/docs/reference/old/jobstep.md b/docs/reference/old/jobstep.md deleted file mode 100644 index bf81c363..00000000 --- a/docs/reference/old/jobstep.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: JobStep ---- - -!!! warning - This class is superseded by [pyslurm.JobStep](../jobstep.md) and will be - removed in a future release. - -::: pyslurm.deprecated.jobstep - handler: python diff --git a/docs/reference/old/node.md b/docs/reference/old/node.md deleted file mode 100644 index 15b3ed51..00000000 --- a/docs/reference/old/node.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Node ---- - -!!! warning - This class is superseded by [pyslurm.Node](../node.md) and will be - removed in a future release. - -::: pyslurm.deprecated.node - handler: python diff --git a/docs/reference/old/partition.md b/docs/reference/old/partition.md deleted file mode 100644 index b2b1c7ae..00000000 --- a/docs/reference/old/partition.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Partition ---- - -!!! warning - This class is superseded by [pyslurm.Partition](../partition.md) and will - be removed in a future release. - -::: pyslurm.deprecated.partition - handler: python diff --git a/docs/reference/partition.md b/docs/reference/partition.md index 9181e10f..110b64f8 100644 --- a/docs/reference/partition.md +++ b/docs/reference/partition.md @@ -2,9 +2,10 @@ title: Partition --- -!!! note - This supersedes the [pyslurm.partition](old/partition.md) class, which - will be removed in a future release +The `Partition` class represents a Slurm partition (queue), which is a logical +grouping of nodes with shared scheduling policies and resource limits. Jobs are +submitted to a partition. Use [`Partitions.load`][pyslurm.Partitions.load] to +fetch all partitions. For the nodes within a partition, see [Node](node.md). 
::: pyslurm.Partition ::: pyslurm.Partitions diff --git a/docs/reference/reservation.md b/docs/reference/reservation.md index 8fc9f401..7615921f 100644 --- a/docs/reference/reservation.md +++ b/docs/reference/reservation.md @@ -2,6 +2,11 @@ title: Reservation --- +The `Reservation` class represents a Slurm advance reservation, which +pre-allocates nodes or CPUs for a specific time window. Reservations are used +for maintenance windows or guaranteed access for specific users and accounts. +Use [`Reservations.load`][pyslurm.Reservations.load] to fetch all reservations. + ::: pyslurm.Reservation ::: pyslurm.Reservations ::: pyslurm.ReservationFlags diff --git a/docs/reference/statistics.md b/docs/reference/statistics.md deleted file mode 100644 index 1bc81d38..00000000 --- a/docs/reference/statistics.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Statistics ---- - -!!! warning - This API is currently being completely reworked, and is subject to be - removed in the future when a replacement is introduced - -::: pyslurm.deprecated.statistics diff --git a/docs/reference/topology.md b/docs/reference/topology.md deleted file mode 100644 index 976be156..00000000 --- a/docs/reference/topology.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Topology ---- - -!!! warning - This API is currently being completely reworked, and is subject to be - removed in the future when a replacement is introduced - -::: pyslurm.deprecated.topology diff --git a/mkdocs.yml b/mkdocs.yml index c80f2d93..91f8ad65 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -8,7 +8,9 @@ copyright: Copyright © 2024 PySlurm Developers nav: - Home: - Home: index.md + - Getting Started: getting-started.md - Changelog: changelog.md + - Migration Guide: migration.md - ... 
theme: @@ -44,7 +46,8 @@ theme: plugins: - search - awesome-pages - - autorefs + - autorefs: + warn_on_unresolved: false - mike - mkdocstrings: handlers: @@ -54,6 +57,8 @@ plugins: options: filters: ["!^_"] docstring_style: google + docstring_options: + warn_unknown_params: false allow_inspection: true inherited_members: false show_signature: true diff --git a/pyproject.toml b/pyproject.toml index 5334f894..4369af20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,12 @@ dev = [ "pytest", "pytest-sugar", ] +docs = [ + "mkdocstrings-python<2.0", + "mike", + "mkdocs-material", + "mkdocs-awesome-pages-plugin", +] [tool.setuptools.dynamic] version = { attr = "pyslurm.version.__version__" } @@ -72,6 +78,9 @@ version = { attr = "pyslurm.version.__version__" } include = ["pyslurm*"] exclude = ["pyslurm.pydefines", "pyslurm.slurm"] +[tool.pytest.ini_options] +testpaths = ["tests"] + [tool.ruff] line-length = 80 extend-exclude = [ diff --git a/pyslurm/core/job/job.pyx b/pyslurm/core/job/job.pyx index 1b3eb989..2b77ce26 100644 --- a/pyslurm/core/job/job.pyx +++ b/pyslurm/core/job/job.pyx @@ -55,6 +55,17 @@ from pyslurm.utils.helpers import ( cdef class Jobs(MultiClusterMap): + """A collection of Slurm Jobs, keyed by job ID. + + Behaves like a dict. In a multi-cluster setup, jobs are nested by cluster + name; on a single cluster all jobs are under the local cluster key. + + Examples: + >>> import pyslurm + >>> jobs = pyslurm.Jobs.load() + >>> for job_id, job in jobs.items(): + ... print(job_id, job.state) + """ def __cinit__(self): self.info = NULL @@ -227,6 +238,50 @@ cdef class Jobs(MultiClusterMap): cdef class Job: + """A Slurm Job. + + Represents a single job in the Slurm workload manager. Provides read + access to all job attributes and write operations (cancel, hold, modify, + signal, suspend). + + Attributes: + id (int): Numeric job ID assigned by Slurm. + name (str): Job name specified at submission. + state (str): Current job state, e.g. 
``RUNNING``, ``PENDING``, + ``COMPLETED``, ``FAILED``. + state_reason (str): Human-readable reason for the current state. For + pending jobs this explains why scheduling was blocked. + user_name (str): Username of the job owner. + group_name (str): Group name of the job owner. + account (str): Slurm account charged for the job. + partition (str): Partition the job is running in (or queued for). + qos (str): Quality of Service assigned to the job. + priority (int): Scheduling priority. Higher values are scheduled first. + time_limit (int): Wall-clock time limit in minutes, or ``None`` for + unlimited. + submit_time (int): Unix timestamp when the job was submitted. + start_time (int): Unix timestamp when the job started running. + allocated_nodes (str): Nodelist expression of allocated nodes, e.g. + ``node[001-004]``. ``None`` if the job has not started. + num_nodes (int): Number of nodes allocated to the job. + cpus (int): Total number of CPUs allocated. + ntasks (int): Total number of tasks. + exit_code (int): Exit code of the job's main process. + steps (pyslurm.JobSteps): Job steps associated with this job. + working_directory (str): Working directory at time of submission. + command (str): The command that was submitted. 
+ + Examples: + Load a specific job: + + >>> import pyslurm + >>> job = pyslurm.Job.load(12345) + >>> print(job.state, job.user_name, job.partition) + + Cancel a job: + + >>> pyslurm.Job(12345).cancel() + """ def __cinit__(self): self.ptr = NULL @@ -671,42 +726,52 @@ cdef class Job: @property def name(self): + """Job name as specified at submission.""" return cstr.to_unicode(self.ptr.name) @property def id(self): + """Numeric job ID assigned by Slurm.""" return self.ptr.job_id @property def association_id(self): + """Numeric ID of the Slurm association this job runs under.""" return u32_parse(self.ptr.assoc_id) @property def account(self): + """Slurm account charged for this job.""" return cstr.to_unicode(self.ptr.account) @property def user_id(self): + """Numeric UID of the job owner.""" return u32_parse(self.ptr.user_id, zero_is_noval=False) @property def user_name(self): + """Username of the job owner.""" return uid_to_name(self.ptr.user_id, lookup=self.passwd) @property def group_id(self): + """Numeric GID of the job owner.""" return u32_parse(self.ptr.group_id, zero_is_noval=False) @property def group_name(self): + """Group name of the job owner.""" return gid_to_name(self.ptr.group_id, lookup=self.groups) @property def priority(self): + """Scheduling priority; higher values are scheduled first.""" return u32_parse(self.ptr.priority, zero_is_noval=False) @property def nice(self): + """Nice value adjustment to the job's priority.""" if self.ptr.nice == slurm.NO_VAL: return None @@ -714,10 +779,12 @@ cdef class Job: @property def qos(self): + """Quality of Service (QOS) assigned to this job.""" return cstr.to_unicode(self.ptr.qos) @property def min_cpus_per_node(self): + """Minimum CPUs per node requested.""" return u32_parse(self.ptr.pn_min_cpus) # I don't think this is used anymore - there is no way in sbatch to ask @@ -729,10 +796,12 @@ cdef class Job: @property def state(self): + """Current job state string, e.g. 
RUNNING, PENDING, COMPLETED, FAILED.""" return cstr.to_unicode(slurm_job_state_string(self.ptr.job_state)) @property def state_reason(self): + """Reason the job is in its current state; for pending jobs explains why scheduling was blocked.""" if self.ptr.state_desc: return cstr.to_unicode(self.ptr.state_desc) @@ -740,162 +809,201 @@ cdef class Job: @property def is_requeueable(self): + """True if the job can be requeued after a node failure.""" return u16_parse_bool(self.ptr.requeue) @property def requeue_count(self): + """Number of times the job has been requeued.""" return u16_parse(self.ptr.restart_cnt, on_noval=0) @property def is_batch_job(self): + """True if the job was submitted as a batch script.""" return u16_parse_bool(self.ptr.batch_flag) @property def requires_node_reboot(self): + """True if a node reboot is required before the job can run.""" return u8_parse_bool(self.ptr.reboot) @property def dependencies(self): + """Dict of job dependencies, e.g. {afterok: [1234, 5678]}.""" return dependency_str_to_dict(cstr.to_unicode(self.ptr.dependency)) @property def time_limit(self): + """Wall-clock time limit in minutes, or None if unlimited.""" return _raw_time(self.ptr.time_limit) @property def time_limit_min(self): + """Minimum accepted time limit in minutes (for flexible time requests).""" return _raw_time(self.ptr.time_min) @property def submit_time(self): + """Unix timestamp when the job was submitted.""" return _raw_time(self.ptr.submit_time) @property def eligible_time(self): + """Unix timestamp when the job became eligible for scheduling.""" return _raw_time(self.ptr.eligible_time) @property def accrue_time(self): + """Unix timestamp when the job started accruing priority.""" return _raw_time(self.ptr.accrue_time) @property def start_time(self): + """Unix timestamp when the job started running.""" return _raw_time(self.ptr.start_time) @property def resize_time(self): + """Unix timestamp of the last resource resize for this job.""" return 
_raw_time(self.ptr.resize_time) @property def deadline(self): + """Unix timestamp of the job's scheduling deadline, or None.""" return _raw_time(self.ptr.deadline) @property def preempt_eligible_time(self): + """Unix timestamp when the job became eligible for preemption.""" return _raw_time(self.ptr.preemptable_time) @property def preempt_time(self): + """Unix timestamp when the job was last preempted.""" return _raw_time(self.ptr.preempt_time) @property def suspend_time(self): + """Unix timestamp of the last suspension, or None.""" return _raw_time(self.ptr.suspend_time) @property def last_sched_evaluation_time(self): + """Unix timestamp of the last scheduling evaluation for this job.""" return _raw_time(self.ptr.last_sched_eval) @property def pre_suspension_time(self): + """Time the job had been running before its last suspension, in seconds.""" return _raw_time(self.ptr.pre_sus_time) @property def mcs_label(self): + """Multi-Category Security (MCS) label of the job, or None.""" return cstr.to_unicode(self.ptr.mcs_label) @property def partition(self): + """Partition (queue) the job is running in or queued for.""" return cstr.to_unicode(self.ptr.partition) @property def submit_host(self): + """Name of the node from which the job was submitted.""" return cstr.to_unicode(self.ptr.alloc_node) @property def submit_session_id(self): + """Session ID of the process that submitted the job.""" return u32_parse(self.ptr.alloc_sid) @property def batch_host(self): + """Name of the node where the batch script is running.""" return cstr.to_unicode(self.ptr.batch_host) @property def num_nodes(self): + """Number of nodes allocated to this job.""" return u32_parse(self.ptr.num_nodes) @property def max_nodes(self): + """Maximum number of nodes requested, or None.""" return u32_parse(self.ptr.max_nodes) @property def allocated_nodes(self): + """Nodelist expression of allocated nodes, e.g. node[001-004]. 
None if not running.""" return cstr.to_unicode(self.ptr.nodes) @property def required_nodes(self): + """Nodelist expression of specifically requested nodes.""" return cstr.to_unicode(self.ptr.req_nodes) @property def excluded_nodes(self): + """Nodelist expression of nodes explicitly excluded from allocation.""" return cstr.to_unicode(self.ptr.exc_nodes) @property def scheduled_nodes(self): + """Nodes tentatively scheduled for this pending job, or None.""" return cstr.to_unicode(self.ptr.sched_nodes) @property def derived_exit_code(self): + """Highest exit code returned by any task in the job.""" ec, _ = _get_exit_code(self.ptr.derived_ec) return ec @property def derived_exit_code_signal(self): + """Signal that caused the highest exit code, if applicable.""" _, sig = _get_exit_code(self.ptr.derived_ec) return sig @property def exit_code(self): + """Exit code of the job's main process.""" ec, _ = _get_exit_code(self.ptr.exit_code) return ec @property def exit_code_signal(self): + """Signal number that caused the job to exit, if killed by a signal.""" _, sig = _get_exit_code(self.ptr.exit_code) return sig @property def batch_constraints(self): + """List of features required specifically on the batch host node.""" return cstr.to_list(self.ptr.batch_features) @property def federation_origin(self): + """Name of the origin cluster in a federation, or None.""" return cstr.to_unicode(self.ptr.fed_origin_str) @property def federation_siblings_active(self): + """List of sibling clusters with active copies of this job.""" return cstr.to_list(self.ptr.fed_siblings_active_str) @property def federation_siblings_viable(self): + """List of sibling clusters where this job could run.""" return cstr.to_list(self.ptr.fed_siblings_viable_str) @property def cpus(self): + """Total number of CPUs allocated to this job.""" return u32_parse(self.ptr.num_cpus, on_noval=1) @property def cpus_per_task(self): + """Number of CPUs requested per task.""" if self.ptr.cpus_per_tres: return None @@ 
-903,6 +1011,7 @@ cdef class Job: @property def cpus_per_gpu(self): + """Number of CPUs allocated per GPU, or None if not GPU-based.""" if (not self.ptr.cpus_per_tres or self.ptr.cpus_per_task != slurm.NO_VAL16): return None @@ -914,98 +1023,122 @@ cdef class Job: @property def boards_per_node(self): + """Number of boards per node requested.""" return u16_parse(self.ptr.boards_per_node) @property def sockets_per_board(self): + """Number of sockets per board requested.""" return u16_parse(self.ptr.sockets_per_board) @property def sockets_per_node(self): + """Number of sockets per node requested.""" return u16_parse(self.ptr.sockets_per_node) @property def cores_per_socket(self): + """Number of cores per socket requested.""" return u16_parse(self.ptr.cores_per_socket) @property def threads_per_core(self): + """Number of threads per core requested.""" return u16_parse(self.ptr.threads_per_core) @property def ntasks(self): + """Total number of tasks requested.""" return u32_parse(self.ptr.num_tasks, on_noval=1) @property def ntasks_per_node(self): + """Number of tasks per node.""" return u16_parse(self.ptr.ntasks_per_node) @property def ntasks_per_board(self): + """Number of tasks per board.""" return u16_parse(self.ptr.ntasks_per_board) @property def ntasks_per_socket(self): + """Number of tasks per socket.""" return u16_parse(self.ptr.ntasks_per_socket) @property def ntasks_per_core(self): + """Number of tasks per core.""" return u16_parse(self.ptr.ntasks_per_core) @property def ntasks_per_gpu(self): + """Number of tasks per GPU.""" return u16_parse(self.ptr.ntasks_per_tres) @property def delay_boot_time(self): + """Seconds to delay node boot before starting the job.""" return _raw_time(self.ptr.delay_boot) @property def constraints(self): + """List of node feature constraints requested.""" return cstr.to_list(self.ptr.features) @property def preferred_features(self): + """List of preferred (soft) node feature constraints.""" return cstr.to_list(self.ptr.prefer) 
@property def cluster(self): + """Cluster name this job belongs to.""" return cstr.to_unicode(self.ptr.cluster) @property def cluster_constraints(self): + """List of cluster feature constraints requested.""" return cstr.to_list(self.ptr.cluster_features) @property def reservation(self): + """Name of the advance reservation the job is running in, or None.""" return cstr.to_unicode(self.ptr.resv_name) @property def resource_sharing(self): + """Resource sharing mode: EXCLUSIVE, SHARED, or OVERSUBSCRIBE.""" return cstr.to_unicode(slurm_job_share_string(self.ptr.shared)) @property def requires_contiguous_nodes(self): + """True if the job requires nodes to be contiguous in the network topology.""" return u16_parse_bool(self.ptr.contiguous) @property def licenses(self): + """List of licenses requested by the job.""" return cstr.to_list(self.ptr.licenses) @property def allocated_licenses(self): + """List of licenses currently allocated to the job.""" return cstr.to_list(self.ptr.licenses_allocated) @property def network(self): + """Network specification string for the job.""" return cstr.to_unicode(self.ptr.network) @property def command(self): + """The command or script path that was submitted.""" return cstr.to_unicode(self.ptr.command) @property def working_directory(self): + """Working directory at time of submission.""" return cstr.to_unicode(self.ptr.work_dir) @property diff --git a/pyslurm/core/node.pyx b/pyslurm/core/node.pyx index 355faefb..32f9980c 100644 --- a/pyslurm/core/node.pyx +++ b/pyslurm/core/node.pyx @@ -45,6 +45,16 @@ from pyslurm.utils.helpers import ( cdef class Nodes(MultiClusterMap): + """A collection of Slurm compute nodes, keyed by node name. + + Behaves like a dict. Also exposes aggregated properties such as + `total_cpus`, `free_memory`, and `allocated_memory` across all nodes. 
+ + Examples: + >>> import pyslurm + >>> nodes = pyslurm.Nodes.load() + >>> print(f"Total CPUs: {nodes.total_cpus}, Free: {nodes.idle_cpus}") + """ def __dealloc__(self): slurm_free_node_info_msg(self.info) @@ -214,6 +224,24 @@ cdef class Nodes(MultiClusterMap): cdef class Node: + """A Slurm compute node. + + Represents a single compute node registered with the Slurm controller. + Provides read access to node properties and write operations (create, + modify, delete). + + Examples: + Load a node and check its state: + + >>> import pyslurm + >>> node = pyslurm.Node.load("compute-01") + >>> print(node.state, node.total_cpus, node.real_memory) + + Drain a node for maintenance: + + >>> changes = pyslurm.Node(state="DRAIN", reason="scheduled maintenance") + >>> node.modify(changes) + """ def __cinit__(self): self.info = NULL @@ -417,7 +445,8 @@ cdef class Node: return instance_to_dict(self, recursive) @property - def name(self): + def name(self) -> str: + """Node name as configured in slurm.conf.""" return cstr.to_unicode(self.info.name) @name.setter @@ -425,11 +454,13 @@ cdef class Node: cstr.fmalloc2(&self.info.name, &self.umsg.node_names, val) @property - def architecture(self): + def architecture(self) -> str: + """CPU architecture string, e.g. x86_64.""" return cstr.to_unicode(self.info.arch) @property - def configured_gres(self): + def configured_gres(self) -> dict: + """Dict of GRES configured on the node, e.g. 
{"gpu": 4}.""" return cstr.to_gres_dict(self.info.gres) @configured_gres.setter @@ -438,11 +469,13 @@ cdef class Node: cstr.from_gres_dict(val)) @property - def owner(self): + def owner(self) -> str: + """Username of the node owner, or None if unowned.""" return uid_to_name(self.info.owner, lookup=self.passwd) @property - def address(self): + def address(self) -> str: + """Network address used for Slurm communication.""" return cstr.to_unicode(self.info.node_addr) @address.setter @@ -450,7 +483,8 @@ cdef class Node: cstr.fmalloc2(&self.info.node_addr, &self.umsg.node_addr, val) @property - def hostname(self): + def hostname(self) -> str: + """Hostname of the node.""" return cstr.to_unicode(self.info.node_hostname) @hostname.setter @@ -458,7 +492,8 @@ cdef class Node: cstr.fmalloc2(&self.info.node_hostname, &self.umsg.node_hostname, val) @property - def extra(self): + def extra(self) -> str: + """Extra arbitrary string attached to the node.""" return cstr.to_unicode(self.info.extra) @extra.setter @@ -466,7 +501,8 @@ cdef class Node: cstr.fmalloc2(&self.info.extra, &self.umsg.extra, val) @property - def reason(self): + def reason(self) -> str: + """Reason string set on the node (for DRAIN or DOWN state).""" return cstr.to_unicode(self.info.reason) @reason.setter @@ -474,11 +510,13 @@ cdef class Node: cstr.fmalloc2(&self.info.reason, &self.umsg.reason, val) @property - def reason_user(self): + def reason_user(self) -> str: + """Username of who set the node reason.""" return uid_to_name(self.info.reason_uid, lookup=self.passwd) @property - def comment(self): + def comment(self) -> str: + """Administrative comment on the node.""" return cstr.to_unicode(self.info.comment) @comment.setter @@ -486,52 +524,64 @@ cdef class Node: cstr.fmalloc2(&self.info.comment, &self.umsg.comment, val) @property - def bcast_address(self): + def bcast_address(self) -> str: + """Address used for broadcast communication.""" return cstr.to_unicode(self.info.bcast_address) @property - def 
slurm_version(self): + def slurm_version(self) -> str: + """Version of slurmd running on the node.""" return cstr.to_unicode(self.info.version) @property - def operating_system(self): + def operating_system(self) -> str: + """Operating system string reported by slurmd.""" return cstr.to_unicode(self.info.os) @property - def allocated_gres(self): + def allocated_gres(self) -> dict: + """Dict of GRES currently allocated on the node.""" return gres_from_tres_dict(self.allocated_tres) @property - def mcs_label(self): + def mcs_label(self) -> str: + """Multi-Category Security (MCS) label, or None.""" return cstr.to_unicode(self.info.mcs_label) @property - def allocated_memory(self): + def allocated_memory(self) -> int: + """Memory currently allocated to running jobs, in MiB.""" return u64_parse(self.info.alloc_memory, on_noval=0) @property - def real_memory(self): + def real_memory(self) -> int: + """Total memory configured on the node, in MiB.""" return u64_parse(self.info.real_memory) @property - def free_memory(self): + def free_memory(self) -> int: + """Currently free memory as reported by slurmd, in MiB.""" return u64_parse(self.info.free_mem) @property - def idle_memory(self): + def idle_memory(self) -> int: + """Memory not allocated to any job (real_memory minus allocated_memory), in MiB.""" real = self.real_memory return 0 if not real else real - self.allocated_memory @property - def memory_reserved_for_system(self): + def memory_reserved_for_system(self) -> int: + """Memory reserved for system use and unavailable to jobs, in MiB.""" return u64_parse(self.info.mem_spec_limit) @property - def temporary_disk(self): + def temporary_disk(self) -> int: + """Temporary disk space available in MiB.""" return u32_parse(self.info.tmp_disk) @property - def weight(self): + def weight(self) -> int: + """Scheduling weight; lower values are preferred by the scheduler.""" return u32_parse(self.info.weight) @weight.setter @@ -539,35 +589,43 @@ cdef class Node: 
self.info.weight=self.umsg.weight = u32(val) @property - def effective_cpus(self): + def effective_cpus(self) -> int: + """CPUs available for allocation after CPU specialisation is applied.""" return u16_parse(self.info.cpus_efctv, on_noval=0) @property - def total_cpus(self): + def total_cpus(self) -> int: + """Total CPUs configured on the node.""" return u16_parse(self.info.cpus, on_noval=0) @property - def sockets(self): + def sockets(self) -> int: + """Number of sockets on the node.""" return u16_parse(self.info.sockets, on_noval=0) @property - def cores_reserved_for_system(self): + def cores_reserved_for_system(self) -> int: + """Number of cores reserved for system use (core specialisation).""" return u16_parse(self.info.core_spec_cnt) @property - def boards(self): + def boards(self) -> int: + """Number of motherboards on the node.""" return u16_parse(self.info.boards) @property - def cores_per_socket(self): + def cores_per_socket(self) -> int: + """Number of cores per socket.""" return u16_parse(self.info.cores) @property - def threads_per_core(self): + def threads_per_core(self) -> int: + """Number of hardware threads per core.""" return u16_parse(self.info.threads) @property - def available_features(self): + def available_features(self) -> list: + """List of features available on the node (from slurm.conf).""" return cstr.to_list(self.info.features) @available_features.setter @@ -575,7 +633,8 @@ cdef class Node: cstr.from_list2(&self.info.features, &self.umsg.features, val) @property - def active_features(self): + def active_features(self) -> list: + """List of features currently active on the node.""" return cstr.to_list(self.info.features_act) @active_features.setter @@ -583,23 +642,28 @@ cdef class Node: cstr.from_list2(&self.info.features_act, &self.umsg.features_act, val) @property - def partitions(self): + def partitions(self) -> list: + """List of partition names this node is a member of.""" return cstr.to_list(self.info.partitions) @property - def 
boot_time(self): + def boot_time(self) -> int: + """Unix timestamp when the node last booted.""" return _raw_time(self.info.boot_time) @property - def slurmd_start_time(self): + def slurmd_start_time(self) -> int: + """Unix timestamp when slurmd last started on this node.""" return _raw_time(self.info.slurmd_start_time) @property - def last_busy_time(self): + def last_busy_time(self) -> int: + """Unix timestamp when the node last transitioned from ALLOCATED to IDLE.""" return _raw_time(self.info.last_busy) @property - def reason_time(self): + def reason_time(self) -> int: + """Unix timestamp when the node reason was set.""" return _raw_time(self.info.reason_time) # @property @@ -608,19 +672,23 @@ cdef class Node: # return cstr.to_dict(self.info.tres_fmt_str) @property - def allocated_tres(self): + def allocated_tres(self) -> dict: + """Dict of TRES (Trackable RESources) currently allocated on the node.""" return cstr.to_dict(self.info.alloc_tres_fmt_str) @property - def allocated_cpus(self): + def allocated_cpus(self) -> int: + """CPUs currently allocated to running jobs.""" return u16_parse(self.info.alloc_cpus, on_noval=0) @property - def idle_cpus(self): + def idle_cpus(self) -> int: + """CPUs not allocated to any job (effective_cpus minus allocated_cpus).""" return self.effective_cpus - self.allocated_cpus @property - def cpu_binding(self): + def cpu_binding(self) -> str: + """Default CPU binding type for jobs on this node.""" cdef char cpu_bind[128] slurm_sprint_cpu_bind_type(cpu_bind, self.info.cpu_bind) @@ -634,13 +702,15 @@ cdef class Node: self.info.cpu_bind=self.umsg.cpu_bind = cpubind_to_num(val) @property - def current_watts(self): + def current_watts(self) -> int: + """Current power consumption in watts, or 0 if not monitored.""" if not self.info.energy: return 0 return u32_parse(self.info.energy.current_watts, on_noval=0) @property - def avg_watts(self): + def avg_watts(self) -> int: + """Average power consumption in watts, or 0 if not monitored.""" 
if not self.info.energy: return 0 return u32_parse(self.info.energy.ave_watts, on_noval=0) @@ -659,7 +729,8 @@ cdef class Node: return state @property - def state(self): + def state(self) -> str: + """Current node state string, e.g. IDLE, ALLOCATED, DRAIN, DOWN.""" cdef char* state = slurm_node_state_string_complete(self._node_state) state_str = cstr.to_unicode(state) xfree(state) @@ -670,7 +741,8 @@ cdef class Node: self.umsg.node_state=self.info.node_state = _node_state_from_str(val) @property - def next_state(self): + def next_state(self) -> str: + """State the node will transition to after a pending reboot, or None.""" state = self._node_state if ((self.info.next_state != slurm.NO_VAL) and (state & slurm.NODE_STATE_REBOOT_REQUESTED @@ -681,12 +753,14 @@ cdef class Node: return None @property - def cpu_load(self): + def cpu_load(self) -> float: + """CPU load average on the node, or 0.0 if unavailable.""" load = u32_parse(self.info.cpu_load) return load / 100.0 if load is not None else 0.0 @property - def slurmd_port(self): + def slurmd_port(self) -> int: + """TCP port slurmd is listening on.""" return u16_parse(self.info.port) diff --git a/pyslurm/core/partition.pyx b/pyslurm/core/partition.pyx index ad55f7ec..27e61e94 100644 --- a/pyslurm/core/partition.pyx +++ b/pyslurm/core/partition.pyx @@ -50,6 +50,17 @@ from pyslurm.utils.ctime import ( cdef class Partitions(MultiClusterMap): + """A collection of Slurm partitions, keyed by partition name. + + Behaves like a dict. Use `Partitions.load()` to fetch all partitions + from the Slurm controller. + + Examples: + >>> import pyslurm + >>> partitions = pyslurm.Partitions.load() + >>> for name, part in partitions.items(): + ... print(name, part.state, part.total_nodes) + """ def __dealloc__(self): slurm_free_partition_info_msg(self.info) @@ -161,6 +172,40 @@ cdef class Partitions(MultiClusterMap): cdef class Partition: + """A Slurm partition (queue). 
+ + Partitions are logical groupings of nodes with shared scheduling policies, + resource limits, and access controls. Jobs are submitted to a partition. + + Attributes: + name (str): Partition name. + state (str): Partition state: ``UP``, ``DOWN``, ``INACTIVE``, or + ``DRAIN``. + nodes (str): Nodelist expression of all nodes in this partition. + total_nodes (int): Total number of nodes. + total_cpus (int): Total number of CPUs across all nodes. + is_default (bool): True if this is the default partition for + submissions that do not specify a partition. + max_time (int): Maximum job time limit in minutes, or + ``None`` if unlimited. + default_time (int): Default time limit in minutes assigned to jobs + that do not request one, or ``None``. + max_nodes (int): Maximum nodes a single job may request, or ``None``. + min_nodes (int): Minimum nodes a job must request. + allowed_accounts (list): Accounts permitted to submit, or ``None`` + if all are allowed. + allowed_groups (list): Unix groups permitted to submit. + allowed_qos (list): QOS values permitted in this partition. + denied_accounts (list): Accounts explicitly blocked from submitting. + denied_qos (list): QOS values blocked in this partition. + priority_tier (int): Tier used to order this partition relative to + others when a job could run in multiple partitions. 
+ + Examples: + >>> import pyslurm + >>> part = pyslurm.Partitions.load()["debug"] + >>> print(part.state, part.total_nodes, part.max_time) + """ def __cinit__(self): self.ptr = NULL @@ -319,6 +364,7 @@ cdef class Partition: @property def name(self): + """Partition name.""" return cstr.to_unicode(self.ptr.name) @property @@ -331,6 +377,7 @@ cdef class Partition: @property def allowed_submit_nodes(self): + """List of nodes from which jobs may be submitted to this partition.""" return cstr.to_list(self.ptr.allow_alloc_nodes, ["ALL"]) @allowed_submit_nodes.setter @@ -339,6 +386,7 @@ cdef class Partition: @property def allowed_accounts(self): + """List of accounts permitted to submit jobs; None means all accounts allowed.""" return cstr.to_list(self.ptr.allow_accounts, ["ALL"]) @allowed_accounts.setter @@ -347,6 +395,7 @@ cdef class Partition: @property def allowed_groups(self): + """List of Unix groups permitted to submit jobs.""" return cstr.to_list(self.ptr.allow_groups, ["ALL"]) @allowed_groups.setter @@ -355,6 +404,7 @@ cdef class Partition: @property def allowed_qos(self): + """List of QOS values permitted in this partition.""" return cstr.to_list(self.ptr.allow_qos, ["ALL"]) @allowed_qos.setter @@ -363,6 +413,7 @@ cdef class Partition: @property def alternate(self): + """Alternate partition to use if this partition is unavailable.""" return cstr.to_unicode(self.ptr.alternate) @alternate.setter @@ -371,10 +422,12 @@ cdef class Partition: @property def select_type_parameters(self): + """List of select plugin parameters active on this partition.""" return _select_type_int_to_list(self.ptr.cr_type) @property def cpu_binding(self): + """Default CPU binding type applied to jobs in this partition.""" cdef char cpu_bind[128] slurm_sprint_cpu_bind_type(cpu_bind, self.ptr.cpu_bind) @@ -389,6 +442,7 @@ cdef class Partition: @property def default_memory_per_cpu(self): + """Default memory per CPU in MiB for jobs that do not specify memory.""" return 
_get_memory(self.ptr.def_mem_per_cpu, per_cpu=True) @default_memory_per_cpu.setter @@ -398,6 +452,7 @@ cdef class Partition: @property def default_memory_per_node(self): + """Default memory per node in MiB for jobs that do not specify memory.""" return _get_memory(self.ptr.def_mem_per_cpu, per_cpu=False) @default_memory_per_node.setter @@ -406,6 +461,7 @@ cdef class Partition: @property def max_memory_per_cpu(self): + """Maximum memory per CPU in MiB a job may request.""" return _get_memory(self.ptr.max_mem_per_cpu, per_cpu=True) @max_memory_per_cpu.setter @@ -415,6 +471,7 @@ cdef class Partition: @property def max_memory_per_node(self): + """Maximum memory per node in MiB a job may request.""" return _get_memory(self.ptr.max_mem_per_cpu, per_cpu=False) @max_memory_per_node.setter @@ -423,6 +480,7 @@ cdef class Partition: @property def default_time(self): + """Default time limit in minutes assigned to jobs that request none; None if not set.""" return _raw_time(self.ptr.default_time, on_inf=UNLIMITED) @default_time.setter @@ -431,6 +489,7 @@ cdef class Partition: @property def denied_qos(self): + """List of QOS values that are blocked in this partition.""" return cstr.to_list(self.ptr.deny_qos, ["ALL"]) @denied_qos.setter @@ -439,6 +498,7 @@ cdef class Partition: @property def denied_accounts(self): + """List of accounts blocked from submitting to this partition.""" return cstr.to_list(self.ptr.deny_accounts, ["ALL"]) @denied_accounts.setter @@ -447,6 +507,7 @@ cdef class Partition: @property def preemption_grace_time(self): + """Grace period in seconds before a preempted job is killed.""" return _raw_time(self.ptr.grace_time) @preemption_grace_time.setter @@ -455,6 +516,7 @@ cdef class Partition: @property def default_cpus_per_gpu(self): + """Default number of CPUs allocated per GPU for jobs in this partition.""" def_dict = cstr.to_dict(self.ptr.job_defaults_str) if def_dict and "DefCpuPerGpu" in def_dict: return int(def_dict["DefCpuPerGpu"]) @@ -469,6 +531,7 @@ 
cdef class Partition: @property def default_memory_per_gpu(self): + """Default memory in MiB allocated per GPU for jobs in this partition.""" def_dict = cstr.to_dict(self.ptr.job_defaults_str) if def_dict and "DefMemPerGpu" in def_dict: return int(def_dict["DefMemPerGpu"]) @@ -483,6 +546,7 @@ cdef class Partition: @property def max_cpus_per_node(self): + """Maximum CPUs a job may request per node in this partition.""" return u32_parse(self.ptr.max_cpus_per_node) @max_cpus_per_node.setter @@ -491,6 +555,7 @@ cdef class Partition: @property def max_cpus_per_socket(self): + """Maximum CPUs a job may request per socket in this partition.""" return u32_parse(self.ptr.max_cpus_per_socket) @max_cpus_per_socket.setter @@ -499,6 +564,7 @@ cdef class Partition: @property def max_nodes(self): + """Maximum number of nodes a single job may request; None if unlimited.""" return u32_parse(self.ptr.max_nodes) @max_nodes.setter @@ -507,6 +573,7 @@ cdef class Partition: @property def min_nodes(self): + """Minimum number of nodes a job must request.""" return u32_parse(self.ptr.min_nodes, zero_is_noval=False) @min_nodes.setter @@ -515,6 +582,7 @@ cdef class Partition: @property def max_time(self): + """Maximum job time limit in minutes; None if unlimited.""" return _raw_time(self.ptr.max_time, on_inf=UNLIMITED) @max_time.setter @@ -523,6 +591,7 @@ cdef class Partition: @property def oversubscribe(self): + """Oversubscribe setting for this partition, e.g. 
NO, EXCLUSIVE, YES:N, or FORCE:N.""" return _oversubscribe_int_to_str(self.ptr.max_share) @oversubscribe.setter @@ -531,6 +600,7 @@ cdef class Partition: @property def nodes(self): + """Nodelist expression of all nodes in this partition.""" return cstr.to_unicode(self.ptr.nodes) @nodes.setter @@ -539,6 +609,7 @@ cdef class Partition: @property def nodesets(self): + """List of node set names in this partition.""" return cstr.to_list(self.ptr.nodesets) @nodesets.setter @@ -547,6 +618,7 @@ cdef class Partition: @property def over_time_limit(self): + """Minutes jobs may exceed their time limit before being killed; None if not set.""" return u16_parse(self.ptr.over_time_limit) @over_time_limit.setter @@ -555,6 +627,7 @@ cdef class Partition: @property def preempt_mode(self): + """Preemption mode for this partition, e.g. REQUEUE, SUSPEND, OFF.""" return _preempt_mode_int_to_str(self.ptr.preempt_mode, self.slurm_conf) @preempt_mode.setter @@ -563,6 +636,7 @@ cdef class Partition: @property def priority_job_factor(self): + """Job priority weighting factor for this partition.""" return u16_parse(self.ptr.priority_job_factor) @priority_job_factor.setter @@ -571,6 +645,7 @@ cdef class Partition: @property def priority_tier(self): + """Tier used to rank this partition when a job could run in multiple partitions.""" return u16_parse(self.ptr.priority_tier) @priority_tier.setter @@ -579,6 +654,7 @@ cdef class Partition: @property def qos(self): + """QOS associated with this partition.""" return cstr.to_unicode(self.ptr.qos_char) @qos.setter @@ -587,14 +663,17 @@ cdef class Partition: @property def total_cpus(self): + """Total number of CPUs across all nodes in this partition.""" return u32_parse(self.ptr.total_cpus, on_noval=0) @property def total_nodes(self): + """Total number of nodes in this partition.""" return u32_parse(self.ptr.total_nodes, on_noval=0) @property def state(self): + """Partition state: UP, DOWN, INACTIVE, or DRAIN.""" return 
_partition_state_int_to_str(self.ptr.state_up) @state.setter @@ -603,6 +682,7 @@ cdef class Partition: @property def is_default(self): + """True if this is the default partition for jobs that do not specify one.""" return u32_parse_bool_flag(self.ptr.flags, slurm.PART_FLAG_DEFAULT) @is_default.setter @@ -612,6 +692,7 @@ cdef class Partition: @property def allow_root_jobs(self): + """True if root is allowed to submit jobs to this partition.""" return u32_parse_bool_flag(self.ptr.flags, slurm.PART_FLAG_NO_ROOT) @allow_root_jobs.setter @@ -621,6 +702,7 @@ cdef class Partition: @property def is_user_exclusive(self): + """True if nodes are reserved exclusively per user.""" return u32_parse_bool_flag(self.ptr.flags, slurm.PART_FLAG_EXCLUSIVE_USER) @@ -631,6 +713,7 @@ cdef class Partition: @property def is_hidden(self): + """True if this partition is hidden from normal user view.""" return u32_parse_bool_flag(self.ptr.flags, slurm.PART_FLAG_HIDDEN) @is_hidden.setter @@ -640,6 +723,7 @@ cdef class Partition: @property def least_loaded_nodes_scheduling(self): + """True if jobs in this partition prefer least-loaded nodes.""" return u16_parse_bool_flag(self.ptr.flags, slurm.PART_FLAG_LLN) @least_loaded_nodes_scheduling.setter @@ -649,6 +733,7 @@ cdef class Partition: @property def is_root_only(self): + """True if only root may submit jobs to this partition.""" return u32_parse_bool_flag(self.ptr.flags, slurm.PART_FLAG_ROOT_ONLY) @is_root_only.setter @@ -658,6 +743,7 @@ cdef class Partition: @property def requires_reservation(self): + """True if jobs must have a reservation to run in this partition.""" return u32_parse_bool_flag(self.ptr.flags, slurm.PART_FLAG_REQ_RESV) @requires_reservation.setter @@ -667,6 +753,7 @@ cdef class Partition: @property def power_down_on_idle(self): + """True if nodes in this partition are powered down when idle.""" return u32_parse_bool_flag(self.ptr.flags, slurm.PART_FLAG_PDOI) @power_down_on_idle.setter diff --git a/pyslurm/core/reservation.pyx 
b/pyslurm/core/reservation.pyx index fa3099ad..bdbc8bd5 100644 --- a/pyslurm/core/reservation.pyx +++ b/pyslurm/core/reservation.pyx @@ -47,6 +47,17 @@ from pyslurm.core.error import ( cdef class Reservations(MultiClusterMap): + """A collection of Slurm advance reservations, keyed by name. + + Behaves like a dict. Use `Reservations.load()` to fetch all reservations + from the Slurm controller. + + Examples: + >>> import pyslurm + >>> reservations = pyslurm.Reservations.load() + >>> for name, resv in reservations.items(): + ... print(name, resv.nodes, resv.is_active) + """ def __dealloc__(self): slurm_free_reservation_info_msg(self.info) @@ -103,6 +114,50 @@ cdef class Reservations(MultiClusterMap): cdef class Reservation: + """A Slurm advance reservation. + + Advance reservations pre-allocate Slurm resources (nodes, CPUs, licenses) + for a specific time window, preventing regular jobs from using them. Used + for maintenance windows, dedicated allocations, or guaranteed access for + specific users or accounts. + + Attributes: + name (str): Reservation name. + nodes (str): Nodelist expression of reserved nodes. + start_time (int): Unix timestamp when the reservation begins. + end_time (int): Unix timestamp when the reservation ends. + duration (int): Duration of the reservation in minutes. + is_active (bool): True if the reservation is currently active. + users (list): List of usernames allowed to use the reservation. + accounts (list): List of account names allowed to use the reservation. + groups (list): List of Unix groups allowed to use the reservation. + node_count (int): Number of nodes reserved. + cpus (int): Number of cores/CPUs reserved. + flags (pyslurm.ReservationFlags): Flags controlling reservation + behaviour (e.g. ``MAINT``, ``OVERLAP``, ``IGNORE_JOBS``). + partition (str): Partition the reservation is associated with. + licenses (list): List of licenses reserved. + features (list): Node feature constraints for the reservation. 
+ + Examples: + Load a reservation: + + >>> import pyslurm + >>> resv = pyslurm.Reservation.load("maintenance") + >>> print(resv.nodes, resv.start_time, resv.end_time) + + Create a reservation: + + >>> from pyslurm import ReservationFlags, ReservationReoccurrence + >>> resv = pyslurm.Reservation( + ... name="debug", + ... users=["root"], + ... nodes="node001", + ... duration="1-00:00:00", + ... flags=ReservationFlags.MAINTENANCE, + ... ) + >>> resv.create() + """ def __cinit__(self): self.info = NULL @@ -311,6 +366,7 @@ cdef class Reservation: @property def accounts(self): + """List of account names allowed to use this reservation.""" return cstr.to_list(self.info.accounts) @accounts.setter @@ -319,6 +375,7 @@ cdef class Reservation: @property def burst_buffer(self): + """Burst buffer resources reserved.""" return cstr.to_unicode(self.info.burst_buffer) @burst_buffer.setter @@ -327,6 +384,7 @@ cdef class Reservation: @property def comment(self): + """Administrative comment on the reservation.""" return cstr.to_unicode(self.info.comment) @comment.setter @@ -335,6 +393,7 @@ cdef class Reservation: @property def cpus(self): + """Number of cores/CPUs reserved.""" return u32_parse(self.info.core_cnt, zero_is_noval=False) @cpus.setter @@ -343,6 +402,7 @@ cdef class Reservation: @property def cpu_ids_by_node(self): + """Dict mapping node names to core ID ranges reserved on each node.""" out = {} for i in range(self.info.core_spec_cnt): node = cstr.to_unicode(self.info.core_spec[i].node_name) @@ -353,6 +413,7 @@ cdef class Reservation: @property def end_time(self): + """Unix timestamp when the reservation ends.""" return _raw_time(self.info.end_time) @end_time.setter @@ -365,6 +426,7 @@ cdef class Reservation: @property def features(self): + """List of node feature constraints for the reservation.""" return cstr.to_list(self.info.features) @features.setter @@ -373,6 +435,7 @@ cdef class Reservation: @property def groups(self): + """List of Unix groups allowed to use this 
reservation.""" return cstr.to_list(self.info.groups) @groups.setter @@ -381,6 +444,7 @@ cdef class Reservation: @property def licenses(self): + """List of licenses reserved.""" return cstr.to_list(self.info.licenses) @licenses.setter @@ -389,6 +453,7 @@ cdef class Reservation: @property def max_start_delay(self): + """Maximum seconds a job may delay the reservation start; 0 means no delay allowed.""" return u32_parse(self.info.max_start_delay) @max_start_delay.setter @@ -397,6 +462,7 @@ cdef class Reservation: @property def name(self): + """Reservation name.""" return cstr.to_unicode(self.info.name) @name.setter @@ -405,6 +471,7 @@ cdef class Reservation: @property def node_count(self): + """Number of nodes in the reservation.""" return u32_parse(self.info.node_cnt, zero_is_noval=False) @node_count.setter @@ -413,6 +480,7 @@ cdef class Reservation: @property def nodes(self): + """Nodelist expression of reserved nodes.""" return cstr.to_unicode(self.info.node_list) @nodes.setter @@ -421,6 +489,7 @@ cdef class Reservation: @property def partition(self): + """Partition associated with the reservation.""" return cstr.to_unicode(self.info.partition) @partition.setter @@ -429,6 +498,7 @@ cdef class Reservation: @property def purge_time(self): + """Seconds after the reservation ends before it is purged; requires PURGE flag.""" return u32_parse(self.info.purge_comp_time) @purge_time.setter @@ -439,6 +509,7 @@ cdef class Reservation: @property def start_time(self): + """Unix timestamp when the reservation begins.""" return _raw_time(self.info.start_time) @start_time.setter @@ -447,6 +518,7 @@ cdef class Reservation: @property def duration(self): + """Duration of the reservation in minutes (derived from start_time and end_time).""" cdef time_t duration = 0 if self.start_time and self.info.end_time >= self.info.start_time: @@ -464,6 +536,7 @@ cdef class Reservation: @property def is_active(self): + """True if the current time is within the reservation's start and end 
window.""" cdef time_t now = ctime.time(NULL) if self.info.start_time <= now and self.info.end_time >= now: return True @@ -471,6 +544,7 @@ cdef class Reservation: @property def tres(self): + """Dict of TRES reserved, e.g. {"cpu": 16, "mem": "64G"}.""" return cstr.to_dict(self.info.tres_str) @tres.setter @@ -480,6 +554,7 @@ cdef class Reservation: @property def users(self): + """List of usernames allowed to use this reservation.""" return cstr.to_list(self.info.users) @users.setter diff --git a/pyslurm/utils/helpers.pyx b/pyslurm/utils/helpers.pyx index 99cc895a..d4402345 100644 --- a/pyslurm/utils/helpers.pyx +++ b/pyslurm/utils/helpers.pyx @@ -145,6 +145,13 @@ def expand_range_str(range_str): Returns: (list): List of unique values + + Examples: + >>> from pyslurm.utils.helpers import expand_range_str + >>> expand_range_str("1-5,6,7,10-11") + [1, 2, 3, 4, 5, 6, 7, 10, 11] + >>> expand_range_str("3") + [3] """ ret = [] for mrange in range_str.split(","): @@ -236,6 +243,15 @@ def humanize(num, decimals=1): Returns: (str): Humanized number with appropriate suffix. 
+ + Examples: + >>> from pyslurm.utils.helpers import humanize + >>> humanize(1024) + '1.0G' + >>> humanize(800) + '800.0M' + >>> humanize(None) is None + True """ if num is None or num == "unlimited" or num == UNLIMITED: return num diff --git a/scripts/build.sh b/scripts/build.sh index 0992d844..916395b7 100755 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -22,7 +22,7 @@ done shift $((OPTIND-1)) -PY_VER=$(python -c "import sys; v=sys.version_info; print(f'{v.major}.{v.minor}')") +PY_VER=$(python3 -c "import sys; v=sys.version_info; print(f'{v.major}.{v.minor}')") echo "Building with ${OPT_JOBS} cores" export PYSLURM_BUILD_JOBS="$OPT_JOBS" diff --git a/scripts/builddocs.sh b/scripts/builddocs.sh index 23e37ce3..9d0e7d80 100755 --- a/scripts/builddocs.sh +++ b/scripts/builddocs.sh @@ -1,14 +1,18 @@ #!/bin/bash -usage() { echo "Usage: $0 [-j jobs]" 1>&2; exit 1; } +usage() { echo "Usage: $0 [-j jobs] [-s]" 1>&2; exit 1; } OPT_JOBS=${PYSLURM_BUILD_JOBS:-1} +OPT_STRICT="" -while getopts ":j:" o; do +while getopts ":j:s" o; do case "${o}" in j) OPT_JOBS=${OPTARG} ;; + s) + OPT_STRICT="--strict" + ;; *) usage ;; @@ -17,6 +21,6 @@ done shift $((OPTIND-1)) -pip install -r doc_requirements.txt +pip install ".[docs]" scripts/build.sh -j${OPT_JOBS} -d -mkdocs build +mkdocs build ${OPT_STRICT} diff --git a/scripts/griffe_exts.py b/scripts/griffe_exts.py index 06d87164..ca6aa8d8 100644 --- a/scripts/griffe_exts.py +++ b/scripts/griffe_exts.py @@ -21,9 +21,10 @@ import ast import inspect +import pathlib +import re import griffe import pyslurm -import re logger = griffe.get_logger(__name__) SLURM_VERSION = ".".join(pyslurm.__version__.split(".")[:-1]) @@ -38,6 +39,18 @@ "scontrol", ] +slurm_url_pattern = re.compile( + r"\{(" + + "|".join([re.escape(c) for c in config_files]) + + r")" + + r"([#][^}]+)\}" +) + +# Matches: def (self) -> : +_property_annotation_re = re.compile( + r"\bdef\s+{name}\s*\(\s*self\s*\)\s*->\s*([\w\[\], |.]+?)\s*:" +) + def 
replace_with_slurm_docs_url(match): first_part = match.group(1) @@ -46,12 +59,28 @@ def replace_with_slurm_docs_url(match): return f"{ref}({SLURM_DOCS_URL_VERSIONED}/{first_part}.html{second_part})" -pattern = re.compile( - r"\{(" - + "|".join([re.escape(config) for config in config_files]) - + r")" # Match the first word before "#" - + r"([#][^}]+)\}" # Match "#" and everything after it until } -) +def _find_pyx_file(obj: griffe.Object) -> pathlib.Path | None: + """Derive the .pyx source path from an object's dotted path.""" + parts = obj.path.split(".") + for i in range(len(parts), 0, -1): + candidate = pathlib.Path(*parts[:i]).with_suffix(".pyx") + if candidate.exists(): + return candidate + return None + + +def _read_pyx_annotation(obj: griffe.Object) -> str | None: + """Return the -> annotation for a property from the .pyx source, or None.""" + pyx = _find_pyx_file(obj) + if pyx is None: + return None + try: + source = pyx.read_text() + pat = _property_annotation_re.pattern.format(name=re.escape(obj.name)) + m = re.search(pat, source) + return m.group(1) if m else None + except Exception: + return None # This class is inspired from here, with a few adaptions: @@ -78,10 +107,25 @@ def __init__( include_paths: list[str] | None = None, ignore_paths: list[str] | None = None, ) -> None: - self.include_paths = include_paths self.ignore_paths = ignore_paths + def _is_filtered(self, obj: griffe.Object) -> bool: + if self.include_paths and obj.path not in self.include_paths: + return True + if self.ignore_paths and obj.path in self.ignore_paths: + return True + return False + + def _apply_slurm_url_substitution(self, obj: griffe.Object) -> None: + if not obj.docstring: + return + original = obj.docstring.value + if not slurm_url_pattern.search(original): + return + updated = slurm_url_pattern.sub(replace_with_slurm_docs_url, original) + obj.docstring.value = inspect.cleandoc(updated) + def on_instance( self, node: ast.AST | griffe.ObjectNode, @@ -89,23 +133,11 @@ def 
on_instance( agent: griffe.Visitor | griffe.Inspector, **kwargs, ) -> None: - - if (self.include_paths and obj.path not in self.include_paths) or ( - self.ignore_paths and obj.path in self.ignore_paths - ): + if self._is_filtered(obj): return - try: - runtime_obj = griffe.dynamic_import(obj.path) - docstring = runtime_obj.__doc__ - except ImportError: - logger.debug(f"Could not get dynamic docstring for {obj.path}") - return - except AttributeError: - logger.debug(f"Object {obj.path} does not have a __doc__ attribute") - return - - # Hack to improve generated docs for Enums. + # Fix up Enum member display: strip class prefix from value and + # remove labels so they render cleanly in the docs. if hasattr(obj.parent, "bases"): for base in obj.parent.bases: b = base.lower() @@ -113,17 +145,30 @@ def on_instance( v = obj.value[:-1].split(" ")[-1] obj.value = v obj.labels = {} - if "slurmflag" in b: obj.value = None - if not docstring or not obj.docstring: + self._apply_slurm_url_substitution(obj) + + def on_attribute_instance( + self, + node: ast.AST | griffe.ObjectNode, + attr: griffe.Attribute, + agent: griffe.Visitor | griffe.Inspector, + **kwargs, + ) -> None: + if self._is_filtered(attr): return - fmt_docstring = pattern.sub(replace_with_slurm_docs_url, docstring) - if fmt_docstring == docstring: - # No need to update the docstring if nothing has changed + # Cython cdef class properties appear as Attribute objects with a + # "property" label but no annotation, because the C-level descriptor + # has no fget and griffe cannot extract the -> type at runtime. + # Read the annotation directly from the .pyx source instead. 
+ if attr.annotation is not None: + return + if "property" not in attr.labels: return - docstring = inspect.cleandoc(fmt_docstring) - obj.docstring.value = docstring + ann = _read_pyx_annotation(attr) + if ann: + attr.annotation = griffe.ExprName(ann) diff --git a/tests/unit/test_docstrings.py b/tests/unit/test_docstrings.py new file mode 100644 index 00000000..8d53f517 --- /dev/null +++ b/tests/unit/test_docstrings.py @@ -0,0 +1,37 @@ +import doctest +import unittest +from pyslurm.utils import helpers + +_FLAGS = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE + +_MODULES = [helpers] + + +def _make_doctest_method(obj): + def test_method(self): + finder = doctest.DocTestFinder() + runner = doctest.DocTestRunner(optionflags=_FLAGS) + for dt in finder.find(obj, obj.__name__): + runner.run(dt) + failed, _ = runner.summarize() + assert failed == 0 + test_method.__name__ = f"test_{obj.__name__}" + return test_method + + +def _build_suite(module): + attrs = {} + for name in dir(module): + obj = getattr(module, name, None) + if callable(obj) and ">>>" in (getattr(obj, "__doc__", None) or ""): + attrs[f"test_{name}"] = _make_doctest_method(obj) + return type( + f"Docstrings_{module.__name__.split('.')[-1]}", + (unittest.TestCase,), + attrs, + ) + + +for _mod in _MODULES: + _cls = _build_suite(_mod) + globals()[_cls.__name__] = _cls