Skip to content

Commit 3787545

Browse files
authored
Kueue job priorities (#13)
* fix: Don't build container images when submitting to Ray cluster * deps: Upgrade Ray to 2.11.0 * feat(kueue): Implement scheduling options, including priority classes Jobs can now specify `SchedulingOptions`, which include the Kueue local queue and priority class names. Adds example priority classes (background, development, production) and preemption options to the cluster queue manifest. A sample "production" job has been added to simplify the demo of preemption and priority based workload ordering in Kueue. * feat(kueue): Validate existence of local queues and workload priority classes * fix: Make all job options optional by default * chore: Split up the Runner module The `runner.py` module was growing to a point where readability was hampered. This commit splits it up into the `jobs.runner` package, with submodules grouped by execution platform. Also, introduces a `KubernetesNamespaceMixin` to simplify determining the current or desired namespace for a workload. * fix: Canonicalize all paths in Docker image options
1 parent 01a51f2 commit 3787545

16 files changed

+409
-351
lines changed

example_hello.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
2+
import time
23

3-
from jobs import ImageOptions, JobOptions, ResourceOptions, job
4+
from jobs import ImageOptions, JobOptions, ResourceOptions, SchedulingOptions, job
45
from jobs.cli import submit_job
56

67

@@ -9,14 +10,18 @@
910
# A job with explicit Dockerfile
1011
image=ImageOptions(
1112
dockerfile="Dockerfile",
12-
name="hello-world-yaml",
13+
name="hello-world-dev",
1314
tag="latest",
1415
),
15-
resources=ResourceOptions(memory="2Gi", cpu="1", gpu=None),
16+
resources=ResourceOptions(memory="256Mi", cpu="1"),
17+
scheduling=SchedulingOptions(
18+
priority_class="background",
19+
),
1620
)
1721
)
1822
def hello_world():
1923
print("Hello, World!")
24+
time.sleep(60)
2025

2126

2227
if __name__ == "__main__":

example_hello_prod.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import logging
2+
import time
3+
4+
from jobs import ImageOptions, JobOptions, ResourceOptions, SchedulingOptions, job
5+
from jobs.cli import submit_job
6+
7+
8+
@job(
9+
options=JobOptions(
10+
# A job with explicit Dockerfile
11+
image=ImageOptions(dockerfile="Dockerfile"),
12+
resources=ResourceOptions(memory="256Mi", cpu="4"),
13+
scheduling=SchedulingOptions(
14+
priority_class="production",
15+
),
16+
)
17+
)
18+
def prod_training():
19+
print("Hello, World!")
20+
time.sleep(60)
21+
22+
23+
if __name__ == "__main__":
24+
logging.basicConfig(level=logging.DEBUG)
25+
logging.getLogger("urllib3.connectionpool").setLevel(logging.INFO)
26+
27+
submit_job(prod_training)

raycluster-manifest.yaml

-92
This file was deleted.

raycluster-values.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
image:
2-
tag: 2.9.0-py310
2+
tag: 2.11.0-py311
33
worker:
44
maxReplicas: 1
55
resources:

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ pyyaml==6.0.1
171171
# kubernetes
172172
# pre-commit
173173
# ray
174-
ray==2.10.0
174+
ray==2.11.0
175175
referencing==0.34.0
176176
# via
177177
# jsonschema

single-clusterqueue-setup.yaml

+28
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,13 @@ metadata:
99
name: "cluster-queue"
1010
spec:
1111
namespaceSelector: {} # match all.
12+
# Allow preemption of workloads by higher priority ones
13+
preemption:
14+
reclaimWithinCohort: Any
15+
borrowWithinCohort:
16+
policy: LowerPriority
17+
maxPriorityThreshold: 100
18+
withinClusterQueue: LowerPriority
1219
resourceGroups:
1320
- coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
1421
flavors:
@@ -28,3 +35,24 @@ metadata:
2835
name: "user-queue"
2936
spec:
3037
clusterQueue: "cluster-queue"
38+
---
39+
apiVersion: kueue.x-k8s.io/v1beta1
40+
kind: WorkloadPriorityClass
41+
metadata:
42+
name: background
43+
value: 1
44+
description: "Background (=lowest) priority"
45+
---
46+
apiVersion: kueue.x-k8s.io/v1beta1
47+
kind: WorkloadPriorityClass
48+
metadata:
49+
name: development
50+
value: 100
51+
description: "Development priority"
52+
---
53+
apiVersion: kueue.x-k8s.io/v1beta1
54+
kind: WorkloadPriorityClass
55+
metadata:
56+
name: production
57+
value: 1000
58+
description: "Production priority"

src/jobs/__init__.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
from jobs import assembler
22
from jobs.image import Image
3-
from jobs.job import ImageOptions, Job, JobOptions, ResourceOptions, job
3+
from jobs.job import (
4+
ImageOptions,
5+
Job,
6+
JobOptions,
7+
ResourceOptions,
8+
SchedulingOptions,
9+
job,
10+
)
411

512
__all__ = [
613
Image,
714
Job,
815
JobOptions,
916
ImageOptions,
1017
ResourceOptions,
18+
SchedulingOptions,
1119
job,
1220
assembler,
1321
]

src/jobs/cli.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def submit_job(job: Job) -> None:
5353
logging.debug(f"Execution mode: {mode}")
5454

5555
image: Image | None = None
56-
if mode in [ExecutionMode.DOCKER, ExecutionMode.KUEUE, ExecutionMode.RAYCLUSTER]:
56+
if mode in [ExecutionMode.DOCKER, ExecutionMode.KUEUE]:
5757
image = job.build_image()
5858
if image is None:
5959
raise RuntimeError("Could not build container image")

src/jobs/job.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,15 @@ def to_ray(self) -> RayResourceOptions:
9292
return remove_none_values(options)
9393

9494

95+
@dataclass(frozen=True)
96+
class SchedulingOptions:
97+
priority_class: str | None = None
98+
"""Kueue priority class name"""
99+
100+
queue_name: str | None = None
101+
"""Kueue local queue name"""
102+
103+
95104
@dataclass(frozen=True)
96105
class ImageOptions:
97106
name: str | None = None
@@ -112,7 +121,7 @@ def build_mode(self) -> BuildMode:
112121
def _to_pathlib(self, attr: str) -> None:
113122
val = self.__getattribute__(attr)
114123
if isinstance(val, str):
115-
object.__setattr__(self, attr, Path(val).absolute())
124+
object.__setattr__(self, attr, Path(val).resolve())
116125

117126
def __post_init__(self) -> None:
118127
def _is_yaml(path: AnyPath) -> bool:
@@ -136,14 +145,17 @@ def _is_yaml(path: AnyPath) -> bool:
136145
raise ValueError(f"Build context must be a directory: {self.build_context}")
137146

138147
if self.dockerfile and not self.dockerfile.is_relative_to(self.build_context):
139-
raise ValueError("Dockerfile must be relative to build context")
148+
raise ValueError(
149+
f"Dockerfile must be relative to build context {self.build_context}"
150+
)
140151

141152

142153
@dataclass(frozen=True)
143154
class JobOptions:
144-
resources: ResourceOptions | None
155+
resources: ResourceOptions | None = None
145156
"""Resource requests for this job in Kubernetes format (see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes)"""
146-
image: ImageOptions | None
157+
image: ImageOptions | None = None
158+
scheduling: SchedulingOptions | None = None
147159

148160

149161
class Job:

0 commit comments

Comments
 (0)