Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Warn in jobby list if there are failed pods to a Kueue workload #89

Merged
merged 4 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/src/jobs_server/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ class WorkloadMetadata(BaseModel):
termination_timestamp: datetime.datetime | None = None
was_evicted: bool = False
was_inadmissible: bool = False
has_failed_pods: bool = False

@classmethod
def from_kueue_workload(cls, workload: KueueWorkload) -> Self:
Expand All @@ -126,6 +127,7 @@ def from_kueue_workload(cls, workload: KueueWorkload) -> Self:
termination_timestamp=workload.termination_timestamp,
was_evicted=workload.was_evicted,
was_inadmissible=workload.was_inadmissible,
has_failed_pods=workload.has_failed_pods,
)


Expand Down
3 changes: 3 additions & 0 deletions backend/src/jobs_server/services/k8s.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ def list_workloads(self, namespace: str | None = None) -> list[KueueWorkload]:
namespace=namespace or self.namespace,
plural="workloads",
)
self._core_v1_api.list_namespaced_pod(
namespace=namespace or self.namespace,
)
return [
KueueWorkload.model_validate(workload)
for workload in workloads.get("items", [])
Expand Down
4 changes: 4 additions & 0 deletions backend/src/jobs_server/utils/kueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,10 @@ def pods(self) -> list[client.V1Pod]:
).items
return pods

@property
def has_failed_pods(self) -> bool:
    """Return ``True`` if at least one pod of this workload is in the ``Failed`` phase.

    Iterates over the workload's pods (via the ``pods`` property) and
    short-circuits on the first pod whose ``status.phase`` is ``"Failed"``.
    """
    for pod in self.pods:
        if pod.status.phase == "Failed":
            return True
    return False

def stop(self, k8s: "KubernetesService") -> None:
if not self.managed_resource:
raise RuntimeError(
Expand Down
3 changes: 3 additions & 0 deletions backend/tests/e2e/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ def test_job_lifecycle(
assert str(status.managed_resource_id) == managed_resource_id.uid
assert status.execution_status != JobStatus.FAILED
assert status.kueue_status is not None and status.kueue_status.conditions != []
assert not status.was_evicted
assert not status.was_inadmissible
assert not status.has_failed_pods

if status.execution_status != JobStatus.PENDING:
break
Expand Down
7 changes: 7 additions & 0 deletions backend/tests/integration/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ def test_success(
mocker.patch.object(
KueueWorkload, "for_managed_resource", return_value=workload
)
# FIXME: This prevents the pod listing API call, which doesn't work in an integration
# test scenario.
mocker.patch.object(KueueWorkload, "has_failed_pods", return_value=False)

response = client.get(f"/jobs/{workload.metadata.uid}/status")

Expand Down Expand Up @@ -273,6 +276,10 @@ def test_list_jobs(
return_value=[workload],
)

# FIXME: This prevents the pod listing API call, which doesn't work in an integration
# test scenario.
mocker.patch.object(KueueWorkload, "has_failed_pods", return_value=False)

response = client.get("/jobs?include_metadata=true")

assert response.is_success
Expand Down
3 changes: 3 additions & 0 deletions client/src/cli/commands/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ def format_status(s: JobStatus) -> str:
def status_flags(wl: openapi_client.WorkloadMetadata) -> str:
    """Return a rich-markup warning marker for a workload row in the job list.

    Yellow ``[!]`` when the workload was evicted or inadmissible, red ``[!]``
    when it has failed pods but has not (yet) failed as a whole, otherwise
    an empty string.
    """
    if wl.was_evicted or wl.was_inadmissible:
        return "[bright_yellow] [!][/]"
    # A job that has already failed needs no additional warning about its pods.
    if wl.has_failed_pods and wl.execution_status != JobStatus.FAILED:
        return "[bright_red] [!][/]"
    return ""

Expand Down
5 changes: 5 additions & 0 deletions client/src/openapi_client/models/workload_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class WorkloadMetadata(BaseModel):
termination_timestamp: datetime | None = None
was_evicted: StrictBool | None = False
was_inadmissible: StrictBool | None = False
has_failed_pods: StrictBool | None = False
__properties: ClassVar[list[str]] = [
"managed_resource_id",
"execution_status",
Expand All @@ -49,6 +50,7 @@ class WorkloadMetadata(BaseModel):
"termination_timestamp",
"was_evicted",
"was_inadmissible",
"has_failed_pods",
]

model_config = ConfigDict(
Expand Down Expand Up @@ -139,5 +141,8 @@ def from_dict(cls, obj: dict[str, Any] | None) -> Self | None:
"was_inadmissible": obj.get("was_inadmissible")
if obj.get("was_inadmissible") is not None
else False,
"has_failed_pods": obj.get("has_failed_pods")
if obj.get("has_failed_pods") is not None
else False,
})
return _obj