diff --git a/backend/src/jobs_server/models.py b/backend/src/jobs_server/models.py index 9e80eeee..0f77c408 100644 --- a/backend/src/jobs_server/models.py +++ b/backend/src/jobs_server/models.py @@ -111,6 +111,7 @@ class WorkloadMetadata(BaseModel): termination_timestamp: datetime.datetime | None = None was_evicted: bool = False was_inadmissible: bool = False + has_failed_pods: bool = False @classmethod def from_kueue_workload(cls, workload: KueueWorkload) -> Self: @@ -126,6 +127,7 @@ def from_kueue_workload(cls, workload: KueueWorkload) -> Self: termination_timestamp=workload.termination_timestamp, was_evicted=workload.was_evicted, was_inadmissible=workload.was_inadmissible, + has_failed_pods=workload.has_failed_pods, ) diff --git a/backend/src/jobs_server/services/k8s.py b/backend/src/jobs_server/services/k8s.py index ec9b6a76..bb5fca54 100644 --- a/backend/src/jobs_server/services/k8s.py +++ b/backend/src/jobs_server/services/k8s.py @@ -122,6 +122,9 @@ def list_workloads(self, namespace: str | None = None) -> list[KueueWorkload]: namespace=namespace or self.namespace, plural="workloads", ) + self._core_v1_api.list_namespaced_pod( + namespace=namespace or self.namespace, + ) return [ KueueWorkload.model_validate(workload) for workload in workloads.get("items", []) diff --git a/backend/src/jobs_server/utils/kueue.py b/backend/src/jobs_server/utils/kueue.py index c5a2be77..0e2bb512 100644 --- a/backend/src/jobs_server/utils/kueue.py +++ b/backend/src/jobs_server/utils/kueue.py @@ -247,6 +247,10 @@ def pods(self) -> list[client.V1Pod]: ).items return pods + @property + def has_failed_pods(self) -> bool: + return any(p.status.phase == "Failed" for p in self.pods) + def stop(self, k8s: "KubernetesService") -> None: if not self.managed_resource: raise RuntimeError( diff --git a/backend/tests/e2e/test_jobs.py b/backend/tests/e2e/test_jobs.py index 23568174..79e44808 100644 --- a/backend/tests/e2e/test_jobs.py +++ b/backend/tests/e2e/test_jobs.py @@ -67,6 +67,9 @@ def 
test_job_lifecycle( assert str(status.managed_resource_id) == managed_resource_id.uid assert status.execution_status != JobStatus.FAILED assert status.kueue_status is not None and status.kueue_status.conditions != [] + assert not status.was_evicted + assert not status.was_inadmissible + assert not status.has_failed_pods if status.execution_status != JobStatus.PENDING: break diff --git a/backend/tests/integration/test_jobs.py b/backend/tests/integration/test_jobs.py index e5f621b3..efc3985d 100644 --- a/backend/tests/integration/test_jobs.py +++ b/backend/tests/integration/test_jobs.py @@ -124,6 +124,9 @@ def test_success( mocker.patch.object( KueueWorkload, "for_managed_resource", return_value=workload ) + # FIXME: This prevents the pod listing API call, which doesn't work in an integration + # test scenario. + mocker.patch.object(KueueWorkload, "has_failed_pods", return_value=False) response = client.get(f"/jobs/{workload.metadata.uid}/status") @@ -273,6 +276,10 @@ def test_list_jobs( return_value=[workload], ) + # FIXME: This prevents the pod listing API call, which doesn't work in an integration + # test scenario. + mocker.patch.object(KueueWorkload, "has_failed_pods", return_value=False) + response = client.get("/jobs?include_metadata=true") assert response.is_success diff --git a/client/src/cli/commands/list.py b/client/src/cli/commands/list.py index c888bd0f..ee413c1d 100644 --- a/client/src/cli/commands/list.py +++ b/client/src/cli/commands/list.py @@ -30,6 +30,9 @@ def format_status(s: JobStatus) -> str: def status_flags(wl: openapi_client.WorkloadMetadata) -> str: if wl.was_evicted or wl.was_inadmissible: return "[bright_yellow] [!][/]" + # if the job is already failed, we don't really need to warn anymore. 
+ elif wl.has_failed_pods and wl.execution_status != JobStatus.FAILED: + return "[bright_red] [!][/]" else: return "" diff --git a/client/src/openapi_client/models/workload_metadata.py b/client/src/openapi_client/models/workload_metadata.py index 34b87c75..6e4be7d8 100644 --- a/client/src/openapi_client/models/workload_metadata.py +++ b/client/src/openapi_client/models/workload_metadata.py @@ -39,6 +39,7 @@ class WorkloadMetadata(BaseModel): termination_timestamp: datetime | None = None was_evicted: StrictBool | None = False was_inadmissible: StrictBool | None = False + has_failed_pods: StrictBool | None = False __properties: ClassVar[list[str]] = [ "managed_resource_id", "execution_status", @@ -49,6 +50,7 @@ class WorkloadMetadata(BaseModel): "termination_timestamp", "was_evicted", "was_inadmissible", + "has_failed_pods", ] model_config = ConfigDict( @@ -139,5 +141,8 @@ def from_dict(cls, obj: dict[str, Any] | None) -> Self | None: "was_inadmissible": obj.get("was_inadmissible") if obj.get("was_inadmissible") is not None else False, + "has_failed_pods": obj.get("has_failed_pods") + if obj.get("has_failed_pods") is not None + else False, }) return _obj