Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(app): detailed status messages #1289

Merged
merged 3 commits into from
Oct 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion helm-chart/renku-notebooks/requirements.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
dependencies:
- name: amalthea
repository: "https://swissdatasciencecenter.github.io/helm-charts/"
version: "0.5.2"
version: "0.6.0"
- name: certificates
version: "0.0.3"
repository: "https://swissdatasciencecenter.github.io/helm-charts/"
Expand Down
7 changes: 5 additions & 2 deletions renku_notebooks/api/amalthea_patches/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,13 @@ def test(server: "UserServer"):
to fail if the test statements are not correct. This is used to ensure that the
order of containers in the amalthea manifests is what the notebook service expects."""
patches = []
# NOTE: Only the first 1 or 2 containers come "included" from Amalthea, the rest are patched in
# This test checks whether the expected number and order is received from Amalthea and
# does not use all containers.
container_names = (
config.sessions.container_order_registered
config.sessions.containers.registered[:2]
if type(server._user) is RegisteredUser
else config.sessions.container_order_anonymous
else config.sessions.containers.anonymous[:1]
)
for container_ind, container_name in enumerate(container_names):
patches.append(
Expand Down
22 changes: 22 additions & 0 deletions renku_notebooks/api/amalthea_patches/init_containers.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,25 @@ def certificates():
},
]
return patches


def download_image(server: "UserServer"):
    """Return the patch that appends a ``download-image`` init container.

    The init container runs the session's verified image with a shell command
    that exits immediately with status 0. It is the container reported as
    "Downloading server image" in the session status breakdown.

    Args:
        server: The user server whose ``verified_image`` is used as the image
            of the init container.

    Returns:
        A list with a single JSON patch (``application/json-patch+json``) that
        appends the container to the statefulset's ``initContainers``.
    """
    container = client.V1Container(
        name="download-image",
        image=server.verified_image,
        command=["sh", "-c"],
        # NOTE: `sh -c` expects the whole command as ONE string; any further
        # arguments become $0, $1, ... of that command. Passing "exit" and "0"
        # separately would run a bare `exit` with $0 set to "0".
        args=["exit 0"],
    )
    api_client = client.ApiClient()
    return [
        {
            "type": "application/json-patch+json",
            "patch": [
                {
                    "op": "add",
                    "path": "/statefulset/spec/template/spec/initContainers/-",
                    "value": api_client.sanitize_for_serialization(container),
                },
            ],
        },
    ]
1 change: 1 addition & 0 deletions renku_notebooks/api/classes/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ def _get_session_manifest(self):
# init container for certs must come before all other init containers
# so that it runs first before all other init containers
init_containers_patches.certificates(),
init_containers_patches.download_image(self),
init_containers_patches.git_clone(self),
inject_certificates_patches.proxy(self),
)
Expand Down
133 changes: 121 additions & 12 deletions renku_notebooks/api/schemas/servers_get.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from collections import OrderedDict
from datetime import datetime
from enum import Enum

Expand Down Expand Up @@ -30,12 +31,40 @@ def list(cls):
return list(map(lambda c: c.value, cls))


class StepStatusEnum(Enum):
    """Possible statuses of a single step in the session status breakdown."""

    # An init job completely done or container fully running
    ready: str = "ready"
    # Waiting to start
    waiting: str = "waiting"
    # Running but not complete or fully ready
    executing: str = "executing"
    failed: str = "failed"

    @classmethod
    def list(cls):
        """Return the string values of all members, in definition order."""
        return [member.value for member in cls]


class ServerStatusDetail(Schema):
    """A single named step and its status.

    ``status`` must be one of the values of ``StepStatusEnum``.
    """

    step = fields.String(required=True)
    status = fields.String(
        validate=validate.OneOf(StepStatusEnum.list()),
        required=True,
    )


class ServerStatus(Schema):
    """Overall status of a server.

    Holds the overall ``state`` (one of ``ServerStatusEnum``), an optional
    ``message``, the per-step ``details`` and non-negative counts of the
    total and ready containers.
    """

    state = fields.String(
        validate=validate.OneOf(ServerStatusEnum.list()),
        required=True,
    )
    message = fields.String(required=False)
    details = fields.List(
        fields.Nested(ServerStatusDetail),
        required=True,
    )
    # Both counters must be zero or greater.
    totalNumContainers = fields.Integer(
        validate=validate.Range(min=0, min_inclusive=True),
        required=True,
    )
    readyNumContainers = fields.Integer(
        validate=validate.Range(min=0, min_inclusive=True),
        required=True,
    )


class ResourceRequests(Schema):
Expand Down Expand Up @@ -190,7 +219,7 @@ def get_unschedulable_message(pod) -> str:
return msg
reason = sorted_parts[0].lstrip("1234567890 ")
return (
"You session cannot be scheduled due to insufficent resources. "
"Your session cannot be scheduled due to insufficent resources. "
f"The most likely reason is: '{reason}'. You may wait for resources "
"to free up or you can adjust the specific resource and restart your session."
)
Expand Down Expand Up @@ -219,16 +248,88 @@ def get_failed_containers(container_statuses):
]
return failed_containers

def get_starting_message(step_summary):
    """Build the message shown while a session is starting.

    Args:
        step_summary: Iterable of dicts with a ``"step"`` (description) and a
            ``"status"`` key, as produced by the status breakdown.

    Returns:
        A sentence listing (in lower case) every step whose status is not
        ``ready``, or ``None`` when all steps are ready.
    """
    steps_not_ready = [
        step["step"].lower()
        for step in step_summary
        if step["status"] != StepStatusEnum.ready.value
    ]
    if steps_not_ready:
        return f"Steps with non-ready statuses: {', '.join(steps_not_ready)}."
    return None

def get_status_breakdown(js):
    """Summarize the per-container states of a jupyterserver as ordered steps.

    Args:
        js: The jupyterserver manifest as a dictionary.

    Returns:
        A list of ``{"step": <description>, "status": <status>}`` dicts: first
        the known init containers, then the known regular containers, each in
        a fixed display order. Containers without a known description, or
        without an entry in the state summaries, are left out.
    """
    container_states = js.get("status", {}).get("containerStates", {})
    init_states = container_states.get("init", {})
    regular_states = container_states.get("regular", {})

    # Ordered (container name, user-facing description) pairs.
    init_step_descriptions = (
        ("init-certificates", "Initialization"),
        ("download-image", "Downloading server image"),
        ("git-clone", "Cloning and configuring the repository"),
    )
    regular_step_descriptions = (
        ("git-proxy", "Git credentials services"),
        ("oauth2-proxy", "Authentication and proxying services"),
        ("passthrough-proxy", "Proxying services"),
        ("git-sidecar", "Auxiliary session services"),
        ("jupyter-server", "Starting session"),
    )

    state = js.get("status", {}).get("state")
    is_starting = state is None or state == ServerStatusEnum.Starting.value
    if is_starting:
        # NOTE: This means that the server is starting and the statuses are not populated
        # yet, therefore in this case we will use defaults and set all statuses to waiting
        if len(init_states) == 0:
            init_states = {
                name: StepStatusEnum.waiting.value
                for name in config.sessions.init_containers
            }
        if len(regular_states) == 0:
            metadata_annotations = js.get("metadata", {}).get("annotations", {})
            prefix = config.session_get_endpoint_annotations.renku_annotation_prefix
            # The session counts as anonymous only when the user id, the
            # username and the server name all start with "anon-".
            is_user_anonymous = all(
                value.startswith("anon-")
                for value in (
                    metadata_annotations.get(f"{prefix}userId", ""),
                    metadata_annotations.get(f"{prefix}username", ""),
                    js.get("metadata", {}).get("name", ""),
                )
            )
            if is_user_anonymous:
                expected_containers = config.sessions.containers.anonymous
            else:
                expected_containers = config.sessions.containers.registered
            regular_states = {
                name: StepStatusEnum.waiting.value for name in expected_containers
            }

    breakdown = [
        {"step": description, "status": init_states[name]}
        for name, description in init_step_descriptions
        if name in init_states
    ]
    breakdown.extend(
        {"step": description, "status": regular_states[name]}
        for name, description in regular_step_descriptions
        if name in regular_states
    )
    return breakdown

def get_status(js):
"""Get the status of the jupyterserver."""
state = js.get("status", {}).get("state", ServerStatusEnum.Starting.value)
Expand All @@ -237,17 +338,25 @@ def get_status(js):
}
container_statuses = get_all_container_statuses(js)
if state == ServerStatusEnum.Failed.value:
failed_container_statuses = get_failed_containers(container_statuses)
unschedulable_msg = get_unschedulable_message(
js.get("status", {}).get("mainPod", {})
)
if unschedulable_msg:
output["message"] = unschedulable_msg
else:
output["message"] = get_failed_message(
get_failed_containers(container_statuses)
)
output["message"] = get_failed_message(failed_container_statuses)
output["details"] = get_status_breakdown(js)
if state == ServerStatusEnum.Starting.value:
output["message"] = get_starting_message(container_statuses)
output["message"] = get_starting_message(output["details"])
output["totalNumContainers"] = len(output["details"])
output["readyNumContainers"] = len(
[
step
for step in output["details"]
if step["status"] in [StepStatusEnum.ready.value]
]
)
return output

def get_resource_requests(server):
Expand Down
20 changes: 13 additions & 7 deletions renku_notebooks/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,20 +113,26 @@ def get_config(default_config: str) -> _NotebooksConfig:
storage {
pvs_enabled: true
}
containers {
anonymous = [
jupyter-server,
passthrough-proxy,
git-proxy,
]
registered = [
jupyter-server,
oauth2-proxy,
git-proxy,
git-sidecar,
]
}
enforce_cpu_limits: false
autosave_minimum_lfs_file_size_bytes: 1000000
termination_grace_period_seconds: 600
image_default_workdir: /home/jovyan
node_selector: "{}"
affinity: "{}"
tolerations: "[]"
container_order_anonymous = [
jupyter-server
]
container_order_registered = [
jupyter-server
oauth2-proxy
]
}
amalthea {
group = amalthea.dev
Expand Down
19 changes: 11 additions & 8 deletions renku_notebooks/config/dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,12 @@ class _SessionCullingConfig:
registered: _GenericCullingConfig


@dataclass
class _SessionContainers:
    """Container name lists for anonymous and registered user sessions."""

    # Container names for sessions of anonymous users
    anonymous: List[Text]
    # Container names for sessions of registered users
    registered: List[Text]


@dataclass
class _SessionConfig:
culling: _SessionCullingConfig
Expand All @@ -182,6 +188,7 @@ class _SessionConfig:
ca_certs: _CustomCaCertsConfig
oidc: _SessionOidcConfig
storage: _SessionStorageConfig
containers: _SessionContainers
default_image: Text = "renku/singleuser:latest"
enforce_cpu_limits: Union[Text, bool] = False
autosave_minimum_lfs_file_size_bytes: Union[int, Text] = 1000000
Expand All @@ -190,15 +197,11 @@ class _SessionConfig:
node_selector: Text = "{}"
affinity: Text = "{}"
tolerations: Text = "[]"
container_order_anonymous: List[Text] = field(
default_factory=lambda: [
"jupyter-server",
]
)
container_order_registered: List[Text] = field(
init_containers: List[Text] = field(
default_factory=lambda: [
"jupyter-server",
"oauth2-proxy",
"init-certificates",
"download-image",
"git-clone",
]
)

Expand Down