Module codeflare_sdk.cluster.cluster
The cluster sub-module contains the definition of the Cluster object, which represents the resources requested by the user. It also contains functions for checking the cluster setup queue, a list of all existing clusters, and the user's working namespace.
Expand source code
# Copyright 2022 IBM, Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The cluster sub-module contains the definition of the Cluster object, which represents
the resources requested by the user. It also contains functions for checking the
cluster setup queue, a list of all existing clusters, and the user's working namespace.
"""
import re
from time import sleep
from typing import List, Optional, Tuple, Dict
from kubernetes import config
from ray.job_submission import JobSubmissionClient
from .auth import config_check, api_config_handler
from ..utils import pretty_print
from ..utils.generate_yaml import (
generate_appwrapper,
)
from ..utils.kube_api_helpers import _kube_api_error_handling
from ..utils.generate_yaml import is_openshift_cluster
from .config import ClusterConfiguration
from .model import (
AppWrapper,
AppWrapperStatus,
CodeFlareClusterStatus,
RayCluster,
RayClusterStatus,
)
from kubernetes import client, config
from kubernetes.utils import parse_quantity
import yaml
import os
import requests
from kubernetes import config
from kubernetes.client.rest import ApiException
class Cluster:
"""
An object for requesting, bringing up, and taking down resources.
Can also be used for seeing the resource cluster status and details.
Note that currently, the underlying implementation is a Ray cluster.
"""
def __init__(self, config: ClusterConfiguration):
"""
Create the resource cluster object by passing in a ClusterConfiguration
(defined in the config sub-module). An AppWrapper will then be generated
based off of the configured resources to represent the desired cluster
request.
"""
self.config = config
self.app_wrapper_yaml = self.create_app_wrapper()
self._job_submission_client = None
self.app_wrapper_name = self.config.name
@property
def _client_headers(self):
k8_client = api_config_handler() or client.ApiClient()
return {
"Authorization": k8_client.configuration.get_api_key_with_prefix(
"authorization"
)
}
@property
def _client_verify_tls(self):
if not is_openshift_cluster or not self.config.verify_tls:
return False
return True
@property
def job_client(self):
k8client = api_config_handler() or client.ApiClient()
if self._job_submission_client:
return self._job_submission_client
if is_openshift_cluster():
self._job_submission_client = JobSubmissionClient(
self.cluster_dashboard_uri(),
headers=self._client_headers,
verify=self._client_verify_tls,
)
else:
self._job_submission_client = JobSubmissionClient(
self.cluster_dashboard_uri()
)
return self._job_submission_client
def evaluate_dispatch_priority(self):
priority_class = self.config.dispatch_priority
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
priority_classes = api_instance.list_cluster_custom_object(
group="scheduling.k8s.io",
version="v1",
plural="priorityclasses",
)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
for pc in priority_classes["items"]:
if pc["metadata"]["name"] == priority_class:
return pc["value"]
print(f"Priority class {priority_class} is not available in the cluster")
return None
def validate_image_config(self):
"""
Validates that the image configuration is not empty.
:param image: The image string to validate
:raises ValueError: If the image is not specified
"""
if self.config.image == "" or self.config.image == None:
raise ValueError("Image must be specified in the ClusterConfiguration")
def create_app_wrapper(self):
"""
Called upon cluster object creation, creates an AppWrapper yaml based on
the specifications of the ClusterConfiguration.
"""
if self.config.namespace is None:
self.config.namespace = get_current_namespace()
if self.config.namespace is None:
print("Please specify with namespace=<your_current_namespace>")
elif type(self.config.namespace) is not str:
raise TypeError(
f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication."
)
# Validate image configuration
self.validate_image_config()
# Before attempting to create the cluster AW, let's evaluate the ClusterConfig
if self.config.dispatch_priority:
if not self.config.mcad:
raise ValueError(
"Invalid Cluster Configuration, cannot have dispatch priority without MCAD"
)
priority_val = self.evaluate_dispatch_priority()
if priority_val == None:
raise ValueError(
"Invalid Cluster Configuration, AppWrapper not generated"
)
else:
priority_val = None
name = self.config.name
namespace = self.config.namespace
head_cpus = self.config.head_cpus
head_memory = self.config.head_memory
head_gpus = self.config.head_gpus
min_cpu = self.config.min_cpus
max_cpu = self.config.max_cpus
min_memory = self.config.min_memory
max_memory = self.config.max_memory
gpu = self.config.num_gpus
workers = self.config.num_workers
template = self.config.template
image = self.config.image
instascale = self.config.instascale
mcad = self.config.mcad
instance_types = self.config.machine_types
env = self.config.envs
image_pull_secrets = self.config.image_pull_secrets
dispatch_priority = self.config.dispatch_priority
write_to_file = self.config.write_to_file
verify_tls = self.config.verify_tls
local_queue = self.config.local_queue
labels = self.config.labels
return generate_appwrapper(
name=name,
namespace=namespace,
head_cpus=head_cpus,
head_memory=head_memory,
head_gpus=head_gpus,
min_cpu=min_cpu,
max_cpu=max_cpu,
min_memory=min_memory,
max_memory=max_memory,
gpu=gpu,
workers=workers,
template=template,
image=image,
instascale=instascale,
mcad=mcad,
instance_types=instance_types,
env=env,
image_pull_secrets=image_pull_secrets,
dispatch_priority=dispatch_priority,
priority_val=priority_val,
write_to_file=write_to_file,
verify_tls=verify_tls,
local_queue=local_queue,
labels=labels,
)
# creates a new cluster with the provided or default spec
def up(self):
"""
Applies the AppWrapper yaml, pushing the resource request onto
the MCAD queue.
"""
# check if RayCluster CustomResourceDefinition exists if not throw RuntimeError
self._throw_for_no_raycluster()
namespace = self.config.namespace
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
if self.config.mcad:
if self.config.write_to_file:
with open(self.app_wrapper_yaml) as f:
aw = yaml.load(f, Loader=yaml.FullLoader)
api_instance.create_namespaced_custom_object(
group="workload.codeflare.dev",
version="v1beta1",
namespace=namespace,
plural="appwrappers",
body=aw,
)
else:
aw = yaml.safe_load(self.app_wrapper_yaml)
api_instance.create_namespaced_custom_object(
group="workload.codeflare.dev",
version="v1beta1",
namespace=namespace,
plural="appwrappers",
body=aw,
)
else:
self._component_resources_up(namespace, api_instance)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
def _throw_for_no_raycluster(self):
api_instance = client.CustomObjectsApi(api_config_handler())
try:
api_instance.list_namespaced_custom_object(
group="ray.io",
version="v1",
namespace=self.config.namespace,
plural="rayclusters",
)
except ApiException as e:
if e.status == 404:
raise RuntimeError(
"RayCluster CustomResourceDefinition unavailable contact your administrator."
)
else:
raise RuntimeError(
"Failed to get RayCluster CustomResourceDefinition: " + str(e)
)
def down(self):
"""
Deletes the AppWrapper yaml, scaling-down and deleting all resources
associated with the cluster.
"""
namespace = self.config.namespace
self._throw_for_no_raycluster()
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
if self.config.mcad:
api_instance.delete_namespaced_custom_object(
group="workload.codeflare.dev",
version="v1beta1",
namespace=namespace,
plural="appwrappers",
name=self.app_wrapper_name,
)
else:
self._component_resources_down(namespace, api_instance)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
def status(
self, print_to_console: bool = True
) -> Tuple[CodeFlareClusterStatus, bool]:
"""
Returns the requested cluster's status, as well as whether or not
it is ready for use.
"""
ready = False
status = CodeFlareClusterStatus.UNKNOWN
if self.config.mcad:
# check the app wrapper status
appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
if appwrapper:
if appwrapper.status in [
AppWrapperStatus.RUNNING,
AppWrapperStatus.COMPLETED,
AppWrapperStatus.RUNNING_HOLD_COMPLETION,
]:
ready = False
status = CodeFlareClusterStatus.STARTING
elif appwrapper.status in [
AppWrapperStatus.FAILED,
AppWrapperStatus.DELETED,
]:
ready = False
status = CodeFlareClusterStatus.FAILED # should deleted be separate
return status, ready # exit early, no need to check ray status
elif appwrapper.status in [
AppWrapperStatus.PENDING,
AppWrapperStatus.QUEUEING,
]:
ready = False
if appwrapper.status == AppWrapperStatus.PENDING:
status = CodeFlareClusterStatus.QUEUED
else:
status = CodeFlareClusterStatus.QUEUEING
if print_to_console:
pretty_print.print_app_wrappers_status([appwrapper])
return (
status,
ready,
) # no need to check the ray status since still in queue
# check the ray cluster status
cluster = _ray_cluster_status(self.config.name, self.config.namespace)
if cluster:
if cluster.status == RayClusterStatus.SUSPENDED:
ready = False
status = CodeFlareClusterStatus.SUSPENDED
if cluster.status == RayClusterStatus.UNKNOWN:
ready = False
status = CodeFlareClusterStatus.STARTING
if cluster.status == RayClusterStatus.READY:
ready = True
status = CodeFlareClusterStatus.READY
elif cluster.status in [
RayClusterStatus.UNHEALTHY,
RayClusterStatus.FAILED,
]:
ready = False
status = CodeFlareClusterStatus.FAILED
if print_to_console:
# overriding the number of gpus with requested
cluster.worker_gpu = self.config.num_gpus
pretty_print.print_cluster_status(cluster)
elif print_to_console:
if status == CodeFlareClusterStatus.UNKNOWN:
pretty_print.print_no_resources_found()
else:
pretty_print.print_app_wrappers_status([appwrapper], starting=True)
return status, ready
def is_dashboard_ready(self) -> bool:
try:
response = requests.get(
self.cluster_dashboard_uri(),
headers=self._client_headers,
timeout=5,
verify=self._client_verify_tls,
)
except requests.exceptions.SSLError: # pragma no cover
# SSL exception occurs when oauth ingress has been created but cluster is not up
return False
if response.status_code == 200:
return True
else:
return False
def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True):
"""
Waits for requested cluster to be ready, up to an optional timeout (s).
Checks every five seconds.
"""
print("Waiting for requested resources to be set up...")
time = 0
while True:
if timeout and time >= timeout:
raise TimeoutError(
f"wait() timed out after waiting {timeout}s for cluster to be ready"
)
status, ready = self.status(print_to_console=False)
if status == CodeFlareClusterStatus.UNKNOWN:
print(
"WARNING: Current cluster status is unknown, have you run cluster.up yet?"
)
if ready:
break
sleep(5)
time += 5
print("Requested cluster is up and running!")
while dashboard_check:
if timeout and time >= timeout:
raise TimeoutError(
f"wait() timed out after waiting {timeout}s for dashboard to be ready"
)
if self.is_dashboard_ready():
print("Dashboard is ready!")
break
sleep(5)
time += 5
def details(self, print_to_console: bool = True) -> RayCluster:
cluster = _copy_to_ray(self)
if print_to_console:
pretty_print.print_clusters([cluster])
return cluster
def cluster_uri(self) -> str:
"""
Returns a string containing the cluster's URI.
"""
return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
def cluster_dashboard_uri(self) -> str:
"""
Returns a string containing the cluster's dashboard URI.
"""
config_check()
if is_openshift_cluster():
try:
api_instance = client.CustomObjectsApi(api_config_handler())
routes = api_instance.list_namespaced_custom_object(
group="route.openshift.io",
version="v1",
namespace=self.config.namespace,
plural="routes",
)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
for route in routes["items"]:
if route["metadata"][
"name"
] == f"ray-dashboard-{self.config.name}" or route["metadata"][
"name"
].startswith(
f"{self.config.name}-ingress"
):
protocol = "https" if route["spec"].get("tls") else "http"
return f"{protocol}://{route['spec']['host']}"
else:
try:
api_instance = client.NetworkingV1Api(api_config_handler())
ingresses = api_instance.list_namespaced_ingress(self.config.namespace)
except Exception as e: # pragma no cover
return _kube_api_error_handling(e)
for ingress in ingresses.items:
annotations = ingress.metadata.annotations
protocol = "http"
if (
ingress.metadata.name == f"ray-dashboard-{self.config.name}"
or ingress.metadata.name.startswith(f"{self.config.name}-ingress")
):
if annotations == None:
protocol = "http"
elif "route.openshift.io/termination" in annotations:
protocol = "https"
return f"{protocol}://{ingress.spec.rules[0].host}"
return "Dashboard not available yet, have you run cluster.up()?"
def list_jobs(self) -> List:
"""
This method accesses the head ray node in your cluster and lists the running jobs.
"""
return self.job_client.list_jobs()
def job_status(self, job_id: str) -> str:
"""
This method accesses the head ray node in your cluster and returns the job status for the provided job id.
"""
return self.job_client.get_job_status(job_id)
def job_logs(self, job_id: str) -> str:
"""
This method accesses the head ray node in your cluster and returns the logs for the provided job id.
"""
return self.job_client.get_job_logs(job_id)
def from_k8_cluster_object(
rc,
mcad=True,
write_to_file=False,
verify_tls=True,
):
config_check()
machine_types = (
rc["metadata"]["labels"]["orderedinstance"].split("_")
if "orderedinstance" in rc["metadata"]["labels"]
else []
)
cluster_config = ClusterConfiguration(
name=rc["metadata"]["name"],
namespace=rc["metadata"]["namespace"],
machine_types=machine_types,
num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["cpu"],
max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["cpu"],
min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["memory"],
max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["memory"],
num_gpus=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"]
),
instascale=True if machine_types else False,
image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
0
]["image"],
mcad=mcad,
write_to_file=write_to_file,
verify_tls=verify_tls,
)
return Cluster(cluster_config)
def local_client_url(self):
ingress_domain = _get_ingress_domain(self)
return f"ray://{ingress_domain}"
def _component_resources_up(
self, namespace: str, api_instance: client.CustomObjectsApi
):
if self.config.write_to_file:
with open(self.app_wrapper_yaml) as f:
yamls = list(yaml.load_all(f, Loader=yaml.FullLoader))
for resource in yamls:
enable_ingress = (
resource.get("spec", {})
.get("headGroupSpec", {})
.get("enableIngress")
)
if resource["kind"] == "RayCluster" and enable_ingress is not False:
name = resource["metadata"]["name"]
print(
f"Forbidden: RayCluster '{name}' has 'enableIngress' set to 'True' or is unset."
)
return
_create_resources(yamls, namespace, api_instance)
else:
yamls = yaml.load_all(self.app_wrapper_yaml, Loader=yaml.FullLoader)
_create_resources(yamls, namespace, api_instance)
def _component_resources_down(
self, namespace: str, api_instance: client.CustomObjectsApi
):
cluster_name = self.config.name
if self.config.write_to_file:
with open(self.app_wrapper_yaml) as f:
yamls = yaml.load_all(f, Loader=yaml.FullLoader)
_delete_resources(yamls, namespace, api_instance, cluster_name)
else:
yamls = yaml.safe_load_all(self.app_wrapper_yaml)
_delete_resources(yamls, namespace, api_instance, cluster_name)
def list_all_clusters(namespace: str, print_to_console: bool = True):
"""
Returns (and prints by default) a list of all clusters in a given namespace.
"""
clusters = _get_ray_clusters(namespace)
if print_to_console:
pretty_print.print_clusters(clusters)
return clusters
def list_all_queued(namespace: str, print_to_console: bool = True, mcad: bool = False):
"""
Returns (and prints by default) a list of all currently queued-up Ray Clusters
in a given namespace.
"""
if mcad:
resources = _get_app_wrappers(
namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING]
)
if print_to_console:
pretty_print.print_app_wrappers_status(resources)
else:
resources = _get_ray_clusters(
namespace, filter=[RayClusterStatus.READY, RayClusterStatus.SUSPENDED]
)
if print_to_console:
pretty_print.print_ray_clusters_status(resources)
return resources
def get_current_namespace(): # pragma: no cover
if api_config_handler() != None:
if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
try:
file = open(
"/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r"
)
active_context = file.readline().strip("\n")
return active_context
except Exception as e:
print("Unable to find current namespace")
return None
else:
print("Unable to find current namespace")
return None
else:
try:
_, active_context = config.list_kube_config_contexts(config_check())
except Exception as e:
return _kube_api_error_handling(e)
try:
return active_context["context"]["namespace"]
except KeyError:
return None
def get_cluster(
cluster_name: str,
namespace: str = "default",
write_to_file: bool = False,
verify_tls: bool = True,
):
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
rcs = api_instance.list_namespaced_custom_object(
group="ray.io",
version="v1",
namespace=namespace,
plural="rayclusters",
)
except Exception as e:
return _kube_api_error_handling(e)
for rc in rcs["items"]:
if rc["metadata"]["name"] == cluster_name:
mcad = _check_aw_exists(cluster_name, namespace)
return Cluster.from_k8_cluster_object(
rc,
mcad=mcad,
write_to_file=write_to_file,
verify_tls=verify_tls,
)
raise FileNotFoundError(
f"Cluster {cluster_name} is not found in {namespace} namespace"
)
# private methods
def _delete_resources(
yamls, namespace: str, api_instance: client.CustomObjectsApi, cluster_name: str
):
for resource in yamls:
if resource["kind"] == "RayCluster":
name = resource["metadata"]["name"]
api_instance.delete_namespaced_custom_object(
group="ray.io",
version="v1",
namespace=namespace,
plural="rayclusters",
name=name,
)
def _create_resources(yamls, namespace: str, api_instance: client.CustomObjectsApi):
for resource in yamls:
if resource["kind"] == "RayCluster":
api_instance.create_namespaced_custom_object(
group="ray.io",
version="v1",
namespace=namespace,
plural="rayclusters",
body=resource,
)
def _check_aw_exists(name: str, namespace: str) -> bool:
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
aws = api_instance.list_namespaced_custom_object(
group="workload.codeflare.dev",
version="v1beta1",
namespace=namespace,
plural="appwrappers",
)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e, print_error=False)
for aw in aws["items"]:
if aw["metadata"]["name"] == name:
return True
return False
# Cant test this until get_current_namespace is fixed and placed in this function over using `self`
def _get_ingress_domain(self): # pragma: no cover
config_check()
if self.config.namespace != None:
namespace = self.config.namespace
else:
namespace = get_current_namespace()
domain = None
if is_openshift_cluster():
try:
api_instance = client.CustomObjectsApi(api_config_handler())
routes = api_instance.list_namespaced_custom_object(
group="route.openshift.io",
version="v1",
namespace=namespace,
plural="routes",
)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
for route in routes["items"]:
if (
route["spec"]["port"]["targetPort"] == "client"
or route["spec"]["port"]["targetPort"] == 10001
):
domain = route["spec"]["host"]
else:
try:
api_client = client.NetworkingV1Api(api_config_handler())
ingresses = api_client.list_namespaced_ingress(namespace)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
for ingress in ingresses.items:
if ingress.spec.rules[0].http.paths[0].backend.service.port.number == 10001:
domain = ingress.spec.rules[0].host
return domain
def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]:
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
aws = api_instance.list_namespaced_custom_object(
group="workload.codeflare.dev",
version="v1beta1",
namespace=namespace,
plural="appwrappers",
)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
for aw in aws["items"]:
if aw["metadata"]["name"] == name:
return _map_to_app_wrapper(aw)
return None
def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]:
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
rcs = api_instance.list_namespaced_custom_object(
group="ray.io",
version="v1",
namespace=namespace,
plural="rayclusters",
)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
for rc in rcs["items"]:
if rc["metadata"]["name"] == name:
return _map_to_ray_cluster(rc)
return None
def _get_ray_clusters(
namespace="default", filter: Optional[List[RayClusterStatus]] = None
) -> List[RayCluster]:
list_of_clusters = []
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
rcs = api_instance.list_namespaced_custom_object(
group="ray.io",
version="v1",
namespace=namespace,
plural="rayclusters",
)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
# Get a list of RCs with the filter if it is passed to the function
if filter is not None:
for rc in rcs["items"]:
ray_cluster = _map_to_ray_cluster(rc)
if filter and ray_cluster.status in filter:
list_of_clusters.append(ray_cluster)
else:
for rc in rcs["items"]:
list_of_clusters.append(_map_to_ray_cluster(rc))
return list_of_clusters
def _get_app_wrappers(
namespace="default", filter=List[AppWrapperStatus]
) -> List[AppWrapper]:
list_of_app_wrappers = []
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
aws = api_instance.list_namespaced_custom_object(
group="workload.codeflare.dev",
version="v1beta1",
namespace=namespace,
plural="appwrappers",
)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
for item in aws["items"]:
app_wrapper = _map_to_app_wrapper(item)
if filter and app_wrapper.status in filter:
list_of_app_wrappers.append(app_wrapper)
else:
# Unsure what the purpose of the filter is
list_of_app_wrappers.append(app_wrapper)
return list_of_app_wrappers
def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
if "status" in rc and "state" in rc["status"]:
status = RayClusterStatus(rc["status"]["state"].lower())
else:
status = RayClusterStatus.UNKNOWN
config_check()
dashboard_url = None
if is_openshift_cluster():
try:
api_instance = client.CustomObjectsApi(api_config_handler())
routes = api_instance.list_namespaced_custom_object(
group="route.openshift.io",
version="v1",
namespace=rc["metadata"]["namespace"],
plural="routes",
)
except Exception as e: # pragma: no cover
return _kube_api_error_handling(e)
for route in routes["items"]:
rc_name = rc["metadata"]["name"]
if route["metadata"]["name"] == f"ray-dashboard-{rc_name}" or route[
"metadata"
]["name"].startswith(f"{rc_name}-ingress"):
protocol = "https" if route["spec"].get("tls") else "http"
dashboard_url = f"{protocol}://{route['spec']['host']}"
else:
try:
api_instance = client.NetworkingV1Api(api_config_handler())
ingresses = api_instance.list_namespaced_ingress(
rc["metadata"]["namespace"]
)
except Exception as e: # pragma no cover
return _kube_api_error_handling(e)
for ingress in ingresses.items:
annotations = ingress.metadata.annotations
protocol = "http"
if (
ingress.metadata.name == f"ray-dashboard-{rc['metadata']['name']}"
or ingress.metadata.name.startswith(f"{rc['metadata']['name']}-ingress")
):
if annotations == None:
protocol = "http"
elif "route.openshift.io/termination" in annotations:
protocol = "https"
dashboard_url = f"{protocol}://{ingress.spec.rules[0].host}"
return RayCluster(
name=rc["metadata"]["name"],
status=status,
# for now we are not using autoscaling so same replicas is fine
workers=rc["spec"]["workerGroupSpecs"][0]["replicas"],
worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["memory"],
worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["memory"],
worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
0
]["resources"]["limits"]["cpu"],
worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for
namespace=rc["metadata"]["namespace"],
head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["cpu"],
head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["memory"],
head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"],
dashboard=dashboard_url,
)
def _map_to_app_wrapper(aw) -> AppWrapper:
if "status" in aw and "canrun" in aw["status"]:
return AppWrapper(
name=aw["metadata"]["name"],
status=AppWrapperStatus(aw["status"]["state"].lower()),
can_run=aw["status"]["canrun"],
job_state=aw["status"]["queuejobstate"],
)
return AppWrapper(
name=aw["metadata"]["name"],
status=AppWrapperStatus("queueing"),
can_run=False,
job_state="Still adding to queue",
)
def _copy_to_ray(cluster: Cluster) -> RayCluster:
ray = RayCluster(
name=cluster.config.name,
status=cluster.status(print_to_console=False)[0],
workers=cluster.config.num_workers,
worker_mem_min=cluster.config.min_memory,
worker_mem_max=cluster.config.max_memory,
worker_cpu=cluster.config.min_cpus,
worker_gpu=cluster.config.num_gpus,
namespace=cluster.config.namespace,
dashboard=cluster.cluster_dashboard_uri(),
head_cpus=cluster.config.head_cpus,
head_mem=cluster.config.head_memory,
head_gpu=cluster.config.head_gpus,
)
if ray.status == CodeFlareClusterStatus.READY:
ray.status = RayClusterStatus.READY
return ray
Functions
def get_cluster(cluster_name: str, namespace: str = 'default', write_to_file: bool = False, verify_tls: bool = True)-
Expand source code
def get_cluster( cluster_name: str, namespace: str = "default", write_to_file: bool = False, verify_tls: bool = True, ): try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) rcs = api_instance.list_namespaced_custom_object( group="ray.io", version="v1", namespace=namespace, plural="rayclusters", ) except Exception as e: return _kube_api_error_handling(e) for rc in rcs["items"]: if rc["metadata"]["name"] == cluster_name: mcad = _check_aw_exists(cluster_name, namespace) return Cluster.from_k8_cluster_object( rc, mcad=mcad, write_to_file=write_to_file, verify_tls=verify_tls, ) raise FileNotFoundError( f"Cluster {cluster_name} is not found in {namespace} namespace" ) def get_current_namespace()-
Expand source code
def get_current_namespace(): # pragma: no cover if api_config_handler() != None: if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"): try: file = open( "/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r" ) active_context = file.readline().strip("\n") return active_context except Exception as e: print("Unable to find current namespace") return None else: print("Unable to find current namespace") return None else: try: _, active_context = config.list_kube_config_contexts(config_check()) except Exception as e: return _kube_api_error_handling(e) try: return active_context["context"]["namespace"] except KeyError: return None def list_all_clusters(namespace: str, print_to_console: bool = True)-
Returns (and prints by default) a list of all clusters in a given namespace.
Expand source code
def list_all_clusters(namespace: str, print_to_console: bool = True): """ Returns (and prints by default) a list of all clusters in a given namespace. """ clusters = _get_ray_clusters(namespace) if print_to_console: pretty_print.print_clusters(clusters) return clusters def list_all_queued(namespace: str, print_to_console: bool = True, mcad: bool = False)-
Returns (and prints by default) a list of all currently queued-up Ray Clusters in a given namespace.
Expand source code
def list_all_queued(namespace: str, print_to_console: bool = True, mcad: bool = False): """ Returns (and prints by default) a list of all currently queued-up Ray Clusters in a given namespace. """ if mcad: resources = _get_app_wrappers( namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING] ) if print_to_console: pretty_print.print_app_wrappers_status(resources) else: resources = _get_ray_clusters( namespace, filter=[RayClusterStatus.READY, RayClusterStatus.SUSPENDED] ) if print_to_console: pretty_print.print_ray_clusters_status(resources) return resources
Classes
class Cluster (config: ClusterConfiguration)-
An object for requesting, bringing up, and taking down resources. Can also be used for seeing the resource cluster status and details.
Note that currently, the underlying implementation is a Ray cluster.
Create the resource cluster object by passing in a ClusterConfiguration (defined in the config sub-module). An AppWrapper will then be generated based off of the configured resources to represent the desired cluster request.
Expand source code
class Cluster: """ An object for requesting, bringing up, and taking down resources. Can also be used for seeing the resource cluster status and details. Note that currently, the underlying implementation is a Ray cluster. """ def __init__(self, config: ClusterConfiguration): """ Create the resource cluster object by passing in a ClusterConfiguration (defined in the config sub-module). An AppWrapper will then be generated based off of the configured resources to represent the desired cluster request. """ self.config = config self.app_wrapper_yaml = self.create_app_wrapper() self._job_submission_client = None self.app_wrapper_name = self.config.name @property def _client_headers(self): k8_client = api_config_handler() or client.ApiClient() return { "Authorization": k8_client.configuration.get_api_key_with_prefix( "authorization" ) } @property def _client_verify_tls(self): if not is_openshift_cluster or not self.config.verify_tls: return False return True @property def job_client(self): k8client = api_config_handler() or client.ApiClient() if self._job_submission_client: return self._job_submission_client if is_openshift_cluster(): self._job_submission_client = JobSubmissionClient( self.cluster_dashboard_uri(), headers=self._client_headers, verify=self._client_verify_tls, ) else: self._job_submission_client = JobSubmissionClient( self.cluster_dashboard_uri() ) return self._job_submission_client def evaluate_dispatch_priority(self): priority_class = self.config.dispatch_priority try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) priority_classes = api_instance.list_cluster_custom_object( group="scheduling.k8s.io", version="v1", plural="priorityclasses", ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) for pc in priority_classes["items"]: if pc["metadata"]["name"] == priority_class: return pc["value"] print(f"Priority class {priority_class} is not available in the cluster") return None def validate_image_config(self): """ Validates that the image configuration is not empty. :param image: The image string to validate :raises ValueError: If the image is not specified """ if self.config.image == "" or self.config.image == None: raise ValueError("Image must be specified in the ClusterConfiguration") def create_app_wrapper(self): """ Called upon cluster object creation, creates an AppWrapper yaml based on the specifications of the ClusterConfiguration. """ if self.config.namespace is None: self.config.namespace = get_current_namespace() if self.config.namespace is None: print("Please specify with namespace=<your_current_namespace>") elif type(self.config.namespace) is not str: raise TypeError( f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." ) # Validate image configuration self.validate_image_config() # Before attempting to create the cluster AW, let's evaluate the ClusterConfig if self.config.dispatch_priority: if not self.config.mcad: raise ValueError( "Invalid Cluster Configuration, cannot have dispatch priority without MCAD" ) priority_val = self.evaluate_dispatch_priority() if priority_val == None: raise ValueError( "Invalid Cluster Configuration, AppWrapper not generated" ) else: priority_val = None name = self.config.name namespace = self.config.namespace head_cpus = self.config.head_cpus head_memory = self.config.head_memory head_gpus = self.config.head_gpus min_cpu = self.config.min_cpus max_cpu = self.config.max_cpus min_memory = self.config.min_memory max_memory = self.config.max_memory gpu = self.config.num_gpus workers = self.config.num_workers template = self.config.template image = self.config.image instascale = self.config.instascale mcad = self.config.mcad instance_types = self.config.machine_types env = self.config.envs image_pull_secrets = self.config.image_pull_secrets dispatch_priority = self.config.dispatch_priority write_to_file = self.config.write_to_file verify_tls = self.config.verify_tls local_queue = self.config.local_queue labels = self.config.labels return generate_appwrapper( name=name, namespace=namespace, head_cpus=head_cpus, head_memory=head_memory, head_gpus=head_gpus, min_cpu=min_cpu, max_cpu=max_cpu, min_memory=min_memory, max_memory=max_memory, gpu=gpu, workers=workers, template=template, image=image, instascale=instascale, mcad=mcad, instance_types=instance_types, env=env, image_pull_secrets=image_pull_secrets, dispatch_priority=dispatch_priority, priority_val=priority_val, write_to_file=write_to_file, verify_tls=verify_tls, local_queue=local_queue, labels=labels, ) # creates a new cluster with the provided or default spec def up(self): """ Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue. """ # check if RayCluster CustomResourceDefinition exists if not throw RuntimeError self._throw_for_no_raycluster() namespace = self.config.namespace try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) if self.config.mcad: if self.config.write_to_file: with open(self.app_wrapper_yaml) as f: aw = yaml.load(f, Loader=yaml.FullLoader) api_instance.create_namespaced_custom_object( group="workload.codeflare.dev", version="v1beta1", namespace=namespace, plural="appwrappers", body=aw, ) else: aw = yaml.safe_load(self.app_wrapper_yaml) api_instance.create_namespaced_custom_object( group="workload.codeflare.dev", version="v1beta1", namespace=namespace, plural="appwrappers", body=aw, ) else: self._component_resources_up(namespace, api_instance) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) def _throw_for_no_raycluster(self): api_instance = client.CustomObjectsApi(api_config_handler()) try: api_instance.list_namespaced_custom_object( group="ray.io", version="v1", namespace=self.config.namespace, plural="rayclusters", ) except ApiException as e: if e.status == 404: raise RuntimeError( "RayCluster CustomResourceDefinition unavailable contact your administrator." ) else: raise RuntimeError( "Failed to get RayCluster CustomResourceDefinition: " + str(e) ) def down(self): """ Deletes the AppWrapper yaml, scaling-down and deleting all resources associated with the cluster. """ namespace = self.config.namespace self._throw_for_no_raycluster() try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) if self.config.mcad: api_instance.delete_namespaced_custom_object( group="workload.codeflare.dev", version="v1beta1", namespace=namespace, plural="appwrappers", name=self.app_wrapper_name, ) else: self._component_resources_down(namespace, api_instance) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) def status( self, print_to_console: bool = True ) -> Tuple[CodeFlareClusterStatus, bool]: """ Returns the requested cluster's status, as well as whether or not it is ready for use. """ ready = False status = CodeFlareClusterStatus.UNKNOWN if self.config.mcad: # check the app wrapper status appwrapper = _app_wrapper_status(self.config.name, self.config.namespace) if appwrapper: if appwrapper.status in [ AppWrapperStatus.RUNNING, AppWrapperStatus.COMPLETED, AppWrapperStatus.RUNNING_HOLD_COMPLETION, ]: ready = False status = CodeFlareClusterStatus.STARTING elif appwrapper.status in [ AppWrapperStatus.FAILED, AppWrapperStatus.DELETED, ]: ready = False status = CodeFlareClusterStatus.FAILED # should deleted be separate return status, ready # exit early, no need to check ray status elif appwrapper.status in [ AppWrapperStatus.PENDING, AppWrapperStatus.QUEUEING, ]: ready = False if appwrapper.status == AppWrapperStatus.PENDING: status = CodeFlareClusterStatus.QUEUED else: status = CodeFlareClusterStatus.QUEUEING if print_to_console: pretty_print.print_app_wrappers_status([appwrapper]) return ( status, ready, ) # no need to check the ray status since still in queue # check the ray cluster status cluster = _ray_cluster_status(self.config.name, self.config.namespace) if cluster: if cluster.status == RayClusterStatus.SUSPENDED: ready = False status = CodeFlareClusterStatus.SUSPENDED if cluster.status == RayClusterStatus.UNKNOWN: ready = False status = CodeFlareClusterStatus.STARTING if cluster.status == RayClusterStatus.READY: ready = True status = CodeFlareClusterStatus.READY elif cluster.status in [ RayClusterStatus.UNHEALTHY, RayClusterStatus.FAILED, ]: ready = False status = CodeFlareClusterStatus.FAILED if print_to_console: # overriding the number of gpus with requested cluster.worker_gpu = self.config.num_gpus pretty_print.print_cluster_status(cluster) elif print_to_console: if status == CodeFlareClusterStatus.UNKNOWN: pretty_print.print_no_resources_found() else: pretty_print.print_app_wrappers_status([appwrapper], starting=True) return status, ready def is_dashboard_ready(self) -> bool: try: response = requests.get( self.cluster_dashboard_uri(), headers=self._client_headers, timeout=5, verify=self._client_verify_tls, ) except requests.exceptions.SSLError: # pragma no cover # SSL exception occurs when oauth ingress has been created but cluster is not up return False if response.status_code == 200: return True else: return False def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True): """ Waits for requested cluster to be ready, up to an optional timeout (s). Checks every five seconds. """ print("Waiting for requested resources to be set up...") time = 0 while True: if timeout and time >= timeout: raise TimeoutError( f"wait() timed out after waiting {timeout}s for cluster to be ready" ) status, ready = self.status(print_to_console=False) if status == CodeFlareClusterStatus.UNKNOWN: print( "WARNING: Current cluster status is unknown, have you run cluster.up yet?" ) if ready: break sleep(5) time += 5 print("Requested cluster is up and running!") while dashboard_check: if timeout and time >= timeout: raise TimeoutError( f"wait() timed out after waiting {timeout}s for dashboard to be ready" ) if self.is_dashboard_ready(): print("Dashboard is ready!") break sleep(5) time += 5 def details(self, print_to_console: bool = True) -> RayCluster: cluster = _copy_to_ray(self) if print_to_console: pretty_print.print_clusters([cluster]) return cluster def cluster_uri(self) -> str: """ Returns a string containing the cluster's URI. """ return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001" def cluster_dashboard_uri(self) -> str: """ Returns a string containing the cluster's dashboard URI. """ config_check() if is_openshift_cluster(): try: api_instance = client.CustomObjectsApi(api_config_handler()) routes = api_instance.list_namespaced_custom_object( group="route.openshift.io", version="v1", namespace=self.config.namespace, plural="routes", ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) for route in routes["items"]: if route["metadata"][ "name" ] == f"ray-dashboard-{self.config.name}" or route["metadata"][ "name" ].startswith( f"{self.config.name}-ingress" ): protocol = "https" if route["spec"].get("tls") else "http" return f"{protocol}://{route['spec']['host']}" else: try: api_instance = client.NetworkingV1Api(api_config_handler()) ingresses = api_instance.list_namespaced_ingress(self.config.namespace) except Exception as e: # pragma no cover return _kube_api_error_handling(e) for ingress in ingresses.items: annotations = ingress.metadata.annotations protocol = "http" if ( ingress.metadata.name == f"ray-dashboard-{self.config.name}" or ingress.metadata.name.startswith(f"{self.config.name}-ingress") ): if annotations == None: protocol = "http" elif "route.openshift.io/termination" in annotations: protocol = "https" return f"{protocol}://{ingress.spec.rules[0].host}" return "Dashboard not available yet, have you run cluster.up()?" def list_jobs(self) -> List: """ This method accesses the head ray node in your cluster and lists the running jobs. """ return self.job_client.list_jobs() def job_status(self, job_id: str) -> str: """ This method accesses the head ray node in your cluster and returns the job status for the provided job id. """ return self.job_client.get_job_status(job_id) def job_logs(self, job_id: str) -> str: """ This method accesses the head ray node in your cluster and returns the logs for the provided job id. """ return self.job_client.get_job_logs(job_id) def from_k8_cluster_object( rc, mcad=True, write_to_file=False, verify_tls=True, ): config_check() machine_types = ( rc["metadata"]["labels"]["orderedinstance"].split("_") if "orderedinstance" in rc["metadata"]["labels"] else [] ) cluster_config = ClusterConfiguration( name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], machine_types=machine_types, num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"], min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["requests"]["cpu"], max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["limits"]["cpu"], min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["requests"]["memory"], max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["limits"]["memory"], num_gpus=int( rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ "resources" ]["limits"]["nvidia.com/gpu"] ), instascale=True if machine_types else False, image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["image"], mcad=mcad, write_to_file=write_to_file, verify_tls=verify_tls, ) return Cluster(cluster_config) def local_client_url(self): ingress_domain = _get_ingress_domain(self) return f"ray://{ingress_domain}" def _component_resources_up( self, namespace: str, api_instance: client.CustomObjectsApi ): if self.config.write_to_file: with open(self.app_wrapper_yaml) as f: yamls = list(yaml.load_all(f, Loader=yaml.FullLoader)) for resource in yamls: enable_ingress = ( resource.get("spec", {}) .get("headGroupSpec", {}) .get("enableIngress") ) if resource["kind"] == "RayCluster" and enable_ingress is not False: name = resource["metadata"]["name"] print( f"Forbidden: RayCluster '{name}' has 'enableIngress' set to 'True' or is unset." ) return _create_resources(yamls, namespace, api_instance) else: yamls = yaml.load_all(self.app_wrapper_yaml, Loader=yaml.FullLoader) _create_resources(yamls, namespace, api_instance) def _component_resources_down( self, namespace: str, api_instance: client.CustomObjectsApi ): cluster_name = self.config.name if self.config.write_to_file: with open(self.app_wrapper_yaml) as f: yamls = yaml.load_all(f, Loader=yaml.FullLoader) _delete_resources(yamls, namespace, api_instance, cluster_name) else: yamls = yaml.safe_load_all(self.app_wrapper_yaml) _delete_resources(yamls, namespace, api_instance, cluster_name)Instance variables
var job_client-
Expand source code
@property def job_client(self): k8client = api_config_handler() or client.ApiClient() if self._job_submission_client: return self._job_submission_client if is_openshift_cluster(): self._job_submission_client = JobSubmissionClient( self.cluster_dashboard_uri(), headers=self._client_headers, verify=self._client_verify_tls, ) else: self._job_submission_client = JobSubmissionClient( self.cluster_dashboard_uri() ) return self._job_submission_client
Methods
def cluster_dashboard_uri(self) ‑> str-
Returns a string containing the cluster's dashboard URI.
Expand source code
def cluster_dashboard_uri(self) -> str: """ Returns a string containing the cluster's dashboard URI. """ config_check() if is_openshift_cluster(): try: api_instance = client.CustomObjectsApi(api_config_handler()) routes = api_instance.list_namespaced_custom_object( group="route.openshift.io", version="v1", namespace=self.config.namespace, plural="routes", ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) for route in routes["items"]: if route["metadata"][ "name" ] == f"ray-dashboard-{self.config.name}" or route["metadata"][ "name" ].startswith( f"{self.config.name}-ingress" ): protocol = "https" if route["spec"].get("tls") else "http" return f"{protocol}://{route['spec']['host']}" else: try: api_instance = client.NetworkingV1Api(api_config_handler()) ingresses = api_instance.list_namespaced_ingress(self.config.namespace) except Exception as e: # pragma no cover return _kube_api_error_handling(e) for ingress in ingresses.items: annotations = ingress.metadata.annotations protocol = "http" if ( ingress.metadata.name == f"ray-dashboard-{self.config.name}" or ingress.metadata.name.startswith(f"{self.config.name}-ingress") ): if annotations == None: protocol = "http" elif "route.openshift.io/termination" in annotations: protocol = "https" return f"{protocol}://{ingress.spec.rules[0].host}" return "Dashboard not available yet, have you run cluster.up()?" def cluster_uri(self) ‑> str-
Returns a string containing the cluster's URI.
Expand source code
def cluster_uri(self) -> str: """ Returns a string containing the cluster's URI. """ return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001" def create_app_wrapper(self)-
Called upon cluster object creation, creates an AppWrapper yaml based on the specifications of the ClusterConfiguration.
Expand source code
def create_app_wrapper(self): """ Called upon cluster object creation, creates an AppWrapper yaml based on the specifications of the ClusterConfiguration. """ if self.config.namespace is None: self.config.namespace = get_current_namespace() if self.config.namespace is None: print("Please specify with namespace=<your_current_namespace>") elif type(self.config.namespace) is not str: raise TypeError( f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." ) # Validate image configuration self.validate_image_config() # Before attempting to create the cluster AW, let's evaluate the ClusterConfig if self.config.dispatch_priority: if not self.config.mcad: raise ValueError( "Invalid Cluster Configuration, cannot have dispatch priority without MCAD" ) priority_val = self.evaluate_dispatch_priority() if priority_val == None: raise ValueError( "Invalid Cluster Configuration, AppWrapper not generated" ) else: priority_val = None name = self.config.name namespace = self.config.namespace head_cpus = self.config.head_cpus head_memory = self.config.head_memory head_gpus = self.config.head_gpus min_cpu = self.config.min_cpus max_cpu = self.config.max_cpus min_memory = self.config.min_memory max_memory = self.config.max_memory gpu = self.config.num_gpus workers = self.config.num_workers template = self.config.template image = self.config.image instascale = self.config.instascale mcad = self.config.mcad instance_types = self.config.machine_types env = self.config.envs image_pull_secrets = self.config.image_pull_secrets dispatch_priority = self.config.dispatch_priority write_to_file = self.config.write_to_file verify_tls = self.config.verify_tls local_queue = self.config.local_queue labels = self.config.labels return generate_appwrapper( name=name, namespace=namespace, head_cpus=head_cpus, head_memory=head_memory, head_gpus=head_gpus, min_cpu=min_cpu, max_cpu=max_cpu, min_memory=min_memory, max_memory=max_memory, gpu=gpu, workers=workers, template=template, image=image, instascale=instascale, mcad=mcad, instance_types=instance_types, env=env, image_pull_secrets=image_pull_secrets, dispatch_priority=dispatch_priority, priority_val=priority_val, write_to_file=write_to_file, verify_tls=verify_tls, local_queue=local_queue, labels=labels, ) def details(self, print_to_console: bool = True) ‑> RayCluster-
Expand source code
def details(self, print_to_console: bool = True) -> RayCluster: cluster = _copy_to_ray(self) if print_to_console: pretty_print.print_clusters([cluster]) return cluster def down(self)-
Deletes the AppWrapper yaml, scaling-down and deleting all resources associated with the cluster.
Expand source code
def down(self): """ Deletes the AppWrapper yaml, scaling-down and deleting all resources associated with the cluster. """ namespace = self.config.namespace self._throw_for_no_raycluster() try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) if self.config.mcad: api_instance.delete_namespaced_custom_object( group="workload.codeflare.dev", version="v1beta1", namespace=namespace, plural="appwrappers", name=self.app_wrapper_name, ) else: self._component_resources_down(namespace, api_instance) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) def evaluate_dispatch_priority(self)-
Expand source code
def evaluate_dispatch_priority(self): priority_class = self.config.dispatch_priority try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) priority_classes = api_instance.list_cluster_custom_object( group="scheduling.k8s.io", version="v1", plural="priorityclasses", ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) for pc in priority_classes["items"]: if pc["metadata"]["name"] == priority_class: return pc["value"] print(f"Priority class {priority_class} is not available in the cluster") return None def from_k8_cluster_object(rc, mcad=True, write_to_file=False, verify_tls=True)-
Expand source code
def from_k8_cluster_object( rc, mcad=True, write_to_file=False, verify_tls=True, ): config_check() machine_types = ( rc["metadata"]["labels"]["orderedinstance"].split("_") if "orderedinstance" in rc["metadata"]["labels"] else [] ) cluster_config = ClusterConfiguration( name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], machine_types=machine_types, num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"], min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["requests"]["cpu"], max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["limits"]["cpu"], min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["requests"]["memory"], max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["limits"]["memory"], num_gpus=int( rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ "resources" ]["limits"]["nvidia.com/gpu"] ), instascale=True if machine_types else False, image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["image"], mcad=mcad, write_to_file=write_to_file, verify_tls=verify_tls, ) return Cluster(cluster_config) def is_dashboard_ready(self) ‑> bool-
Expand source code
def is_dashboard_ready(self) -> bool: try: response = requests.get( self.cluster_dashboard_uri(), headers=self._client_headers, timeout=5, verify=self._client_verify_tls, ) except requests.exceptions.SSLError: # pragma no cover # SSL exception occurs when oauth ingress has been created but cluster is not up return False if response.status_code == 200: return True else: return False def job_logs(self, job_id: str) ‑> str-
This method accesses the head ray node in your cluster and returns the logs for the provided job id.
Expand source code
def job_logs(self, job_id: str) -> str: """ This method accesses the head ray node in your cluster and returns the logs for the provided job id. """ return self.job_client.get_job_logs(job_id) def job_status(self, job_id: str) ‑> str-
This method accesses the head ray node in your cluster and returns the job status for the provided job id.
Expand source code
def job_status(self, job_id: str) -> str: """ This method accesses the head ray node in your cluster and returns the job status for the provided job id. """ return self.job_client.get_job_status(job_id) def list_jobs(self) ‑> List[~T]-
This method accesses the head ray node in your cluster and lists the running jobs.
Expand source code
def list_jobs(self) -> List: """ This method accesses the head ray node in your cluster and lists the running jobs. """ return self.job_client.list_jobs() def local_client_url(self)-
Expand source code
def local_client_url(self): ingress_domain = _get_ingress_domain(self) return f"ray://{ingress_domain}" def status(self, print_to_console: bool = True) ‑> Tuple[CodeFlareClusterStatus, bool]-
Returns the requested cluster's status, as well as whether or not it is ready for use.
Expand source code
def status( self, print_to_console: bool = True ) -> Tuple[CodeFlareClusterStatus, bool]: """ Returns the requested cluster's status, as well as whether or not it is ready for use. """ ready = False status = CodeFlareClusterStatus.UNKNOWN if self.config.mcad: # check the app wrapper status appwrapper = _app_wrapper_status(self.config.name, self.config.namespace) if appwrapper: if appwrapper.status in [ AppWrapperStatus.RUNNING, AppWrapperStatus.COMPLETED, AppWrapperStatus.RUNNING_HOLD_COMPLETION, ]: ready = False status = CodeFlareClusterStatus.STARTING elif appwrapper.status in [ AppWrapperStatus.FAILED, AppWrapperStatus.DELETED, ]: ready = False status = CodeFlareClusterStatus.FAILED # should deleted be separate return status, ready # exit early, no need to check ray status elif appwrapper.status in [ AppWrapperStatus.PENDING, AppWrapperStatus.QUEUEING, ]: ready = False if appwrapper.status == AppWrapperStatus.PENDING: status = CodeFlareClusterStatus.QUEUED else: status = CodeFlareClusterStatus.QUEUEING if print_to_console: pretty_print.print_app_wrappers_status([appwrapper]) return ( status, ready, ) # no need to check the ray status since still in queue # check the ray cluster status cluster = _ray_cluster_status(self.config.name, self.config.namespace) if cluster: if cluster.status == RayClusterStatus.SUSPENDED: ready = False status = CodeFlareClusterStatus.SUSPENDED if cluster.status == RayClusterStatus.UNKNOWN: ready = False status = CodeFlareClusterStatus.STARTING if cluster.status == RayClusterStatus.READY: ready = True status = CodeFlareClusterStatus.READY elif cluster.status in [ RayClusterStatus.UNHEALTHY, RayClusterStatus.FAILED, ]: ready = False status = CodeFlareClusterStatus.FAILED if print_to_console: # overriding the number of gpus with requested cluster.worker_gpu = self.config.num_gpus pretty_print.print_cluster_status(cluster) elif print_to_console: if status == CodeFlareClusterStatus.UNKNOWN: pretty_print.print_no_resources_found() else: pretty_print.print_app_wrappers_status([appwrapper], starting=True) return status, ready def up(self)-
Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue.
Expand source code
def up(self): """ Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue. """ # check if RayCluster CustomResourceDefinition exists if not throw RuntimeError self._throw_for_no_raycluster() namespace = self.config.namespace try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) if self.config.mcad: if self.config.write_to_file: with open(self.app_wrapper_yaml) as f: aw = yaml.load(f, Loader=yaml.FullLoader) api_instance.create_namespaced_custom_object( group="workload.codeflare.dev", version="v1beta1", namespace=namespace, plural="appwrappers", body=aw, ) else: aw = yaml.safe_load(self.app_wrapper_yaml) api_instance.create_namespaced_custom_object( group="workload.codeflare.dev", version="v1beta1", namespace=namespace, plural="appwrappers", body=aw, ) else: self._component_resources_up(namespace, api_instance) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) def validate_image_config(self)-
Validates that the image configuration is not empty.
:param image: The image string to validate :raises ValueError: If the image is not specified
Expand source code
def validate_image_config(self): """ Validates that the image configuration is not empty. :param image: The image string to validate :raises ValueError: If the image is not specified """ if self.config.image == "" or self.config.image == None: raise ValueError("Image must be specified in the ClusterConfiguration") def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True)-
Waits for requested cluster to be ready, up to an optional timeout (s). Checks every five seconds.
Expand source code
def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True): """ Waits for requested cluster to be ready, up to an optional timeout (s). Checks every five seconds. """ print("Waiting for requested resources to be set up...") time = 0 while True: if timeout and time >= timeout: raise TimeoutError( f"wait() timed out after waiting {timeout}s for cluster to be ready" ) status, ready = self.status(print_to_console=False) if status == CodeFlareClusterStatus.UNKNOWN: print( "WARNING: Current cluster status is unknown, have you run cluster.up yet?" ) if ready: break sleep(5) time += 5 print("Requested cluster is up and running!") while dashboard_check: if timeout and time >= timeout: raise TimeoutError( f"wait() timed out after waiting {timeout}s for dashboard to be ready" ) if self.is_dashboard_ready(): print("Dashboard is ready!") break sleep(5) time += 5