Module codeflare_sdk.utils.generate_yaml
This sub-module exists primarily to be used internally by the Cluster object (in the cluster sub-module) for AppWrapper generation.
Expand source code
# Copyright 2022 IBM, Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This sub-module exists primarily to be used internally by the Cluster object
(in the cluster sub-module) for AppWrapper generation.
"""
import yaml
import sys
import argparse
import uuid
from kubernetes import client, config
from .kube_api_helpers import _kube_api_error_handling
from ..cluster.auth import api_config_handler, config_check
from os import urandom
from base64 import b64encode
from urllib3.util import parse_url
from kubernetes import client, config
from .kube_api_helpers import _get_api_host
def read_template(template):
    """Load a YAML template file and return it as Python objects.

    On a YAML parse error the exception is printed and None is returned
    (callers receive no document rather than an exception).
    """
    with open(template, "r") as template_file:
        try:
            parsed = yaml.safe_load(template_file)
        except yaml.YAMLError as parse_error:
            print(parse_error)
            return None
        return parsed
def gen_names(name):
    """Return an (appwrapper_name, cluster_name) pair.

    When a name is supplied it is used for both; otherwise unique names
    sharing a single random UUID suffix are generated.
    """
    if name:
        return name, name
    unique_suffix = str(uuid.uuid4())
    return f"appwrapper-{unique_suffix}", f"cluster-{unique_suffix}"
def gen_dashboard_ingress_name(cluster_name):
    """Return the canonical ingress name for a cluster's Ray dashboard."""
    return "ray-dashboard-" + cluster_name
# Check if the ingress api cluster resource exists
def is_openshift_cluster():
    """Return True when the cluster exposes the OpenShift ingress config API.

    Probes ingresses.config.openshift.io/cluster; a 404/403 (or any other
    API error, after logging) is treated as "not OpenShift".
    """
    try:
        config_check()
        custom_api = client.CustomObjectsApi(api_config_handler())
        custom_api.get_cluster_custom_object(
            "config.openshift.io", "v1", "ingresses", "cluster"
        )
    except client.ApiException as e:  # pragma: no cover
        if e.status not in (404, 403):
            print(f"Error detecting cluster type defaulting to Kubernetes: {e}")
        return False
    return True
def update_dashboard_ingress(
    ingress_item, cluster_name, namespace, ingress_options, ingress_domain
):  # pragma: no cover
    """Configure the Ray dashboard ingress resource of the AppWrapper in place.

    If ``ingress_options`` is non-empty, the entry whose port is 8265 (the Ray
    dashboard port) drives the ingress metadata/spec; required keys are
    validated, and each absent optional key causes the template default to be
    deleted. Otherwise a default ingress is generated, with its host taken
    from the OpenShift cluster ingress domain or, on plain Kubernetes, from
    the required ``ingress_domain`` argument.

    Raises:
        ValueError: when an ingress option is malformed, or when
            ``ingress_domain`` is None on a non-OpenShift cluster.
    """
    metadata = ingress_item.get("generictemplate", {}).get("metadata")
    spec = ingress_item.get("generictemplate", {}).get("spec")
    if ingress_options != {}:
        for index, ingress_option in enumerate(ingress_options["ingresses"]):
            # Validate the user-supplied entry before applying anything.
            if "ingressName" not in ingress_option.keys():
                raise ValueError(
                    f"Error: 'ingressName' is missing or empty for ingress item at index {index}"
                )
            if "port" not in ingress_option.keys():
                raise ValueError(
                    f"Error: 'port' is missing or empty for ingress item at index {index}"
                )
            elif not isinstance(ingress_option["port"], int):
                raise ValueError(
                    f"Error: 'port' is not of type int for ingress item at index {index}"
                )
            if ingress_option["port"] == 8265:
                # 8265 is the Ray dashboard port, so this entry configures
                # the dashboard ingress.
                metadata["name"] = ingress_option["ingressName"]
                metadata["namespace"] = namespace
                # For each optional key: apply it when present, otherwise
                # drop the template's default value.
                if "annotations" not in ingress_option.keys():
                    del metadata["annotations"]
                else:
                    metadata["annotations"] = ingress_option["annotations"]
                if "path" not in ingress_option.keys():
                    del spec["rules"][0]["http"]["paths"][0]["path"]
                else:
                    spec["rules"][0]["http"]["paths"][0]["path"] = ingress_option[
                        "path"
                    ]
                if "pathType" not in ingress_option.keys():
                    spec["rules"][0]["http"]["paths"][0][
                        "pathType"
                    ] = "ImplementationSpecific"
                if "host" not in ingress_option.keys():
                    del spec["rules"][0]["host"]
                else:
                    spec["rules"][0]["host"] = ingress_option["host"]
                if "ingressClassName" not in ingress_option.keys():
                    del spec["ingressClassName"]
                else:
                    spec["ingressClassName"] = ingress_option["ingressClassName"]
                # Always route to the cluster head service.
                spec["rules"][0]["http"]["paths"][0]["backend"]["service"][
                    "name"
                ] = f"{cluster_name}-head-svc"
    else:
        metadata["name"] = f"ray-dashboard-{cluster_name}"
        metadata["namespace"] = namespace
        spec["rules"][0]["http"]["paths"][0]["backend"]["service"][
            "name"
        ] = f"{cluster_name}-head-svc"
        if is_openshift_cluster():
            try:
                config_check()
                api_client = client.CustomObjectsApi(api_config_handler())
                # Read the cluster-wide ingress config to obtain the domain.
                ingress = api_client.get_cluster_custom_object(
                    "config.openshift.io", "v1", "ingresses", "cluster"
                )
                # OpenShift routing does not use the template's ingressClassName.
                del spec["ingressClassName"]
            except Exception as e:  # pragma: no cover
                return _kube_api_error_handling(e)
            domain = ingress["spec"]["domain"]
        elif ingress_domain is None:
            raise ValueError(
                "ingress_domain is invalid. For Kubernetes Clusters please specify an ingress domain"
            )
        else:
            domain = ingress_domain
        del metadata["annotations"]
        spec["rules"][0]["host"] = f"ray-dashboard-{cluster_name}-{namespace}.{domain}"
def update_rayclient_ingress(
    ingress_item, cluster_name, namespace, ingress_domain
):  # pragma: no cover
    """Configure the Ray client ingress resource of the AppWrapper in place.

    Points the ingress at the cluster's head service, then chooses the
    ingress class, annotations and host domain depending on whether the
    cluster is OpenShift (domain read from the cluster ingress config) or
    plain Kubernetes (``ingress_domain`` is required).

    Raises:
        ValueError: when ``ingress_domain`` is None on a non-OpenShift cluster.
    """
    metadata = ingress_item.get("generictemplate", {}).get("metadata")
    spec = ingress_item.get("generictemplate", {}).get("spec")
    metadata["name"] = f"rayclient-{cluster_name}"
    metadata["namespace"] = namespace
    metadata["labels"]["odh-ray-cluster-service"] = f"{cluster_name}-head-svc"
    spec["rules"][0]["http"]["paths"][0]["backend"]["service"][
        "name"
    ] = f"{cluster_name}-head-svc"
    if is_openshift_cluster():
        try:
            config_check()
            api_client = client.CustomObjectsApi(api_config_handler())
            # Read the cluster-wide ingress config to obtain the domain.
            ingress = api_client.get_cluster_custom_object(
                "config.openshift.io", "v1", "ingresses", "cluster"
            )
            ingressClassName = "openshift-default"
            annotations = {
                "nginx.ingress.kubernetes.io/rewrite-target": "/",
                "nginx.ingress.kubernetes.io/ssl-redirect": "true",
                "route.openshift.io/termination": "passthrough",
            }
        except Exception as e:  # pragma: no cover
            return _kube_api_error_handling(e)
        domain = ingress["spec"]["domain"]
    elif ingress_domain is None:
        raise ValueError(
            "ingress_domain is invalid. For Kubernetes Clusters please specify an ingress domain"
        )
    else:
        # Plain Kubernetes: assume an nginx ingress controller with TLS
        # passthrough so the Ray client's gRPC TLS terminates at the pod.
        domain = ingress_domain
        ingressClassName = "nginx"
        annotations = {
            "nginx.ingress.kubernetes.io/rewrite-target": "/",
            "nginx.ingress.kubernetes.io/ssl-redirect": "true",
            "nginx.ingress.kubernetes.io/ssl-passthrough": "true",
        }
    metadata["annotations"] = annotations
    spec["ingressClassName"] = ingressClassName
    spec["rules"][0]["host"] = f"rayclient-{cluster_name}-{namespace}.{domain}"
def update_names(yaml, item, appwrapper_name, cluster_name, namespace):
    """Set the AppWrapper-level and RayCluster-level names/namespaces in place.

    The top-level metadata takes the AppWrapper name; the first generic
    item's metadata takes the cluster name plus an appwrapper back-reference
    label.
    """
    top_meta = yaml.get("metadata")
    top_meta["namespace"] = namespace
    top_meta["name"] = appwrapper_name
    raycluster_meta = item.get("generictemplate", {}).get("metadata")
    raycluster_meta["namespace"] = namespace
    raycluster_meta["name"] = cluster_name
    raycluster_meta["labels"]["workload.codeflare.dev/appwrapper"] = appwrapper_name
def update_labels(yaml, instascale, instance_types):
    """Set or remove the InstaScale ordered-instance label.

    With instascale enabled, requires at least one instance type (exits the
    process otherwise) and joins them with underscores into the
    "orderedinstance" label. With instascale disabled, the labels section is
    removed entirely.
    """
    metadata = yaml.get("metadata")
    if not instascale:
        metadata.pop("labels")
        return
    if len(instance_types) == 0:
        sys.exit(
            "If instascale is set to true, must provide at least one instance type"
        )
    metadata["labels"]["orderedinstance"] = "_".join(instance_types)
def update_priority(yaml, item, dispatch_priority, priority_val):
    """Apply or strip AppWrapper dispatch priority settings in place.

    With a dispatch priority, a numeric priority value is required: it is
    written to the AppWrapper spec and the priority class name is applied to
    both the head and the first worker group pod templates. Without one, the
    template's priority field is removed.

    Raises:
        ValueError: when dispatch_priority is set but priority_val is falsy.
    """
    spec = yaml.get("spec")
    if dispatch_priority is None:
        spec.pop("priority")
        return
    if not priority_val:
        raise ValueError(
            "AW generation error: Priority value is None, while dispatch_priority is defined"
        )
    spec["priority"] = priority_val
    raycluster_spec = item.get("generictemplate").get("spec")
    for group in (
        raycluster_spec.get("headGroupSpec"),
        raycluster_spec.get("workerGroupSpecs")[0],
    ):
        group["template"]["spec"]["priorityClassName"] = dispatch_priority
def update_custompodresources(
    item,
    min_cpu,
    max_cpu,
    min_memory,
    max_memory,
    gpu,
    workers,
    head_cpus,
    head_memory,
    head_gpus,
):
    """Fill in the AppWrapper's custompodresources entries in place.

    Entry 0 describes the head node and receives the head_* values; every
    subsequent entry is a worker entry and receives the min/max worker
    values (entry 1 additionally gets its replica count set to ``workers``).
    Memory values are rendered as "<n>G" strings. Exits the process when the
    template has no custompodresources section.
    """
    if "custompodresources" not in item.keys():
        sys.exit("Error: malformed template")
    custompodresources = item.get("custompodresources")
    for i, resource in enumerate(custompodresources):
        if i == 0:
            # First entry is the head node.
            resource["requests"]["cpu"] = head_cpus
            resource["limits"]["cpu"] = head_cpus
            resource["requests"]["memory"] = str(head_memory) + "G"
            resource["limits"]["memory"] = str(head_memory) + "G"
            resource["requests"]["nvidia.com/gpu"] = head_gpus
            resource["limits"]["nvidia.com/gpu"] = head_gpus
            continue
        # Worker entries.
        for k, v in resource.items():
            if k == "replicas" and i == 1:
                resource[k] = workers
            if k in ("requests", "limits"):
                for spec in v.keys():
                    if spec == "cpu":
                        resource[k][spec] = max_cpu if k == "limits" else min_cpu
                    if spec == "memory":
                        mem_val = max_memory if k == "limits" else min_memory
                        resource[k][spec] = str(mem_val) + "G"
                    if spec == "nvidia.com/gpu":
                        # i == 0 was handled above, so this is always a
                        # worker entry (the old `if i == 0` arm here was dead).
                        resource[k][spec] = gpu
def update_affinity(spec, appwrapper_name, instascale):
    """Bind the pod's node affinity to the AppWrapper name, or drop it.

    With instascale enabled, the first match expression of the required node
    affinity gets the AppWrapper name as both its key and first value; with
    instascale disabled, the affinity section is removed entirely.
    """
    if not instascale:
        spec.pop("affinity")
        return
    selector_terms = (
        spec.get("affinity")
        .get("nodeAffinity")
        .get("requiredDuringSchedulingIgnoredDuringExecution")
        .get("nodeSelectorTerms")
    )
    first_expression = selector_terms[0]["matchExpressions"][0]
    first_expression["values"][0] = appwrapper_name
    first_expression["key"] = appwrapper_name
def update_image(spec, image):
    """Point every container in the given pod spec at ``image``."""
    for pod_container in spec.get("containers"):
        pod_container["image"] = image
def update_image_pull_secrets(spec, image_pull_secrets):
    """Append the given pull-secret names to the pod spec's imagePullSecrets.

    Existing template secrets are preserved; each supplied name is wrapped in
    the Kubernetes {"name": ...} reference form.
    """
    existing = spec.get("imagePullSecrets", [])
    additions = [{"name": secret} for secret in image_pull_secrets]
    spec["imagePullSecrets"] = existing + additions
def update_env(spec, env):
    """Extend each container's env list with the given entries.

    No-op when env is falsy. Containers that already have an env list are
    extended; containers without one are assigned the given list directly
    (NOTE: that assignment aliases the caller's list, matching the original
    template-mutation behavior).
    """
    if not env:
        return
    for container in spec.get("containers"):
        if "env" in container:
            container["env"].extend(env)
        else:
            container["env"] = env
def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
    """Set CPU/memory/GPU requests and limits on every container in the spec.

    Requests receive the min values and limits the max values; memory is
    rendered as a "<n>G" string. A container whose resources lack a requests
    or limits section keeps that section absent.
    """
    for container in spec.get("containers"):
        container_resources = container.get("resources")
        for section, cpu_val, mem_val in (
            ("requests", min_cpu, min_memory),
            ("limits", max_cpu, max_memory),
        ):
            target = container_resources.get(section)
            if target is not None:
                target["cpu"] = cpu_val
                target["memory"] = str(mem_val) + "G"
                target["nvidia.com/gpu"] = gpu
def update_nodes(
    item,
    appwrapper_name,
    min_cpu,
    max_cpu,
    min_memory,
    max_memory,
    gpu,
    workers,
    image,
    instascale,
    env,
    image_pull_secrets,
    head_cpus,
    head_memory,
    head_gpus,
):
    """Apply node-level configuration to the RayCluster generictemplate in place.

    Sets GPU rayStartParams and replica counts, then — for both the head
    group and the (single) worker group — applies affinity, image pull
    secrets, container image, env vars and resource requests/limits via the
    sibling update_* helpers. No-op when the item has no generictemplate.
    """
    if "generictemplate" in item.keys():
        head = item.get("generictemplate").get("spec").get("headGroupSpec")
        head["rayStartParams"]["num-gpus"] = str(int(head_gpus))
        worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
        # Head counts as first worker
        worker["replicas"] = workers
        worker["minReplicas"] = workers
        worker["maxReplicas"] = workers
        worker["groupName"] = "small-group-" + appwrapper_name
        worker["rayStartParams"]["num-gpus"] = str(int(gpu))
        for comp in [head, worker]:
            spec = comp.get("template").get("spec")
            update_affinity(spec, appwrapper_name, instascale)
            update_image_pull_secrets(spec, image_pull_secrets)
            update_image(spec, image)
            update_env(spec, env)
            if comp == head:
                # Head uses its own sizing; min == max so requests equal limits.
                # TODO: Eventually add head node configuration outside of template
                update_resources(
                    spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus
                )
            else:
                update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
def update_ca_secret(ca_secret_item, cluster_name, namespace):
    """Name the cluster's CA secret and fill it with a freshly generated CA.

    The secret's ca.key/ca.crt data comes from
    generate_cert.generate_ca_cert with a 365-day validity.
    """
    # Imported lazily here (not at module top) to avoid a circular import —
    # TODO confirm that is the reason; the import is local in the original.
    from . import generate_cert

    metadata = ca_secret_item.get("generictemplate", {}).get("metadata")
    metadata["name"] = f"ca-secret-{cluster_name}"
    metadata["namespace"] = namespace
    metadata["labels"]["odh-ray-cluster-service"] = f"{cluster_name}-head-svc"
    data = ca_secret_item.get("generictemplate", {}).get("data")
    data["ca.key"], data["ca.crt"] = generate_cert.generate_ca_cert(365)
def enable_local_interactive(resources, cluster_name, namespace, ingress_domain):
    """Wire up TLS and ingress so a local Ray client can reach the cluster.

    Relies on the template's GenericItems layout: [0] the RayCluster,
    [2] the rayclient ingress, [3] the CA secret. Renames the CA secret,
    points head/worker cert volumes at it, flips the TLS env flag on the
    head/worker containers, and rewrites the cert-creation init container
    command with the real cluster name and ingress domain.

    Raises:
        ValueError: when ``ingress_domain`` is None on a non-OpenShift cluster.
    """
    rayclient_ingress_item = resources["resources"].get("GenericItems")[2]
    ca_secret_item = resources["resources"].get("GenericItems")[3]
    item = resources["resources"].get("GenericItems")[0]
    update_ca_secret(ca_secret_item, cluster_name, namespace)
    # update_ca_secret_volumes
    item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"]["volumes"][0][
        "secret"
    ]["secretName"] = f"ca-secret-{cluster_name}"
    item["generictemplate"]["spec"]["workerGroupSpecs"][0]["template"]["spec"][
        "volumes"
    ][0]["secret"]["secretName"] = f"ca-secret-{cluster_name}"
    # update_tls_env — env[1] is presumably the TLS enable flag in the
    # template; "1" turns it on. TODO confirm against the template.
    item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
        0
    ]["env"][1]["value"] = "1"
    item["generictemplate"]["spec"]["workerGroupSpecs"][0]["template"]["spec"][
        "containers"
    ][0]["env"][1]["value"] = "1"
    # update_init_container — command[2] holds the shell command containing
    # the "deployment-name" and "server-name" placeholders replaced below.
    command = item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][
        "initContainers"
    ][0].get("command")[2]
    command = command.replace("deployment-name", cluster_name)
    if is_openshift_cluster():
        # We can try get the domain through checking ingresses.config.openshift.io
        try:
            config_check()
            api_client = client.CustomObjectsApi(api_config_handler())
            ingress = api_client.get_cluster_custom_object(
                "config.openshift.io", "v1", "ingresses", "cluster"
            )
        except Exception as e:  # pragma: no cover
            return _kube_api_error_handling(e)
        domain = ingress["spec"]["domain"]
    elif ingress_domain is None:
        raise ValueError(
            "ingress_domain is invalid. For Kubernetes Clusters please specify an ingress domain"
        )
    else:
        domain = ingress_domain
    command = command.replace("server-name", domain)
    update_rayclient_ingress(rayclient_ingress_item, cluster_name, namespace, domain)
    # Write the rewritten command back into the init container.
    item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][
        "initContainers"
    ][0].get("command")[2] = command
def disable_raycluster_tls(resources):
    """Strip mTLS plumbing from the RayCluster template in place.

    Removes the cert volumes, volume mounts and init containers from the
    head pod spec, removes volumes/mounts and the "create-cert" init
    container from the worker pod spec, and drops the rayclient ingress and
    CA-secret GenericItems from the AppWrapper.
    """
    generic_template_spec = resources["GenericItems"][0]["generictemplate"]["spec"]
    head_spec = generic_template_spec["headGroupSpec"]["template"]["spec"]
    worker_spec = generic_template_spec["workerGroupSpecs"][0]["template"]["spec"]

    # Head: drop all TLS-related volumes, mounts and init containers.
    head_spec.pop("volumes", None)
    head_spec["containers"][0].pop("volumeMounts", None)
    head_spec.pop("initContainers", None)

    # Worker: drop volumes/mounts; keep init containers except "create-cert".
    worker_spec.pop("volumes", None)
    worker_spec["containers"][0].pop("volumeMounts", None)
    # Rebuild the list instead of deleting by index while iterating, which
    # skipped the next element (and could IndexError) when "create-cert"
    # was not the last entry.
    worker_spec["initContainers"] = [
        container
        for container in worker_spec["initContainers"]
        if container["name"] != "create-cert"
    ]

    # Drop the rayclient ingress and CA secret items from the AppWrapper.
    resources["GenericItems"] = [
        entry
        for entry in resources["GenericItems"]
        if "rayclient-deployment-name"
        not in entry["generictemplate"]["metadata"]["name"]
        and "ca-secret-deployment-name"
        not in entry["generictemplate"]["metadata"]["name"]
    ]
def write_user_appwrapper(user_yaml, output_file_name):
    """Serialize the AppWrapper to a block-style YAML file and report the path."""
    with open(output_file_name, "w") as destination:
        yaml.dump(user_yaml, destination, default_flow_style=False)
    print(f"Written to: {output_file_name}")
def enable_openshift_oauth(user_yaml, cluster_name, namespace):
    """Inject an OpenShift oauth-proxy sidecar into the Ray head pod.

    Adds a TLS secret volume and the oauth-proxy sidecar container to the
    head group pod spec, points the pod at the oauth service account, drops
    the second GenericItem, and annotates the AppWrapper so a Cluster object
    rebuilt from it knows oauth is in use.
    """
    config_check()
    k8_client = api_config_handler() or client.ApiClient()
    tls_mount_location = "/etc/tls/private"
    oauth_port = 8443
    oauth_sa_name = f"{cluster_name}-oauth-proxy"
    tls_secret_name = f"{cluster_name}-proxy-tls-secret"
    tls_volume_name = "proxy-tls-secret"
    port_name = "oauth-proxy"
    host = _get_api_host(k8_client)
    # NOTE(review): `host` is computed here but never used afterwards —
    # confirm whether this is dead code or something was meant to consume it.
    host = host.replace(
        "api.", f"{gen_dashboard_ingress_name(cluster_name)}-{namespace}.apps."
    )
    oauth_sidecar = _create_oauth_sidecar_object(
        namespace,
        tls_mount_location,
        oauth_port,
        oauth_sa_name,
        tls_volume_name,
        port_name,
    )
    tls_secret_volume = client.V1Volume(
        name=tls_volume_name,
        secret=client.V1SecretVolumeSource(secret_name=tls_secret_name),
    )
    # allows for setting value of Cluster object when initializing object from an existing AppWrapper on cluster
    user_yaml["metadata"]["annotations"] = user_yaml["metadata"].get("annotations", {})
    user_yaml["metadata"]["annotations"][
        "codeflare-sdk-use-oauth"
    ] = "true"  # if the user gets an
    ray_headgroup_pod = user_yaml["spec"]["resources"]["GenericItems"][0][
        "generictemplate"
    ]["spec"]["headGroupSpec"]["template"]["spec"]
    # Remove the second GenericItem — presumably superseded by the oauth
    # route; verify against the template layout.
    user_yaml["spec"]["resources"]["GenericItems"].pop(1)
    ray_headgroup_pod["serviceAccount"] = oauth_sa_name
    ray_headgroup_pod["volumes"] = ray_headgroup_pod.get("volumes", [])
    # we use a generic api client here so that the serialization function doesn't need to be mocked for unit tests
    ray_headgroup_pod["volumes"].append(
        client.ApiClient().sanitize_for_serialization(tls_secret_volume)
    )
    ray_headgroup_pod["containers"].append(
        client.ApiClient().sanitize_for_serialization(oauth_sidecar)
    )
def _create_oauth_sidecar_object(
    namespace: str,
    tls_mount_location: str,
    oauth_port: int,
    oauth_sa_name: str,
    tls_volume_name: str,
    port_name: str,
) -> client.V1Container:
    """Build the oauth-proxy sidecar container definition.

    The proxy serves HTTPS on ``oauth_port``, forwards to the Ray dashboard
    on localhost:8265, and delegates authorization to OpenShift (pod "get"
    in ``namespace``). The TLS cert/key are read from the mounted secret
    volume.
    """
    return client.V1Container(
        args=[
            f"--https-address=:{oauth_port}",
            "--provider=openshift",
            f"--openshift-service-account={oauth_sa_name}",
            "--upstream=http://localhost:8265",
            f"--tls-cert={tls_mount_location}/tls.crt",
            f"--tls-key={tls_mount_location}/tls.key",
            f"--cookie-secret={b64encode(urandom(64)).decode('utf-8')}",  # create random string for encrypting cookie
            f'--openshift-delegate-urls={{"/":{{"resource":"pods","namespace":"{namespace}","verb":"get"}}}}',
        ],
        image="registry.redhat.io/openshift4/ose-oauth-proxy@sha256:1ea6a01bf3e63cdcf125c6064cbd4a4a270deaf0f157b3eabb78f60556840366",
        name="oauth-proxy",
        ports=[client.V1ContainerPort(container_port=oauth_port, name=port_name)],
        resources=client.V1ResourceRequirements(limits=None, requests=None),
        volume_mounts=[
            client.V1VolumeMount(
                mount_path=tls_mount_location, name=tls_volume_name, read_only=True
            )
        ],
    )
def write_components(user_yaml: dict, output_file_name: str):
    """Write each generictemplate component as its own YAML document.

    Every GenericItem carrying a generictemplate is dumped to the output
    file, separated by "---" document markers; items without one are skipped.
    """
    components = user_yaml.get("spec", "resources")["resources"].get("GenericItems")
    with open(output_file_name, "w") as outfile:
        for component in components:
            if "generictemplate" in component:
                outfile.write("---\n")
                yaml.dump(
                    component["generictemplate"], outfile, default_flow_style=False
                )
    print(f"Written to: {output_file_name}")
def generate_appwrapper(
    name: str,
    namespace: str,
    head_cpus: int,
    head_memory: int,
    head_gpus: int,
    min_cpu: int,
    max_cpu: int,
    min_memory: int,
    max_memory: int,
    gpu: int,
    workers: int,
    template: str,
    image: str,
    instascale: bool,
    mcad: bool,
    instance_types: list,
    env,
    local_interactive: bool,
    image_pull_secrets: list,
    dispatch_priority: str,
    priority_val: int,
    openshift_oauth: bool,
    ingress_domain: str,
    ingress_options: dict,
) -> str:
    """Generate an AppWrapper YAML file from the template and cluster config.

    Loads the template, applies every update_* transformation (names,
    labels, priority, pod resources, nodes, dashboard ingress), then either
    enables local-interactive TLS or strips the TLS plumbing, optionally
    injects OpenShift oauth, and writes the result to
    "<appwrapper_name>.yaml" — as one AppWrapper document when mcad is True,
    otherwise as individual component documents.

    Returns:
        The name of the written YAML file.
    """
    user_yaml = read_template(template)
    appwrapper_name, cluster_name = gen_names(name)
    resources = user_yaml.get("spec", "resources")
    # GenericItems layout assumed from the template: [0] RayCluster,
    # [1] dashboard ingress (later items used only for local interactive).
    item = resources["resources"].get("GenericItems")[0]
    ingress_item = resources["resources"].get("GenericItems")[1]
    update_names(user_yaml, item, appwrapper_name, cluster_name, namespace)
    update_labels(user_yaml, instascale, instance_types)
    update_priority(user_yaml, item, dispatch_priority, priority_val)
    update_custompodresources(
        item,
        min_cpu,
        max_cpu,
        min_memory,
        max_memory,
        gpu,
        workers,
        head_cpus,
        head_memory,
        head_gpus,
    )
    update_nodes(
        item,
        appwrapper_name,
        min_cpu,
        max_cpu,
        min_memory,
        max_memory,
        gpu,
        workers,
        image,
        instascale,
        env,
        image_pull_secrets,
        head_cpus,
        head_memory,
        head_gpus,
    )
    update_dashboard_ingress(
        ingress_item, cluster_name, namespace, ingress_options, ingress_domain
    )
    if local_interactive:
        enable_local_interactive(resources, cluster_name, namespace, ingress_domain)
    else:
        disable_raycluster_tls(resources["resources"])
    if openshift_oauth:
        enable_openshift_oauth(user_yaml, cluster_name, namespace)
    outfile = appwrapper_name + ".yaml"
    if not mcad:
        write_components(user_yaml, outfile)
    else:
        write_user_appwrapper(user_yaml, outfile)
    return outfile
Functions
def disable_raycluster_tls(resources)-
Expand source code
def disable_raycluster_tls(resources): generic_template_spec = resources["GenericItems"][0]["generictemplate"]["spec"] if "volumes" in generic_template_spec["headGroupSpec"]["template"]["spec"]: del generic_template_spec["headGroupSpec"]["template"]["spec"]["volumes"] if ( "volumeMounts" in generic_template_spec["headGroupSpec"]["template"]["spec"]["containers"][0] ): del generic_template_spec["headGroupSpec"]["template"]["spec"]["containers"][0][ "volumeMounts" ] if "initContainers" in generic_template_spec["headGroupSpec"]["template"]["spec"]: del generic_template_spec["headGroupSpec"]["template"]["spec"]["initContainers"] if "volumes" in generic_template_spec["workerGroupSpecs"][0]["template"]["spec"]: del generic_template_spec["workerGroupSpecs"][0]["template"]["spec"]["volumes"] if ( "volumeMounts" in generic_template_spec["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0] ): del generic_template_spec["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["volumeMounts"] for i in range( len( generic_template_spec["workerGroupSpecs"][0]["template"]["spec"][ "initContainers" ] ) ): if ( generic_template_spec["workerGroupSpecs"][0]["template"]["spec"][ "initContainers" ][i]["name"] == "create-cert" ): del generic_template_spec["workerGroupSpecs"][0]["template"]["spec"][ "initContainers" ][i] updated_items = [] for i in resources["GenericItems"][:]: if "rayclient-deployment-name" in i["generictemplate"]["metadata"]["name"]: continue if "ca-secret-deployment-name" in i["generictemplate"]["metadata"]["name"]: continue updated_items.append(i) resources["GenericItems"] = updated_items def enable_local_interactive(resources, cluster_name, namespace, ingress_domain)-
Expand source code
def enable_local_interactive(resources, cluster_name, namespace, ingress_domain): rayclient_ingress_item = resources["resources"].get("GenericItems")[2] ca_secret_item = resources["resources"].get("GenericItems")[3] item = resources["resources"].get("GenericItems")[0] update_ca_secret(ca_secret_item, cluster_name, namespace) # update_ca_secret_volumes item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"]["volumes"][0][ "secret" ]["secretName"] = f"ca-secret-{cluster_name}" item["generictemplate"]["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "volumes" ][0]["secret"]["secretName"] = f"ca-secret-{cluster_name}" # update_tls_env item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ 0 ]["env"][1]["value"] = "1" item["generictemplate"]["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["env"][1]["value"] = "1" # update_init_container command = item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][ "initContainers" ][0].get("command")[2] command = command.replace("deployment-name", cluster_name) if is_openshift_cluster(): # We can try get the domain through checking ingresses.config.openshift.io try: config_check() api_client = client.CustomObjectsApi(api_config_handler()) ingress = api_client.get_cluster_custom_object( "config.openshift.io", "v1", "ingresses", "cluster" ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) domain = ingress["spec"]["domain"] elif ingress_domain is None: raise ValueError( "ingress_domain is invalid. For Kubernetes Clusters please specify an ingress domain" ) else: domain = ingress_domain command = command.replace("server-name", domain) update_rayclient_ingress(rayclient_ingress_item, cluster_name, namespace, domain) item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][ "initContainers" ][0].get("command")[2] = command def enable_openshift_oauth(user_yaml, cluster_name, namespace)-
Expand source code
def enable_openshift_oauth(user_yaml, cluster_name, namespace): config_check() k8_client = api_config_handler() or client.ApiClient() tls_mount_location = "/etc/tls/private" oauth_port = 8443 oauth_sa_name = f"{cluster_name}-oauth-proxy" tls_secret_name = f"{cluster_name}-proxy-tls-secret" tls_volume_name = "proxy-tls-secret" port_name = "oauth-proxy" host = _get_api_host(k8_client) host = host.replace( "api.", f"{gen_dashboard_ingress_name(cluster_name)}-{namespace}.apps." ) oauth_sidecar = _create_oauth_sidecar_object( namespace, tls_mount_location, oauth_port, oauth_sa_name, tls_volume_name, port_name, ) tls_secret_volume = client.V1Volume( name=tls_volume_name, secret=client.V1SecretVolumeSource(secret_name=tls_secret_name), ) # allows for setting value of Cluster object when initializing object from an existing AppWrapper on cluster user_yaml["metadata"]["annotations"] = user_yaml["metadata"].get("annotations", {}) user_yaml["metadata"]["annotations"][ "codeflare-sdk-use-oauth" ] = "true" # if the user gets an ray_headgroup_pod = user_yaml["spec"]["resources"]["GenericItems"][0][ "generictemplate" ]["spec"]["headGroupSpec"]["template"]["spec"] user_yaml["spec"]["resources"]["GenericItems"].pop(1) ray_headgroup_pod["serviceAccount"] = oauth_sa_name ray_headgroup_pod["volumes"] = ray_headgroup_pod.get("volumes", []) # we use a generic api client here so that the serialization function doesn't need to be mocked for unit tests ray_headgroup_pod["volumes"].append( client.ApiClient().sanitize_for_serialization(tls_secret_volume) ) ray_headgroup_pod["containers"].append( client.ApiClient().sanitize_for_serialization(oauth_sidecar) ) def gen_dashboard_ingress_name(cluster_name)-
Expand source code
def gen_dashboard_ingress_name(cluster_name): return f"ray-dashboard-{cluster_name}" def gen_names(name)-
Expand source code
def gen_names(name): if not name: gen_id = str(uuid.uuid4()) appwrapper_name = "appwrapper-" + gen_id cluster_name = "cluster-" + gen_id return appwrapper_name, cluster_name else: return name, name def generate_appwrapper(name: str, namespace: str, head_cpus: int, head_memory: int, head_gpus: int, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, instascale: bool, mcad: bool, instance_types: list, env, local_interactive: bool, image_pull_secrets: list, dispatch_priority: str, priority_val: int, openshift_oauth: bool, ingress_domain: str, ingress_options: dict)-
Expand source code
def generate_appwrapper( name: str, namespace: str, head_cpus: int, head_memory: int, head_gpus: int, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, instascale: bool, mcad: bool, instance_types: list, env, local_interactive: bool, image_pull_secrets: list, dispatch_priority: str, priority_val: int, openshift_oauth: bool, ingress_domain: str, ingress_options: dict, ): user_yaml = read_template(template) appwrapper_name, cluster_name = gen_names(name) resources = user_yaml.get("spec", "resources") item = resources["resources"].get("GenericItems")[0] ingress_item = resources["resources"].get("GenericItems")[1] update_names(user_yaml, item, appwrapper_name, cluster_name, namespace) update_labels(user_yaml, instascale, instance_types) update_priority(user_yaml, item, dispatch_priority, priority_val) update_custompodresources( item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus, ) update_nodes( item, appwrapper_name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, image, instascale, env, image_pull_secrets, head_cpus, head_memory, head_gpus, ) update_dashboard_ingress( ingress_item, cluster_name, namespace, ingress_options, ingress_domain ) if local_interactive: enable_local_interactive(resources, cluster_name, namespace, ingress_domain) else: disable_raycluster_tls(resources["resources"]) if openshift_oauth: enable_openshift_oauth(user_yaml, cluster_name, namespace) outfile = appwrapper_name + ".yaml" if not mcad: write_components(user_yaml, outfile) else: write_user_appwrapper(user_yaml, outfile) return outfile def is_openshift_cluster()-
Expand source code
def is_openshift_cluster(): try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) api_instance.get_cluster_custom_object( "config.openshift.io", "v1", "ingresses", "cluster" ) return True except client.ApiException as e: # pragma: no cover if e.status == 404 or e.status == 403: return False else: print(f"Error detecting cluster type defaulting to Kubernetes: {e}") return False def read_template(template)-
Expand source code
def read_template(template): with open(template, "r") as stream: try: return yaml.safe_load(stream) except yaml.YAMLError as exc: print(exc) def update_affinity(spec, appwrapper_name, instascale)-
Expand source code
def update_affinity(spec, appwrapper_name, instascale): if instascale: node_selector_terms = ( spec.get("affinity") .get("nodeAffinity") .get("requiredDuringSchedulingIgnoredDuringExecution") .get("nodeSelectorTerms") ) node_selector_terms[0]["matchExpressions"][0]["values"][0] = appwrapper_name node_selector_terms[0]["matchExpressions"][0]["key"] = appwrapper_name else: spec.pop("affinity") def update_ca_secret(ca_secret_item, cluster_name, namespace)-
Expand source code
def update_ca_secret(ca_secret_item, cluster_name, namespace): from . import generate_cert metadata = ca_secret_item.get("generictemplate", {}).get("metadata") metadata["name"] = f"ca-secret-{cluster_name}" metadata["namespace"] = namespace metadata["labels"]["odh-ray-cluster-service"] = f"{cluster_name}-head-svc" data = ca_secret_item.get("generictemplate", {}).get("data") data["ca.key"], data["ca.crt"] = generate_cert.generate_ca_cert(365) def update_custompodresources(item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus)-
Expand source code
def update_custompodresources( item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus, ): if "custompodresources" in item.keys(): custompodresources = item.get("custompodresources") for i in range(len(custompodresources)): resource = custompodresources[i] if i == 0: # Leave head node resources as template default resource["requests"]["cpu"] = head_cpus resource["limits"]["cpu"] = head_cpus resource["requests"]["memory"] = str(head_memory) + "G" resource["limits"]["memory"] = str(head_memory) + "G" resource["requests"]["nvidia.com/gpu"] = head_gpus resource["limits"]["nvidia.com/gpu"] = head_gpus else: for k, v in resource.items(): if k == "replicas" and i == 1: resource[k] = workers if k == "requests" or k == "limits": for spec, _ in v.items(): if spec == "cpu": if k == "limits": resource[k][spec] = max_cpu else: resource[k][spec] = min_cpu if spec == "memory": if k == "limits": resource[k][spec] = str(max_memory) + "G" else: resource[k][spec] = str(min_memory) + "G" if spec == "nvidia.com/gpu": if i == 0: resource[k][spec] = 0 else: resource[k][spec] = gpu else: sys.exit("Error: malformed template") def update_dashboard_ingress(ingress_item, cluster_name, namespace, ingress_options, ingress_domain)-
Expand source code
def update_dashboard_ingress(
    ingress_item, cluster_name, namespace, ingress_options, ingress_domain
):  # pragma: no cover
    """Configure the Ray dashboard Ingress generic template.

    If the user supplied ingress_options, the option whose port is 8265 (the
    Ray dashboard port) drives the metadata/spec fields; keys absent from the
    option cause the corresponding template default to be deleted. With no
    options, a conventional "ray-dashboard-<cluster>" ingress is generated,
    taking the domain from the OpenShift cluster ingress object when running
    on OpenShift, otherwise from the required ingress_domain argument.

    Raises ValueError on a malformed ingress option or a missing
    ingress_domain on non-OpenShift clusters.
    """
    metadata = ingress_item.get("generictemplate", {}).get("metadata")
    spec = ingress_item.get("generictemplate", {}).get("spec")
    if ingress_options != {}:
        for index, ingress_option in enumerate(ingress_options["ingresses"]):
            # Validate each option before use; port must exist and be an int.
            if "ingressName" not in ingress_option.keys():
                raise ValueError(
                    f"Error: 'ingressName' is missing or empty for ingress item at index {index}"
                )
            if "port" not in ingress_option.keys():
                raise ValueError(
                    f"Error: 'port' is missing or empty for ingress item at index {index}"
                )
            elif not isinstance(ingress_option["port"], int):
                raise ValueError(
                    f"Error: 'port' is not of type int for ingress item at index {index}"
                )
            # 8265 is the Ray dashboard port — only that option applies here.
            if ingress_option["port"] == 8265:
                metadata["name"] = ingress_option["ingressName"]
                metadata["namespace"] = namespace
                # For each optional field: absent in the option means delete
                # the template default rather than keep it.
                if "annotations" not in ingress_option.keys():
                    del metadata["annotations"]
                else:
                    metadata["annotations"] = ingress_option["annotations"]
                if "path" not in ingress_option.keys():
                    del spec["rules"][0]["http"]["paths"][0]["path"]
                else:
                    spec["rules"][0]["http"]["paths"][0]["path"] = ingress_option[
                        "path"
                    ]
                if "pathType" not in ingress_option.keys():
                    spec["rules"][0]["http"]["paths"][0][
                        "pathType"
                    ] = "ImplementationSpecific"
                if "host" not in ingress_option.keys():
                    del spec["rules"][0]["host"]
                else:
                    spec["rules"][0]["host"] = ingress_option["host"]
                if "ingressClassName" not in ingress_option.keys():
                    del spec["ingressClassName"]
                else:
                    spec["ingressClassName"] = ingress_option["ingressClassName"]
                # Backend always points at the cluster head service.
                spec["rules"][0]["http"]["paths"][0]["backend"]["service"][
                    "name"
                ] = f"{cluster_name}-head-svc"
    else:
        metadata["name"] = f"ray-dashboard-{cluster_name}"
        metadata["namespace"] = namespace
        spec["rules"][0]["http"]["paths"][0]["backend"]["service"][
            "name"
        ] = f"{cluster_name}-head-svc"
        if is_openshift_cluster():
            try:
                config_check()
                api_client = client.CustomObjectsApi(api_config_handler())
                # Read the cluster-wide ingress config to learn the app domain.
                ingress = api_client.get_cluster_custom_object(
                    "config.openshift.io", "v1", "ingresses", "cluster"
                )
                # OpenShift routes don't use the template's ingressClassName.
                del spec["ingressClassName"]
            except Exception as e:  # pragma: no cover
                return _kube_api_error_handling(e)
            domain = ingress["spec"]["domain"]
        elif ingress_domain is None:
            raise ValueError(
                "ingress_domain is invalid. For Kubernetes Clusters please specify an ingress domain"
            )
        else:
            domain = ingress_domain
        del metadata["annotations"]
        spec["rules"][0]["host"] = f"ray-dashboard-{cluster_name}-{namespace}.{domain}"
Expand source code
def update_env(spec, env):
    """Merge user-supplied environment variables into every container.

    New entries are appended after any env vars already present in the
    template; a falsy env leaves each container untouched.
    """
    for container in spec.get("containers"):
        if not env:
            continue
        if "env" in container:
            container["env"].extend(env)
        else:
            container["env"] = env
Expand source code
def update_image(spec, image):
    """Point every container in the pod spec at the given container image."""
    for entry in spec.get("containers"):
        entry["image"] = image
Expand source code
def update_image_pull_secrets(spec, image_pull_secrets):
    """Append the named image-pull secrets to the pod spec.

    Secrets already present in the template are preserved; each supplied
    name is wrapped in the {"name": ...} form Kubernetes expects.
    """
    existing = spec.get("imagePullSecrets", [])
    additions = [{"name": secret} for secret in image_pull_secrets]
    spec["imagePullSecrets"] = existing + additions
Expand source code
def update_labels(yaml, instascale, instance_types):
    """Label the AppWrapper with the ordered instance types, or drop labels.

    When instascale is enabled, instance_types are joined with "_" into the
    "orderedinstance" label; at least one type must be supplied or the
    process exits. When disabled, the labels mapping is removed entirely.

    :param yaml: top-level AppWrapper dict (mutated in place)
    :param instascale: whether InstaScale machine scaling is enabled
    :param instance_types: list of machine/instance type names
    """
    metadata = yaml.get("metadata")
    if instascale:
        # Truthiness check replaces `not len(x) > 0`; also covers None.
        if not instance_types:
            sys.exit(
                "If instascale is set to true, must provide at least one instance type"
            )
        # str.join replaces the old hand-rolled `+= type + "_"` loop (which
        # also shadowed the builtin `type`); result is identical.
        metadata["labels"]["orderedinstance"] = "_".join(instance_types)
    else:
        metadata.pop("labels")
Expand source code
def update_names(yaml, item, appwrapper_name, cluster_name, namespace):
    """Stamp names and namespace onto the AppWrapper and nested RayCluster.

    The top-level metadata gets the AppWrapper name; the generic template's
    metadata gets the cluster name plus an appwrapper back-reference label.
    """
    top_meta = yaml.get("metadata")
    top_meta["name"] = appwrapper_name
    top_meta["namespace"] = namespace

    cluster_meta = item.get("generictemplate", {}).get("metadata")
    cluster_meta["labels"]["workload.codeflare.dev/appwrapper"] = appwrapper_name
    cluster_meta["name"] = cluster_name
    cluster_meta["namespace"] = namespace
Expand source code
def update_nodes(
    item,
    appwrapper_name,
    min_cpu,
    max_cpu,
    min_memory,
    max_memory,
    gpu,
    workers,
    image,
    instascale,
    env,
    image_pull_secrets,
    head_cpus,
    head_memory,
    head_gpus,
):
    """Configure the RayCluster head and worker group specs in one pass.

    Sets replica counts, group name, and num-gpus ray-start params, then
    applies affinity, pull secrets, image, env, and resources to both pod
    templates (the head gets the head_* resource values).
    """
    if "generictemplate" not in item.keys():
        return
    ray_spec = item.get("generictemplate").get("spec")
    head = ray_spec.get("headGroupSpec")
    head["rayStartParams"]["num-gpus"] = str(int(head_gpus))

    worker = ray_spec.get("workerGroupSpecs")[0]
    # Head counts as first worker
    worker["replicas"] = workers
    worker["minReplicas"] = workers
    worker["maxReplicas"] = workers
    worker["groupName"] = "small-group-" + appwrapper_name
    worker["rayStartParams"]["num-gpus"] = str(int(gpu))

    for group in (head, worker):
        pod_spec = group.get("template").get("spec")
        update_affinity(pod_spec, appwrapper_name, instascale)
        update_image_pull_secrets(pod_spec, image_pull_secrets)
        update_image(pod_spec, image)
        update_env(pod_spec, env)
        if group is head:
            # TODO: Eventually add head node configuration outside of template
            update_resources(
                pod_spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus
            )
        else:
            update_resources(pod_spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
Expand source code
def update_priority(yaml, item, dispatch_priority, priority_val):
    """Apply the dispatch priority to the AppWrapper and both pod templates.

    With a dispatch_priority, the numeric priority_val (required) is written
    into the AppWrapper spec and the priority class name onto the head and
    worker pod specs; with no dispatch_priority the "priority" field is
    removed from the spec.

    Raises ValueError when dispatch_priority is set but priority_val is falsy.
    """
    aw_spec = yaml.get("spec")
    if dispatch_priority is None:
        aw_spec.pop("priority")
        return
    if not priority_val:
        raise ValueError(
            "AW generation error: Priority value is None, while dispatch_priority is defined"
        )
    aw_spec["priority"] = priority_val
    ray_spec = item.get("generictemplate").get("spec")
    ray_spec.get("headGroupSpec")["template"]["spec"][
        "priorityClassName"
    ] = dispatch_priority
    ray_spec.get("workerGroupSpecs")[0]["template"]["spec"][
        "priorityClassName"
    ] = dispatch_priority
Expand source code
def update_rayclient_ingress(
    ingress_item, cluster_name, namespace, ingress_domain
):  # pragma: no cover
    """Configure the Ray client Ingress generic template.

    Names the ingress "rayclient-<cluster>", points its backend at the head
    service, then selects the annotations, ingress class, and host domain:
    on OpenShift the domain comes from the cluster ingress object with
    passthrough-termination annotations; on plain Kubernetes the caller must
    supply ingress_domain and nginx ssl-passthrough annotations are used.

    Raises ValueError when ingress_domain is None on a non-OpenShift cluster.
    """
    metadata = ingress_item.get("generictemplate", {}).get("metadata")
    spec = ingress_item.get("generictemplate", {}).get("spec")
    metadata["name"] = f"rayclient-{cluster_name}"
    metadata["namespace"] = namespace
    metadata["labels"]["odh-ray-cluster-service"] = f"{cluster_name}-head-svc"
    spec["rules"][0]["http"]["paths"][0]["backend"]["service"][
        "name"
    ] = f"{cluster_name}-head-svc"
    if is_openshift_cluster():
        try:
            config_check()
            api_client = client.CustomObjectsApi(api_config_handler())
            # Read the cluster-wide ingress config to learn the apps domain.
            ingress = api_client.get_cluster_custom_object(
                "config.openshift.io", "v1", "ingresses", "cluster"
            )
            ingressClassName = "openshift-default"
            annotations = {
                "nginx.ingress.kubernetes.io/rewrite-target": "/",
                "nginx.ingress.kubernetes.io/ssl-redirect": "true",
                "route.openshift.io/termination": "passthrough",
            }
        except Exception as e:  # pragma: no cover
            return _kube_api_error_handling(e)
        domain = ingress["spec"]["domain"]
    elif ingress_domain is None:
        raise ValueError(
            "ingress_domain is invalid. For Kubernetes Clusters please specify an ingress domain"
        )
    else:
        domain = ingress_domain
        ingressClassName = "nginx"
        annotations = {
            "nginx.ingress.kubernetes.io/rewrite-target": "/",
            "nginx.ingress.kubernetes.io/ssl-redirect": "true",
            "nginx.ingress.kubernetes.io/ssl-passthrough": "true",
        }
    # annotations/ingressClassName are bound in whichever branch ran above.
    metadata["annotations"] = annotations
    spec["ingressClassName"] = ingressClassName
    spec["rules"][0]["host"] = f"rayclient-{cluster_name}-{namespace}.{domain}"
Expand source code
def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
    """Set cpu/memory/GPU requests and limits on every container.

    Memory values are rendered as "<n>G". A requests or limits section that
    the template omits (None) is left untouched.
    """
    for container in spec.get("containers"):
        resources = container.get("resources")
        for section, cpu_val, mem_val in (
            ("requests", min_cpu, min_memory),
            ("limits", max_cpu, max_memory),
        ):
            values = resources.get(section)
            if values is None:
                continue
            values["cpu"] = cpu_val
            values["memory"] = str(mem_val) + "G"
            values["nvidia.com/gpu"] = gpu
Expand source code
def write_components(user_yaml: dict, output_file_name: str):
    """Write each GenericItem's generictemplate as a separate YAML document.

    Extracts spec.resources.GenericItems from the AppWrapper dict and dumps
    each item's embedded "generictemplate" to output_file_name as a
    "---"-prefixed YAML document, truncating any previous file contents.

    :param user_yaml: AppWrapper dict containing spec.resources.GenericItems
    :param output_file_name: path of the multi-document YAML file to write
    """
    # Bug fix: the old lookup was user_yaml.get("spec", "resources")["resources"],
    # whose string default would raise TypeError ("resources"["resources"]) when
    # "spec" was absent. Use empty-dict defaults, and tolerate a missing or
    # empty GenericItems list.
    components = (
        user_yaml.get("spec", {}).get("resources", {}).get("GenericItems") or []
    )
    # A single "w" open truncates and writes in one step — the old code opened
    # the file twice (once to truncate, once in append mode).
    with open(output_file_name, "w") as outfile:
        for component in components:
            if "generictemplate" in component:
                outfile.write("---\n")
                yaml.dump(
                    component["generictemplate"], outfile, default_flow_style=False
                )
    print(f"Written to: {output_file_name}")
Expand source code
def write_user_appwrapper(user_yaml, output_file_name):
    """Serialize the AppWrapper dict to output_file_name as block-style YAML."""
    with open(output_file_name, "w") as outfile:
        yaml.dump(user_yaml, outfile, default_flow_style=False)
    print(f"Written to: {output_file_name}")