65 changes: 62 additions & 3 deletions docs/ramalama-serve.1.md
@@ -69,8 +69,9 @@ Generate specified configuration format for running the AI Model as a service

| Key | Description |
| ------------ | -------------------------------------------------------------------------|
| kserve       | KServe YAML definition for running the AI Model as a KServe service in Kubernetes |
| kube         | Kubernetes YAML definition for running the AI Model as a service |
| quadlet      | Podman-supported container definition for running the AI Model under systemd |
| quadlet/kube | Kubernetes YAML definition for running the AI Model as a service, plus a Podman-supported container definition for running the Kube YAML specified pod under systemd |

#### **--help**, **-h**
@@ -112,7 +113,7 @@ On Nvidia based GPU systems, RamaLama defaults to using the
`nvidia-container-runtime`. Use this option to override this selection.

#### **--port**, **-p**
port for AI Model server to listen on. It must be available. If not specified,
the serving port will be 8080 if available, otherwise a free port in the 8081-8090 range.

#### **--privileged**
@@ -159,7 +160,7 @@ llama.cpp explains this as:

The higher the number, the more creative the response, but it is also more likely to hallucinate when set too high.

Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories.

#### **--threads**, **-t**
Maximum number of CPU threads to use.
@@ -187,6 +188,64 @@ CONTAINER ID IMAGE COMMAND CREATED
3f64927f11a5 quay.io/ramalama/ramalama:latest /usr/bin/ramalama... 17 seconds ago Up 17 seconds 0.0.0.0:8082->8082/tcp ramalama_YMPQvJxN97
```

### Generate a kserve service from the OCI Model car quay.io/ramalama/granite:1.0
```
$ ramalama serve --pull=never --threads 10 --port 8081 --generate kserve oci://quay.io/rhatdan/granite

Review comment: Is RamaLama only designed for single-node serving?

Author: RamaLama is just a tool to launch AI models in containers. The idea would be to generate Kubernetes content so models can run across multiple nodes, but that is not something RamaLama would do from the command line.

Reviewer: I think we can consider this for another iteration: given that RamaLama is mainly designed for local use, I don't expect a user will run a model big enough to require a multi-node GPU setup.

Generating kserve runtime file: granite-cuda-kserve-runtime.yaml
Generating kserve file: granite-cuda-kserve.yaml

$ cat granite-cuda-kserve-runtime.yaml
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  name: llama.cpp-cuda-runtime

Review comment: Should this be granite instead of llama?

Author: That is the runtime. But maybe granite makes more sense.

  annotations:
    opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
  labels:
    opendatahub.io/dashboard: 'true'
spec:
  annotations:
    prometheus.io/port: '8081'
    prometheus.io/path: '/metrics'
  multiModel: false
  supportedModelFormats:
    - autoSelect: true
      name: vLLM
  containers:
    - name: kserve-container
      image: quay.io/ramalama/cuda:0.8

Review comment: Is this the correct image? I see that CPUs are used in the generated InferenceService.

Author: What should the InferenceService look like if it were using nvidia/cuda?

Reviewer: The official community image for vLLM is on DockerHub, and the CUDA image is vllm/vllm-openai:${version}, for example vllm/vllm-openai:v0.8.5. I think we should use those as a starting point (they are CUDA only). I suggest considering this as an additional parameter, for example --vllm-version.

Note 1: We have a quay.io build that looks like quay.io/vllm/vllm:0.8.4.20250429, but it is still in progress.

Note 2: RHOAI product builds use a different org, and the images look like quay.io/modh/vllm:rhoai-2.19-cuda / quay.io/modh/vllm:rhoai-2.19-rocm, but I don't think we want to use product images here.

command: ["python", "-m", "vllm.entrypoints.openai.api_server"]
args: ["--port=8081", "--model=/mnt/models", "--served-model-name=granite"]
env:
- name: HF_HOME
value: /tmp/hf_home
ports:
- containerPort: 8081
protocol: TCP

$ cat granite-cuda-kserve.yaml
# RamaLama granite AI Model Service
# kubectl create -f to import this kserve file into Kubernetes.
#
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huggingface-granite
spec:
  predictor:
    model:
      modelFormat:
        name: vLLM
      storageUri: "oci://quay.io/rhatdan/granite"
      resources:
        limits:
          cpu: "10"
          memory: 24Gi
        requests:
          cpu: "10"
          memory: 24Gi

Review comment on lines +241 to +246: We need at least the required GPU here too.

Suggested change:

        limits:
          cpu: "10"
          memory: 24Gi
          nvidia.com/gpu: '1'
        requests:
          cpu: "10"
          memory: 24Gi
          nvidia.com/gpu: '1'

```

### Generate a quadlet service from the HuggingFace granite Model
```
$ ramalama serve --name MyGraniteServer --generate=quadlet granite
7 changes: 6 additions & 1 deletion ramalama/cli.py
@@ -861,7 +861,12 @@ def serve_parser(subparsers):
    )
    parser.add_argument(
        "--generate",
        choices=["quadlet", "kube", "quadlet/kube"],
        choices=[
            "kserve",
            "kube",
            "quadlet",
            "quadlet/kube",
        ],
        help="generate specified configuration format for running the AI Model as a service",
    )
    parser.add_argument(
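
As a quick orientation for the new `--generate` values, here is a small, self-contained sketch of how argparse enforces the choices list; the parser construction below is illustrative, not RamaLama's actual CLI wiring, but the rejection behavior is what the system test later asserts against.

```
# Standalone sketch (assumed stock argparse behavior, not RamaLama's real parser).
import argparse

parser = argparse.ArgumentParser(prog="ramalama serve")
parser.add_argument(
    "--generate",
    choices=["kserve", "kube", "quadlet", "quadlet/kube"],
    help="generate specified configuration format for running the AI Model as a service",
)

print(parser.parse_args(["--generate", "kserve"]).generate)  # prints: kserve

# An invalid value exits with an error along the lines of:
#   error: argument --generate: invalid choice: 'bogus'
#   (choose from 'kserve', 'kube', 'quadlet', 'quadlet/kube')
```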
2 changes: 1 addition & 1 deletion ramalama/common.py
@@ -466,7 +466,7 @@ def get_accel():
    if gpu_type := check_intel():
        return gpu_type

    return "none"
    return "cpu"


def set_accel_env_vars():
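
The switch from `"none"` to `"cpu"` is visible in the new kserve path: `get_accel()` is embedded in the generated file names, so a host with no detected GPU now produces a `-cpu-` suffix instead of `-none-`. A rough sketch of that interaction, using a helper that merely mirrors the naming logic in `ramalama/kserve.py` (it is not part of the PR):

```
# Illustrative helper only: mirrors how Kserve.generate() builds its output file names.
def kserve_runtime_filename(name: str, accel: str) -> str:
    # Colons in model names (e.g. "smollm:135m") are not valid in file names.
    return f"{name}-{accel}-kserve-runtime.yaml".replace(":", "-")

print(kserve_runtime_filename("granite", "cuda"))     # granite-cuda-kserve-runtime.yaml
print(kserve_runtime_filename("smollm:135m", "cpu"))  # smollm-135m-cpu-kserve-runtime.yaml
```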
125 changes: 125 additions & 0 deletions ramalama/kserve.py
@@ -0,0 +1,125 @@
import os

from ramalama.common import get_accel_env_vars, get_accel


def create_yaml(template_str, params):
    return template_str.format(**params)


KSERVE_RUNTIME_TMPL = """
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  name: {runtime}-runtime
  annotations:
    opendatahub.io/recommended-accelerators: '["{gpu}"]'
  labels:
    opendatahub.io/dashboard: 'true'
spec:
  annotations:
    prometheus.io/port: '{port}'
    prometheus.io/path: '/metrics'
  multiModel: false
  supportedModelFormats:
    - autoSelect: true
      name: vLLM
  containers:
    - name: kserve-container
      image: {image}
      command: ["python", "-m", "vllm.entrypoints.openai.api_server"]
      args: ["--port={port}", "--model=/mnt/models", "--served-model-name={name}"]
      env:
        - name: HF_HOME
          value: /tmp/hf_home
      ports:
        - containerPort: {port}
          protocol: TCP
"""

KSERVE_MODEL_SERVICE = """\
# RamaLama {name} AI Model Service
# kubectl create -f to import this kserve file into Kubernetes.
#
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huggingface-{name}
spec:
  predictor:
    model:
      modelFormat:
        name: vLLM
      storageUri: "oci://{model}"
      resources:
        limits:
          cpu: "{threads}"
          memory: 24Gi{gpu}
Review comment (Contributor): issue (bug_risk): Potential undefined variable 'gpu'. If neither CUDA_VISIBLE_DEVICES nor HIP_VISIBLE_DEVICES is set, the variable 'gpu' will not be defined before it is used in the f-string. Initializing 'gpu' to an empty string by default would prevent a potential NameError.

        requests:
          cpu: "{threads}"
          memory: 24Gi{gpu}
"""


class Kserve:
    def __init__(self, model, chat_template_path, image, args, exec_args):
        self.ai_image = model
        if hasattr(args, "MODEL"):
            self.ai_image = args.MODEL
        self.ai_image = self.ai_image.removeprefix("oci://")
        if args.name:
            self.name = args.name
        else:
            self.name = os.path.basename(self.ai_image)

        self.model = model.removeprefix("oci://")
        self.args = args
        self.exec_args = exec_args
        self.image = image
        self.runtime = args.runtime

    def generate(self):
        env_var_string = ""
        for k, v in get_accel_env_vars().items():
            env_var_string += f"Environment={k}={v}\n"

        # Use truthy checks so an unset CUDA_VISIBLE_DEVICES/HIP_VISIBLE_DEVICES
        # (None) does not count as a GPU being present.
        _gpu = ""
        if os.getenv("CUDA_VISIBLE_DEVICES"):
            _gpu = 'nvidia.com/gpu'
        elif os.getenv("HIP_VISIBLE_DEVICES"):
            _gpu = 'amd.com/gpu'

        outfile = f"{self.name}-{get_accel()}-kserve-runtime.yaml"
        outfile = outfile.replace(":", "-")
        print(f"Generating kserve runtime file: {outfile}")

        yaml_content = create_yaml(
            KSERVE_RUNTIME_TMPL,
            {
                'runtime': self.runtime + "-" + get_accel(),
                'model': self.model,
                'gpu': _gpu,
                'port': self.args.port,
                'image': self.image,
                'name': self.name,
            },
        )
        with open(outfile, 'w') as c:
            c.write(yaml_content)

        outfile = f"{self.name}-{get_accel()}-kserve.yaml"
        outfile = outfile.replace(":", "-")
        print(f"Generating kserve file: {outfile}")
        yaml_content = create_yaml(
            KSERVE_MODEL_SERVICE,
            {
                'name': self.name,
                'model': self.model,
                'threads': self.args.threads,
                # Append a GPU resource line only when an accelerator was detected.
                'gpu': f"\n          {_gpu}: '1'" if _gpu else "",
            },
        )
        with open(outfile, 'w') as c:
            c.write(yaml_content)
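
For completeness, a minimal sketch of driving the new generator directly rather than through `ramalama serve`; the `Namespace` fields mirror what `Kserve.__init__()` and `generate()` read above, while the model reference and image tag are illustrative placeholders, not RamaLama defaults.

```
# Illustrative sketch only: exercise Kserve outside the CLI.
from argparse import Namespace

from ramalama.kserve import Kserve

args = Namespace(
    MODEL="oci://quay.io/rhatdan/granite",  # placeholder model reference
    name=None,        # falls back to the basename of the model reference
    port=8081,
    threads=10,
    runtime="llama.cpp",
)

kserve = Kserve(
    model="oci://quay.io/rhatdan/granite",
    chat_template_path=None,  # accepted but not used by the current generator
    image="quay.io/ramalama/cuda:0.8",  # placeholder image
    args=args,
    exec_args=None,
)

# Writes granite-<accel>-kserve-runtime.yaml and granite-<accel>-kserve.yaml
# into the current directory.
kserve.generate()
```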
9 changes: 8 additions & 1 deletion ramalama/model.py
@@ -23,6 +23,7 @@
from ramalama.console import EMOJI
from ramalama.engine import Engine, dry_run
from ramalama.gguf_parser import GGUFInfoParser
from ramalama.kserve import Kserve
from ramalama.kube import Kube
from ramalama.model_inspect import GGUFModelInfo, ModelInfoBase
from ramalama.model_store import ModelStore
@@ -558,7 +559,9 @@ def handle_runtime(self, args, exec_args, exec_model_path):

    def generate_container_config(self, model_path, chat_template_path, args, exec_args):
        self.image = accel_image(CONFIG, args)
        if args.generate == "quadlet":
        if args.generate == "kserve":
            self.kserve(model_path, chat_template_path, args, exec_args)
        elif args.generate == "quadlet":
            self.quadlet(model_path, chat_template_path, args, exec_args)
        elif args.generate == "kube":
            self.kube(model_path, chat_template_path, args, exec_args)
@@ -618,6 +621,10 @@ def serve(self, args, quiet=False):

        self.execute_command(model_path, exec_args, args)

    def kserve(self, model, chat_template_path, args, exec_args):
        kserve = Kserve(model, chat_template_path, self.image, args, exec_args)
        kserve.generate()

    def quadlet(self, model, chat_template, args, exec_args):
        quadlet = Quadlet(model, chat_template, self.image, args, exec_args)
        quadlet.generate()
12 changes: 11 additions & 1 deletion test/system/040-serve.bats
@@ -197,7 +197,17 @@ verify_begin=".*run --rm"

    rm tinyllama.container
    run_ramalama 2 serve --name=${name} --port 1234 --generate=bogus tiny
    is "$output" ".*error: argument --generate: invalid choice: 'bogus' (choose from.*quadlet.*kube.*quadlet/kube.*)" "Should fail"
    is "$output" ".*error: argument --generate: invalid choice: 'bogus' (choose from.*kserve.*kube.*quadlet.*quadlet/kube.*)" "Should fail"
}

@test "ramalama serve --generate=kserve" {
    model=smollm:135m
    fixed_model=$(echo $model | tr ':' '-')
    name=c_$(safename)
    run_ramalama pull ${model}
    run_ramalama -q serve --port 1234 --generate=kserve ${model}
    is "$output" "Generating kserve runtime file: ${fixed_model}-kserve-runtime.yaml.*" "generate kserve runtime file"
    is "$output" ".*Generating kserve file: ${fixed_model}-kserve.yaml" "generate kserve file"
}

@test "ramalama serve --generate=quadlet and --generate=kube with OCI" {