Commit 40cd706

Add support for kserve
Signed-off-by: Daniel J Walsh <[email protected]>
1 parent 00839ee commit 40cd706

File tree: 4 files changed, +193 -13 lines changed

docs/ramalama-serve.1.md

Lines changed: 70 additions & 2 deletions
@@ -60,8 +60,9 @@ Generate specified configuration format for running the AI Model as a service
 
 | Key | Description |
 | ------------ | -------------------------------------------------------------------------|
-| quadlet | Podman supported container definition for running AI Model under systemd |
+| kserve | Kserve YAML definition for running the AI Model as a kserve service in Kubernetes |
 | kube | Kubernetes YAML definition for running the AI Model as a service |
+| quadlet | Podman supported container definition for running AI Model under systemd |
 | quadlet/kube | Kubernetes YAML definition for running the AI Model as a service and Podman supported container definition for running the Kube YAML specified pod under systemd|
 
 #### **--help**, **-h**
@@ -119,7 +120,7 @@ llama.cpp explains this as:
 
 The higher the number is the more creative the response is, but more likely to hallucinate when set too high.
 
-Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
+Usage: Lower numbers are good for virtual assistants where we need deterministic responses. Higher numbers are good for roleplay or creative tasks like editing stories
 
 #### **--tls-verify**=*true*
 require HTTPS and verify certificates when contacting OCI registries
@@ -140,6 +141,73 @@ CONTAINER ID IMAGE COMMAND CREATED
 3f64927f11a5 quay.io/ramalama/ramalama:latest /usr/bin/ramalama... 17 seconds ago Up 17 seconds 0.0.0.0:8082->8082/tcp ramalama_YMPQvJxN97
 ```
 
+### Generate kserve service off of OCI Model car quay.io/ramalama/granite:1.0
+```
+./bin/ramalama serve --port 8081 --generate kserve oci://quay.io/ramalama/granite:1.0
+Generating kserve runtime file: granite-1.0-kserve-runtime.yaml
+Generating kserve file: granite-1.0-kserve.yaml
+
+$ cat granite-1.0-kserve-runtime.yaml
+apiVersion: serving.kserve.io/v1alpha1
+kind: ServingRuntime
+metadata:
+  name: llama.cpp-runtime
+  annotations:
+    openshift.io/display-name: KServe ServingRuntime for quay.io/ramalama/granite:1.0
+    opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]'
+  labels:
+    opendatahub.io/dashboard: 'true'
+spec:
+  annotations:
+    prometheus.io/port: '8081'
+    prometheus.io/path: '/metrics'
+  multiModel: false
+  supportedModelFormats:
+    - autoSelect: true
+      name: vLLM
+  containers:
+    - name: kserve-container
+      image: quay.io/ramalama/ramalama:latest
+      command:
+        - python
+        - -m
+        - vllm.entrypoints.openai.api_server
+      args:
+        - "--port=8081"
+        - "--model=/mnt/models"
+        - "--served-model-name={.Name}"
+      env:
+        - name: HF_HOME
+          value: /tmp/hf_home
+      ports:
+        - containerPort: 8081
+          protocol: TCP
+
+$ cat granite-1.0-kserve.yaml
+# RamaLama quay.io/ramalama/granite:1.0 AI Model Service
+# kubectl create -f to import this kserve file into Kubernetes.
+#
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: huggingface-quay.io/ramalama/granite:1.0
+spec:
+  predictor:
+    model:
+      modelFormat:
+        name: vLLM
+      storageUri: "oci://quay.io/ramalama/granite:1.0"
+      resources:
+        limits:
+          cpu: "6"
+          memory: 24Gi
+          nvidia.com/gpu: "1"
+        requests:
+          cpu: "6"
+          memory: 24Gi
+          nvidia.com/gpu: "1"
+```
+
 ### Generate quadlet service off of HuggingFace granite Model
 ```
 $ ramalama serve --name MyGraniteServer --generate=quadlet granite
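
The kserve example above writes two files, granite-1.0-kserve-runtime.yaml and granite-1.0-kserve.yaml, which the generated header comment says to import with `kubectl create -f` on a cluster that has KServe installed. A quick way to sanity-check the generated documents before importing them is sketched below; PyYAML is assumed to be available and is not something this commit requires.

```
import yaml  # PyYAML, assumed to be installed; not a ramalama dependency

# Parse the two manifests produced by the `--generate kserve` example above
# and print the API group/version and kind each one declares.
for path in ("granite-1.0-kserve-runtime.yaml", "granite-1.0-kserve.yaml"):
    with open(path) as f:
        doc = yaml.safe_load(f)
    print(f"{path}: {doc['apiVersion']} {doc['kind']}")
# Expected: serving.kserve.io/v1alpha1 ServingRuntime and
# serving.kserve.io/v1beta1 InferenceService.
```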

ramalama/cli.py

Lines changed: 1 addition & 1 deletion
@@ -859,7 +859,7 @@ def serve_parser(subparsers):
     parser.add_argument("--host", default=config.get('host', "0.0.0.0"), help="IP address to listen")
     parser.add_argument(
         "--generate",
-        choices=["quadlet", "kube", "quadlet/kube"],
+        choices=["kserve", "kube", "quadlet", "quadlet/kube"],
         help="generate specified configuration format for running the AI Model as a service",
     )
     parser.add_argument(
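
argparse enforces the choices list at parse time, so an unsupported --generate value is rejected before any model handling runs. A minimal standalone sketch of that behavior, using a throwaway parser rather than ramalama's actual serve_parser():

```
import argparse

# Throwaway parser that mirrors only the --generate option added above.
parser = argparse.ArgumentParser()
parser.add_argument("--generate", choices=["kserve", "kube", "quadlet", "quadlet/kube"])

print(parser.parse_args(["--generate", "kserve"]).generate)  # -> kserve
# parser.parse_args(["--generate", "bogus"]) exits with an
# "invalid choice: 'bogus'" error instead of reaching the serve code.
```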

ramalama/kserve.py

Lines changed: 105 additions & 0 deletions
import os

from ramalama.common import get_env_vars


class Kserve:
    def __init__(self, model, image, args, exec_args):
        self.ai_image = model
        if hasattr(args, "MODEL"):
            self.ai_image = args.MODEL
        self.ai_image = self.ai_image.removeprefix("oci://")
        if args.name:
            self.name = args.name
        else:
            self.name = os.path.basename(self.ai_image)

        self.model = model.removeprefix("oci://")
        self.args = args
        self.exec_args = exec_args
        self.image = image
        self.runtime = args.runtime

    def generate(self):
        # Collected for parity with the other generators; not yet emitted in the
        # kserve templates below.
        env_var_string = ""
        for k, v in get_env_vars().items():
            env_var_string += f"Environment={k}={v}\n"

        # Pick a GPU resource from the visible-devices environment variables;
        # request no GPU when neither variable is set.
        gpu = ""
        _gpu = ""
        if os.getenv("CUDA_VISIBLE_DEVICES", "") != "":
            _gpu = 'nvidia.com/gpu'
        elif os.getenv("HIP_VISIBLE_DEVICES", "") != "":
            _gpu = 'amd.com/gpu'
        if _gpu != "":
            gpu = f'\n          {_gpu}: "1"'

        outfile = self.name + "-kserve-runtime.yaml"
        outfile = outfile.replace(":", "-")
        print(f"Generating kserve runtime file: {outfile}")
        with open(outfile, 'w') as c:
            c.write(
                f"""\
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  name: {self.runtime}-runtime
  annotations:
    openshift.io/display-name: KServe ServingRuntime for {self.model}
    opendatahub.io/recommended-accelerators: '["{_gpu}"]'
  labels:
    opendatahub.io/dashboard: 'true'
spec:
  annotations:
    prometheus.io/port: '{self.args.port}'
    prometheus.io/path: '/metrics'
  multiModel: false
  supportedModelFormats:
    - autoSelect: true
      name: vLLM
  containers:
    - name: kserve-container
      image: {self.image}
      command:
        - python
        - -m
        - vllm.entrypoints.openai.api_server
      args:
        - "--port={self.args.port}"
        - "--model=/mnt/models"
        - "--served-model-name={{.Name}}"
      env:
        - name: HF_HOME
          value: /tmp/hf_home
      ports:
        - containerPort: {self.args.port}
          protocol: TCP
""")

        outfile = self.name + "-kserve.yaml"
        outfile = outfile.replace(":", "-")
        print(f"Generating kserve file: {outfile}")
        with open(outfile, 'w') as c:
            c.write(
                f"""\
# RamaLama {self.model} AI Model Service
# kubectl create -f to import this kserve file into Kubernetes.
#
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huggingface-{self.model}
spec:
  predictor:
    model:
      modelFormat:
        name: vLLM
      storageUri: "oci://{self.model}"
      resources:
        limits:
          cpu: "6"
          memory: 24Gi{gpu}
        requests:
          cpu: "6"
          memory: 24Gi{gpu}
"""
            )
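
Within the commit, Kserve is only instantiated from Model.generate_container_config() (see ramalama/model.py below). A standalone sketch of the same call, using an argparse.Namespace that carries just the attributes Kserve reads (MODEL, name, runtime, port); the model reference and serving image are illustrative values taken from the documentation example above:

```
from argparse import Namespace

from ramalama.kserve import Kserve

# Only the attributes Kserve actually reads; values are illustrative.
args = Namespace(
    MODEL="oci://quay.io/ramalama/granite:1.0",
    name=None,  # falls back to the basename of the image, "granite:1.0"
    runtime="llama.cpp",
    port=8081,
)

kserve = Kserve(
    "oci://quay.io/ramalama/granite:1.0",  # model reference
    "quay.io/ramalama/ramalama:latest",    # serving image
    args,
    exec_args=None,
)
kserve.generate()  # writes granite-1.0-kserve-runtime.yaml and granite-1.0-kserve.yaml
```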

ramalama/model.py

Lines changed: 17 additions & 10 deletions
@@ -17,6 +17,7 @@
 from ramalama.kube import Kube
 from ramalama.model_inspect import GGUFModelInfo, ModelInfoBase
 from ramalama.quadlet import Quadlet
+from ramalama.kserve import Kserve
 from ramalama.version import version
 
 MODEL_TYPES = ["file", "https", "http", "oci", "huggingface", "hf", "ollama"]
@@ -360,7 +361,6 @@ def get_model_path(self, args):
 
         if args.dryrun:
             return "/path/to/model"
-
         model_path = self.pull(args)
 
         return model_path
@@ -486,16 +486,15 @@ def handle_runtime(self, args, exec_args, exec_model_path):
 
     def generate_container_config(self, model_path, args, exec_args):
         self.image = self._image(args)
+        if args.generate == "kserve":
+            return self.kserve(model_path, args, exec_args)
         if args.generate == "quadlet":
-            self.quadlet(model_path, args, exec_args)
-        elif args.generate == "kube":
-            self.kube(model_path, args, exec_args)
-        elif args.generate == "quadlet/kube":
-            self.quadlet_kube(model_path, args, exec_args)
-        else:
-            return False
-
-        return True
+            return self.quadlet(model_path, args, exec_args)
+        if args.generate == "kube":
+            return self.kube(model_path, args, exec_args)
+        if args.generate == "quadlet/kube":
+            return self.quadlet_kube(model_path, args, exec_args)
+        return False
 
     def execute_command(self, model_path, exec_args, args):
         try:
@@ -526,19 +525,27 @@ def serve(self, args):
 
         self.execute_command(model_path, exec_args, args)
 
+    def kserve(self, model, args, exec_args):
+        kserve = Kserve(model, self.image, args, exec_args)
+        kserve.generate()
+        return True
+
     def quadlet(self, model, args, exec_args):
         quadlet = Quadlet(model, self.image, args, exec_args)
         quadlet.generate()
+        return True
 
     def quadlet_kube(self, model, args, exec_args):
         kube = Kube(model, self.image, args, exec_args)
         kube.generate()
         quadlet = Quadlet(model, self.image, args, exec_args)
         quadlet.kube()
+        return True
 
     def kube(self, model, args, exec_args):
         kube = Kube(model, self.image, args, exec_args)
         kube.generate()
+        return True
 
     def path(self, args):
         return self.model_path(args)
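
With every generator now returning True, generate_container_config() reports whether any --generate format matched. A condensed, hypothetical sketch of how a caller consumes that boolean; the actual serve() wiring is not part of this diff:

```
# Hypothetical caller sketch, not the real serve() body.
def serve_or_generate(model, model_path, args, exec_args):
    if model.generate_container_config(model_path, args, exec_args):
        return  # kserve/kube/quadlet files were written; nothing to run
    model.execute_command(model_path, exec_args, args)
```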
