Skip to content

Commit c6ea837

Browse files
committed
Integrated daemon into RamaLama CLI
Integrated the RamaLama daemon into the CLI. The --use-daemon option is used as a feature flag to enable its usage by the serve and run commands. Signed-off-by: Michael Engel <[email protected]>
1 parent 97fcbe5 commit c6ea837

File tree

18 files changed

+438
-133
lines changed

18 files changed

+438
-133
lines changed

docs/ramalama.1.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,10 @@ The default can be overridden in the ramalama.conf file.
132132
store AI Models in the specified directory (default rootless: `$HOME/.local/share/ramalama`, default rootful: `/var/lib/ramalama`)
133133
The default can be overridden in the ramalama.conf file.
134134

135+
#### **--use-daemon**
136+
Feature flag to enable using the RamaLama daemon as backend for [ramalama-serve(1)](ramalama-serve.1.md) and [ramalama-run(1)](ramalama-run.1.md).
137+
138+
135139
## COMMANDS
136140

137141
| Command | Description |

docsite/docs/commands/ramalama/ramalama.mdx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,9 @@ The default can be overridden in the ramalama.conf file.
142142
store AI Models in the specified directory (default rootless: `$HOME/.local/share/ramalama`, default rootful: `/var/lib/ramalama`)
143143
The default can be overridden in the ramalama.conf file.
144144

145+
#### **--use-daemon**
146+
Feature flag to enable using the RamaLama daemon as backend for [ramalama-serve(1)](/docs/commands/ramalama/serve) and [ramalama-run(1)](/docs/commands/ramalama/run).
147+
145148
## COMMANDS
146149

147150
| Command | Description |

ramalama/cli.py

Lines changed: 44 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
get_inference_spec_files,
4040
load_file_config,
4141
)
42+
from ramalama.daemon_stub import DaemonStub, run_daemon
4243
from ramalama.endian import EndianMismatchError
4344
from ramalama.logger import configure_logger, logger
4445
from ramalama.model_inspect.error import ParseError
@@ -218,6 +219,7 @@ def create_argument_parser(description: str):
218219
prog="ramalama",
219220
description=description,
220221
formatter_class=argparse.RawTextHelpFormatter,
222+
exit_on_error=False,
221223
)
222224
configure_arguments(parser)
223225
return parser
@@ -279,6 +281,21 @@ def configure_arguments(parser):
279281
help=argparse.SUPPRESS,
280282
)
281283

284+
# Feature flags
285+
parser.add_argument(
286+
"--use-daemon",
287+
dest="use_daemon",
288+
default=False,
289+
action="store_true",
290+
help="Feature Flag: Enable using the daemon as backend by default",
291+
)
292+
parser.add_argument(
293+
"--daemon-name",
294+
dest="daemon_name",
295+
help="Requires --use-daemon. Specifies the name of the daemon either for creation or further use.",
296+
completer=suppressCompleter,
297+
)
298+
282299

283300
def configure_subcommands(parser):
284301
"""Add subcommand parsers to the main argument parser."""
@@ -499,7 +516,14 @@ def containers_parser(subparsers):
499516

500517

501518
def list_containers(args):
502-
containers = engine.containers(args)
519+
label = engine.LABEL_CONTAINER_RAMALAMA
520+
521+
# Feature Flag:
522+
# Use daemon backend when feature flag is given
523+
if args.use_daemon:
524+
label = engine.LABEL_CONTAINER_RAMALAMA_DAEMON
525+
526+
containers = engine.containers(args, label)
503527
if len(containers) == 0:
504528
return
505529
print("\n".join(containers))
@@ -1137,6 +1161,14 @@ def run_cli(args):
11371161
except Exception as exc:
11381162
raise e from exc
11391163

1164+
# Feature Flag:
1165+
# Use daemon backend when feature flag is given
1166+
if args.use_daemon:
1167+
stub = DaemonStub(shortnames, args, sys.argv)
1168+
serve_path = stub.serve_model(getattr(args, "daemon_name"))
1169+
stub.chat(serve_path)
1170+
return
1171+
11401172
if args.rag:
11411173
if not args.container:
11421174
raise ValueError("ramalama run --rag cannot be run with the --nocontainer option.")
@@ -1155,6 +1187,15 @@ def serve_parser(subparsers):
11551187

11561188

11571189
def serve_cli(args):
1190+
1191+
# Feature Flag:
1192+
# Use daemon backend when feature flag is given
1193+
if args.use_daemon:
1194+
stub = DaemonStub(shortnames, args, sys.argv)
1195+
serve_path = stub.serve_model(getattr(args, "daemon_name"))
1196+
stub.wait_for_model(serve_path)
1197+
return
1198+
11581199
if not args.container:
11591200
args.detach = False
11601201

@@ -1269,47 +1310,11 @@ def daemon_parser(subparsers) -> None:
12691310

12701311

12711312
def daemon_start_cli(args):
1272-
from ramalama.common import exec_cmd
1273-
1274-
daemon_cmd = []
1275-
daemon_model_store_dir = args.store
1276-
is_daemon_in_container = args.container and args.engine in get_args(SUPPORTED_ENGINES)
1277-
1278-
if is_daemon_in_container:
1279-
# If run inside a container, map the model store to the container internal directory
1280-
daemon_model_store_dir = "/ramalama/models"
1281-
1282-
daemon_cmd += [
1283-
args.engine,
1284-
"run",
1285-
"--pull",
1286-
args.pull,
1287-
"-d",
1288-
"-p",
1289-
f"{args.port}:8080",
1290-
"-v",
1291-
f"{args.store}:{daemon_model_store_dir}",
1292-
args.image,
1293-
]
1294-
1295-
daemon_cmd += [
1296-
"ramalama",
1297-
"--store",
1298-
daemon_model_store_dir,
1299-
"daemon",
1300-
"run",
1301-
"--port",
1302-
"8080" if is_daemon_in_container else args.port,
1303-
"--host",
1304-
CONFIG.host if is_daemon_in_container else args.host,
1305-
]
1306-
exec_cmd(daemon_cmd)
1313+
DaemonStub(shortnames, args, sys.argv).start_daemon()
13071314

13081315

13091316
def daemon_run_cli(args):
1310-
from ramalama.daemon.daemon import run
1311-
1312-
run(host=args.host, port=int(args.port), model_store_path=args.store)
1317+
run_daemon(args)
13131318

13141319

13151320
def version_parser(subparsers):

ramalama/common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,8 +286,8 @@ def verify_checksum(filename: str) -> bool:
286286
return sha256_hash.hexdigest() == expected_checksum
287287

288288

289-
def genname():
290-
return "ramalama_" + "".join(random.choices(string.ascii_letters + string.digits, k=10))
289+
def genname(prefix: str = "ramalama"):
    """Return a unique name of the form '<prefix>_' plus 10 random alphanumeric characters."""
    alphabet = string.ascii_letters + string.digits
    suffix = "".join(random.choice(alphabet) for _ in range(10))
    return f"{prefix}_{suffix}"
291291

292292

293293
def engine_version(engine: SUPPORTED_ENGINES) -> str:

ramalama/daemon/client.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import http
2+
import json
3+
import urllib.error
4+
import urllib.parse
5+
import urllib.request
6+
from typing import Any, Optional, Tuple
7+
8+
from ramalama.daemon.dto.model import ModelResponse, RunningModelResponse
9+
from ramalama.daemon.dto.serve import ServeRequest, ServeResponse, StopServeRequest
10+
from ramalama.logger import logger
11+
12+
13+
class DaemonAPIError(Exception):
    """Raised when a call to the RamaLama daemon REST API fails.

    Carries the failure reason and, when the failure came from an HTTP
    response, the status code.
    """

    def __init__(self, reason: str, code: Optional[http.HTTPStatus] = None, *args):
        super().__init__(*args)
        self.reason = reason
        self.code = code

    def __str__(self):
        # Include the status code only when one was supplied.
        status = f" ({self.code})" if self.code else ""
        return f"Call to daemon API failed{status}: {self.reason}"
25+
26+
27+
class DaemonClient:
28+
29+
def __init__(self, host: str, port: int):
30+
self.host = host
31+
self.port = port
32+
33+
@property
34+
def base_url(self) -> str:
35+
return f"{self.host}:{self.port}"
36+
37+
def list_available_models(self) -> list[ModelResponse]:
38+
url = f"http://{self.base_url}/api/tags"
39+
resp, _ = DaemonClient.call_api(url)
40+
if resp:
41+
return [ModelResponse(**model) for model in resp["models"]]
42+
43+
def list_running_models(self) -> list[RunningModelResponse]:
44+
url = f"http://{self.base_url}/api/ps"
45+
resp, _ = DaemonClient.call_api(url)
46+
if resp:
47+
return [RunningModelResponse(**model) for model in resp["models"]]
48+
49+
def start_model(self, model_name: str, runtime: str, exec_args: list[str]) -> Optional[str]:
50+
url = f"http://{self.base_url}/api/serve"
51+
request = ServeRequest(model_name, runtime, exec_args).to_dict()
52+
resp, _ = DaemonClient.call_api(url, method=http.HTTPMethod.POST, json_data=request)
53+
if resp:
54+
return f"http://{self.base_url}{ServeResponse(**resp).serve_path}"
55+
return None
56+
57+
def stop_model(self, model_name: str) -> Optional[str]:
58+
url = f"http://{self.base_url}/api/stop"
59+
request = StopServeRequest(model_name).to_dict()
60+
DaemonClient.call_api(url, method=http.HTTPMethod.POST, json_data=request)
61+
62+
def is_healthy(self) -> bool:
63+
url = f"http://{self.base_url}/api/health"
64+
try:
65+
_, code = DaemonClient.call_api(url)
66+
logger.debug(f"Health check success, code: {code}")
67+
return code == http.HTTPStatus.NO_CONTENT
68+
except DaemonAPIError as e:
69+
logger.debug(f"Health check failed: {e}")
70+
return False
71+
72+
@staticmethod
73+
def call_api(
74+
url: str, method: http.HTTPMethod = http.HTTPMethod.GET, headers=None, params=None, json_data=None, timeout=10
75+
) -> Tuple[Any | None, http.HTTPStatus]:
76+
headers = headers or {}
77+
78+
if params:
79+
query_string = urllib.parse.urlencode(params)
80+
separator = '&' if '?' in url else '?'
81+
url = f"{url}{separator}{query_string}"
82+
83+
body = None
84+
if json_data is not None:
85+
body = json.dumps(json_data).encode('utf-8')
86+
headers['Content-Type'] = 'application/json'
87+
88+
req = urllib.request.Request(url, data=body, headers=headers, method=method.value)
89+
try:
90+
with urllib.request.urlopen(req, timeout=timeout) as response:
91+
response_code = response.getcode()
92+
response_data = response.read().decode('utf-8')
93+
try:
94+
return json.loads(response_data), response_code
95+
except json.JSONDecodeError:
96+
return response_data, response_code
97+
except urllib.error.HTTPError as e:
98+
raise DaemonAPIError(e.reason, e.code)
99+
except urllib.error.URLError as e:
100+
raise DaemonAPIError(e.reason)
101+
except ConnectionResetError:
102+
raise DaemonAPIError("Connection reset")
103+
except Exception as e:
104+
raise DaemonAPIError(f"Unexpected error occurred: {e}")

ramalama/daemon/daemon.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from datetime import datetime, timedelta
88

99
from ramalama.daemon.handler.ramalama import RamalamaHandler
10-
from ramalama.daemon.logging import LogLevel, configure_logger, logger
10+
from ramalama.daemon.logger import LogLevel, configure_logger, logger
1111
from ramalama.daemon.service.model_runner import ModelRunner
1212

1313

@@ -61,26 +61,22 @@ def finish_request(self, request, client_address):
6161

6262
def check_model_expiration(self):
6363
curr_time = datetime.now()
64-
for name, m in self.model_runner.managed_models.items():
64+
65+
for id in list(self.model_runner.managed_models.keys()):
66+
m = self.model_runner.managed_models[id]
6567
if m.expiration_date > curr_time:
6668
continue
6769

6870
try:
69-
logger.info(f"Stopping expired model '{name}'...")
71+
logger.info(f"Stopping expired model '{m.model.model_organization}/{m.model.model_name}'...")
7072
self.model_runner.stop_model(m.id)
7173
except Exception as e:
72-
logger.error(f"Failed to stop expired model '{name}': {e}")
74+
logger.error(f"Failed to stop expired model '{m.model.model_organization}/{m.model.model_name}': {e}")
7375

7476
def shutdown(self):
7577
logger.info("Shutting down ramalama daemon...")
7678

77-
for name, managed_model in self.model_runner.managed_models.items():
78-
try:
79-
logger.info(f"Stopping model runner {name}...")
80-
self.model_runner.stop_model(managed_model.id)
81-
except Exception as e:
82-
logger.error(f"Error stopping model runner {name}: {e}")
83-
79+
self.model_runner.stop()
8480
super().shutdown()
8581

8682

ramalama/daemon/dto/model.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class RunningModelResponse:
7878
size_vram: int
7979
digest: str
8080
cmd: str
81+
serve_path: str
8182

8283
def to_dict(self) -> dict:
8384
return {
@@ -91,6 +92,7 @@ def to_dict(self) -> dict:
9192
"size_vram": self.size_vram,
9293
"digest": self.digest,
9394
"cmd": self.cmd,
95+
"serve_path": self.serve_path,
9496
}
9597

9698
def serialize(self) -> str:

ramalama/daemon/dto/serve.py

Lines changed: 3 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import json
22
from dataclasses import dataclass
33

4-
from ramalama.config import BaseConfig
54
from ramalama.daemon.dto.errors import MissingArgumentError
65

76

@@ -10,17 +9,13 @@ class ServeRequest:
109

1110
model_name: str
1211
runtime: str
13-
exec_args: dict[str, str]
12+
exec_args: list[str]
1413

1514
def to_dict(self) -> dict:
1615
return {
1716
"model_name": self.model_name,
1817
"runtime": self.runtime,
19-
"exec_args": dict(
20-
[
21-
(key, value) for key, value in self.exec_args.items() if type(value) is str
22-
] # Filter out non-string values
23-
),
18+
"exec_args": [entry for entry in self.exec_args],
2419
}
2520

2621
def serialize(self) -> str:
@@ -38,27 +33,7 @@ def from_string(data: str) -> "ServeRequest":
3833
if not runtime:
3934
raise MissingArgumentError("runtime")
4035

41-
base_exec_args = BaseConfig().__dict__
42-
exec_args_input = data_dict.get("exec_args", {})
43-
# merge missing args to args from base config
44-
exec_args = {
45-
**base_exec_args,
46-
**{
47-
"runtime_args": [],
48-
"debug": False,
49-
},
50-
}
51-
# overwrite with input args
52-
exec_args = {**exec_args, **exec_args_input}
53-
# overwrite certain values which do not make sense in this context
54-
exec_args = {
55-
**exec_args,
56-
**{
57-
"container": False,
58-
"generate": False,
59-
"dryrun": False,
60-
},
61-
}
36+
exec_args = data_dict.get("exec_args", [])
6237

6338
return ServeRequest(
6439
model_name=model_name,

0 commit comments

Comments
 (0)