From a7bd59d3c8b77d66400870d51126ad1073fdce23 Mon Sep 17 00:00:00 2001
From: Brian
Date: Thu, 18 Sep 2025 15:08:57 -0400
Subject: [PATCH] feat: added new flag default-template for models to use tools

Signed-off-by: Brian
---
 docs/ramalama-run.1.md                     |  3 ++
 docs/ramalama-serve.1.md                   |  3 ++
 ramalama/cli.py                            |  7 ++++
 ramalama/daemon/service/command_factory.py |  9 +++--
 ramalama/model.py                          | 40 +++++++++++++++-------
 5 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/docs/ramalama-run.1.md b/docs/ramalama-run.1.md
index 8ed75f898..f3263e93b 100644
--- a/docs/ramalama-run.1.md
+++ b/docs/ramalama-run.1.md
@@ -43,6 +43,9 @@ Possible values are "never", "always" and "auto". (default: auto)
 #### **--ctx-size**, **-c**
 size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 4096, 0 = loaded from model)
 
+#### **--default-template**
+Use the default chat template instead of model-specific chat template files. When specified, RamaLama will not use any extracted chat template files from the model and will rely on the runtime's built-in default template handling.
+
 #### **--device**
 Add a host device to the container. Optional permissions parameter can
 be used to specify device permissions by combining r for read, w for
diff --git a/docs/ramalama-serve.1.md b/docs/ramalama-serve.1.md
index 74478a195..45344a377 100644
--- a/docs/ramalama-serve.1.md
+++ b/docs/ramalama-serve.1.md
@@ -63,6 +63,9 @@ Min chunk size to attempt reusing from the cache via KV shifting
 #### **--ctx-size**, **-c**
 size of the prompt context. This option is also available as **--max-model-len**. Applies to llama.cpp and vllm regardless of alias (default: 4096, 0 = loaded from model)
 
+#### **--default-template**
+Use the default chat template instead of model-specific chat template files. When specified, RamaLama will not use any extracted chat template files from the model and will rely on the runtime's built-in default template handling.
+
 #### **--detach**, **-d**
 Run the container in the background and print the new container ID. The default is TRUE.
 The --nocontainer option forces this option to False.
diff --git a/ramalama/cli.py b/ramalama/cli.py
index 8967c57fd..a42ce7b40 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -892,6 +892,13 @@ def runtime_options(parser, command):
         help="enable/disable thinking mode in reasoning models",
         action=CoerceToBool,
     )
+    if command in ["run", "serve"]:
+        parser.add_argument(
+            "--default-template",
+            dest="default_template",
+            action="store_true",
+            help="use the default chat template instead of model-specific chat template files",
+        )
     parser.add_argument(
         "--oci-runtime",
         help="override the default OCI runtime used to launch the container",
diff --git a/ramalama/daemon/service/command_factory.py b/ramalama/daemon/service/command_factory.py
index cade75fc0..0cde3de92 100644
--- a/ramalama/daemon/service/command_factory.py
+++ b/ramalama/daemon/service/command_factory.py
@@ -82,9 +82,12 @@ def _build_llama_serve_command(self) -> list[str]:
         else:
             cmd += ["--jinja"]
 
-        chat_template_path = self.model._get_chat_template_path(False, False, False)
-        if chat_template_path:
-            cmd += ["--chat-template-file", chat_template_path]
+        # Add chat template unless using default template
+        use_default_template = self.request_args.get("default_template", False)
+        if not use_default_template:
+            chat_template_path = self.model._get_chat_template_path(False, False, False)
+            if chat_template_path:
+                cmd += ["--chat-template-file", chat_template_path]
 
         cmd += [
             "--alias",
diff --git a/ramalama/model.py b/ramalama/model.py
index ba38dbcac..6cf99c992 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -647,9 +647,12 @@ def llama_serve(self, args):
         else:
             exec_args += ["--jinja"]
 
-        chat_template_path = self._get_chat_template_path(args.container, args.generate, args.dryrun)
-        if chat_template_path is not None:
-            exec_args += ["--chat-template-file", chat_template_path]
+        # Add chat template unless using default template
+        use_default_template = getattr(args, 'default_template', False)
+        if not use_default_template:
+            chat_template_path = self._get_chat_template_path(args.container, args.generate, args.dryrun)
+            if chat_template_path is not None:
+                exec_args += ["--chat-template-file", chat_template_path]
 
         if should_colorize():
             exec_args += ["--log-colors", "on"]
@@ -739,17 +742,28 @@ def handle_runtime(self, args, exec_args):
     def generate_container_config(self, args, exec_args):
         # Get the blob paths (src) and mounted paths (dest)
         model_src_path = self._get_entry_model_path(False, False, args.dryrun)
-        chat_template_src_path = self._get_chat_template_path(False, False, args.dryrun)
         mmproj_src_path = self._get_mmproj_path(False, False, args.dryrun)
         model_dest_path = self._get_entry_model_path(True, True, args.dryrun)
-        chat_template_dest_path = self._get_chat_template_path(True, True, args.dryrun)
         mmproj_dest_path = self._get_mmproj_path(True, True, args.dryrun)
+
+        # Get chat template paths unless using default template
+        use_default_template = getattr(args, 'default_template', False)
+        if use_default_template:
+            chat_template_src_path = None
+            chat_template_dest_path = None
+        else:
+            chat_template_src_path = self._get_chat_template_path(False, False, args.dryrun)
+            chat_template_dest_path = self._get_chat_template_path(True, True, args.dryrun)
+
+        # Prepare chat template paths tuple or None
+        chat_template_paths = None if chat_template_src_path is None else (chat_template_src_path, chat_template_dest_path)
+        mmproj_paths = None if mmproj_src_path is None else (mmproj_src_path, mmproj_dest_path)
 
         if args.generate.gen_type == "quadlet":
             self.quadlet(
                 (model_src_path, model_dest_path),
-                (chat_template_src_path, chat_template_dest_path),
-                (mmproj_src_path, mmproj_dest_path),
+                chat_template_paths,
+                mmproj_paths,
                 args,
                 exec_args,
                 args.generate.output_dir,
@@ -757,8 +771,8 @@
         elif args.generate.gen_type == "kube":
             self.kube(
                 (model_src_path, model_dest_path),
-                (chat_template_src_path, chat_template_dest_path),
-                (mmproj_src_path, mmproj_dest_path),
+                chat_template_paths,
+                mmproj_paths,
                 args,
                 exec_args,
                 args.generate.output_dir,
@@ -766,8 +780,8 @@
         elif args.generate.gen_type == "quadlet/kube":
             self.quadlet_kube(
                 (model_src_path, model_dest_path),
-                (chat_template_src_path, chat_template_dest_path),
-                (mmproj_src_path, mmproj_dest_path),
+                chat_template_paths,
+                mmproj_paths,
                 args,
                 exec_args,
                 args.generate.output_dir,
@@ -775,8 +789,8 @@
         elif args.generate.gen_type == "compose":
             self.compose(
                 (model_src_path, model_dest_path),
-                (chat_template_src_path, chat_template_dest_path),
-                (mmproj_src_path, mmproj_dest_path),
+                chat_template_paths,
+                mmproj_paths,
                 args,
                 exec_args,
                 args.generate.output_dir,
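
A minimal usage sketch of the flag documented in the man-page hunks above; the
model name and port below are illustrative placeholders, not part of this patch.

    # Hypothetical invocations once this patch is applied: skip any chat
    # template file extracted from the model and let the runtime fall back
    # to its built-in default template handling.
    ramalama run --default-template granite
    ramalama serve --default-template --port 8080 granite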