- reverted API changes: this package is standalone

vMaroon · vMaroon · commit 8a60b221a17c · 2025-07-20T00:52:45.000+03:00
Signed-off-by: Maroon Ayoub <maroon.ayoub@ibm.com>
diff --git a/Makefile b/Makefile
@@ -23,6 +23,7 @@ help: ## Print help
 	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf "  \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
 
 ##@ Tokenizer & Linking
+
 LDFLAGS ?= -extldflags '-L$(shell pwd)/lib'
 CGO_ENABLED=1
 TOKENIZER_LIB = lib/libtokenizers.a
@@ -82,7 +83,6 @@ e2e-test: download-tokenizer
 	@printf "\033[33;1m==== Running unit tests ====\033[0m\n"
 	go test -v -ldflags="$(LDFLAGS)" ./tests/...
 
-
 ##@ Build
 
 .PHONY: build
diff --git a/examples/kv_cache_index/main.go b/examples/kv_cache_index/main.go
@@ -115,7 +115,7 @@ func runPrompts(ctx context.Context, kvCacheIndexer *kvcache.Indexer) error {
 	logger.Info("Started Indexer", "model", modelName)
 
 	// Get pods for the prompt
-	pods, err := kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, modelName, nil, false)
+	pods, err := kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, modelName, nil)
 	if err != nil {
 		return err
 	}
@@ -136,7 +136,7 @@ func runPrompts(ctx context.Context, kvCacheIndexer *kvcache.Indexer) error {
 	time.Sleep(3 * time.Second)
 
 	// Get pods for the prompt
-	pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, modelName, nil, false)
+	pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, modelName, nil)
 	if err != nil {
 		return err
 	}
diff --git a/examples/kv_events/offline/main.go b/examples/kv_events/offline/main.go
@@ -152,7 +152,7 @@ func runEventsDemo(ctx context.Context, kvCacheIndexer *kvcache.Indexer, publish
 	logger.Info("@@@ Starting KV Events Demo", "model", testdata.ModelName)
 
 	// Initial query - should be empty since no events have been published
-	pods, err := kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil, false)
+	pods, err := kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil)
 	if err != nil {
 		return err
 	}
@@ -185,7 +185,7 @@ func runEventsDemo(ctx context.Context, kvCacheIndexer *kvcache.Indexer, publish
 	time.Sleep(3 * time.Second)
 
 	// Query again to see the effect of the events
-	pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil, false)
+	pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil)
 	if err != nil {
 		return err
 	}
@@ -214,7 +214,7 @@ func runEventsDemo(ctx context.Context, kvCacheIndexer *kvcache.Indexer, publish
 	time.Sleep(3 * time.Second)
 
 	// Final query
-	pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil, false)
+	pods, err = kvCacheIndexer.GetPodScores(ctx, testdata.Prompt, testdata.ModelName, nil)
 	if err != nil {
 		return err
 	}
diff --git a/examples/kv_events/online/main.go b/examples/kv_events/online/main.go
@@ -147,7 +147,7 @@ func main() {
 			return
 		}
 
-		pods, err := kvCacheIndexer.GetPodScores(ctx, req.Prompt, modelName, nil, false)
+		pods, err := kvCacheIndexer.GetPodScores(ctx, req.Prompt, modelName, nil)
 		if err != nil {
 			http.Error(w, fmt.Sprintf("error: %v", err), http.StatusInternalServerError)
 			return
diff --git a/pkg/kvcache/indexer.go b/pkg/kvcache/indexer.go
@@ -18,15 +18,13 @@ package kvcache
 
 import (
 	"context"
-	"encoding/json"
 	"fmt"
 
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/klog/v2"
 
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvblock"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization"
-	chattemplatego "github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization/chat_template_go"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization/prefixstore"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/utils/logging"
 )
@@ -117,50 +115,14 @@ func (k *Indexer) KVBlockIndex() kvblock.Index {
 //
 // The function returns a map of pod identifiers to scores.
 func (k *Indexer) GetPodScores(ctx context.Context, prompt, modelName string,
-	podIdentifiers []string, chatCompletion bool,
+	podIdentifiers []string,
 ) (map[string]int, error) {
 	traceLogger := klog.FromContext(ctx).V(logging.TRACE).WithName("kvcache.GetPodScores")
-
-	// Handle chat completion requests
-	if chatCompletion {
-		// Parse the prompt as a ChatTemplateRequest JSON
-		var req chattemplatego.ChatTemplateRequest
-		if err := json.Unmarshal([]byte(prompt), &req); err != nil {
-			return nil, fmt.Errorf("failed to parse chat template request: %w", err)
-		}
-
-		// Create or reuse the CGo wrapper (could be a singleton in production)
-		// TODO: cache, instance management
-		wrapper := chattemplatego.NewChatTemplateCGoWrapper()
-
-		// Fetch the chat template for the model (if not already set)
-		if req.ChatTemplate == "" {
-			getReq := chattemplatego.GetChatTemplateRequest{ModelName: modelName}
-			template, template_vars, err := wrapper.GetModelChatTemplate(getReq)
-			if err != nil {
-				return nil, fmt.Errorf("failed to fetch chat template: %w", err)
-			}
-			req.ChatTemplate = template
-			req.TemplateVars = template_vars
-		}
-
-		// Apply the template to the request
-		resp, err := wrapper.RenderChatTemplate(req)
-		if err != nil {
-			return nil, fmt.Errorf("failed to render chat template: %w", err)
-		}
-		if len(resp.RenderedChats) == 0 {
-			return nil, nil
-		}
-		prompt = resp.RenderedChats[0]
-	}
-
 	// 0. add to tokenizers pool
 	k.tokenizersPool.AddTask(prompt, modelName)
 
 	// 1. get available tokens of longest prefix
 	tokens := k.tokensIndexer.FindLongestContainedTokens(prompt, modelName)
-
 	if len(tokens) == 0 {
 		//nolint:nilnil // no need to return an error
 		return nil, nil
@@ -188,14 +150,6 @@ func (k *Indexer) GetPodScores(ctx context.Context, prompt, modelName string,
 	return podScores, nil
 }
 
-// GetPodScoresDefault is a convenience function for backward compatibility
-// that calls GetPodScores with chatCompletion=false
-func (k *Indexer) GetPodScoresDefault(ctx context.Context, prompt, modelName string,
-	podIdentifiers []string,
-) (map[string]int, error) {
-	return k.GetPodScores(ctx, prompt, modelName, podIdentifiers, false)
-}
-
 // podsPerKeyPrintHelper formats a map of keys to pod names for printing.
 func podsPerKeyPrintHelper(ks map[kvblock.Key][]string) string {
 	flattened := ""
diff --git a/pkg/preprocessing/chat_completions_template/README.md b/pkg/preprocessing/chat_completions_template/README.md
@@ -0,0 +1,85 @@
+# Chat Template Integration for OpenAI-API v1/chat_completions Compatibility
+
+## Why Templating is Needed
+
+When processing OpenAI ChatCompletions requests, vLLM applies the model's chat template to the input before tokenization.
+For KV-cache lookups to work correctly, we must replicate this templating process before passing the prompt to our indexer.
+
+**Example:**
+```json
+// Input: ChatCompletions request
+{
+  "messages": [
+    {"role": "user", "content": "What's 2+2?"},
+    {"role": "assistant", "content": "Let me calculate that."},
+    {"role": "user", "content": "Thanks!"}
+  ]
+}
+```
+
+```jinja2
+{# Model template (e.g., Llama-2) #}
+{% for message in messages %}
+{% if message['role'] == 'user' %}
+{{ '<s>[INST] ' + message['content'] + ' [/INST]' }}
+{% elif message['role'] == 'assistant' %}
+{{ message['content'] + '</s>' }}
+{% endif %}
+{% endfor %}
+```
+
+```text
+<!-- Flattened prompt the model actually sees -->
+<s>[INST] What's 2+2? [/INST]Let me calculate that.</s><s>[INST] Thanks! [/INST]
+```
+
+**Without templating**, we cannot reproduce the tokens that vLLM produces, which leads to incorrect KV-cache lookups.
+
+## Integration with Existing Pipeline
+
+This package provides a library to be used for templating before using the `kvcache.Indexer` entry point.
+
+### Requirements
+
+The router can receive a standard OpenAI ChatCompletions request and convert it to a JSON string representing our `ChatTemplateRequest`.
+
+**ChatTemplateRequest accepts these fields:**
+- `Conversations` - List of message lists (role/content pairs)
+- `Tools` - (Optional) List of tool schemas
+- `Documents` - (Optional) List of document dicts
+- `ChatTemplate` - (Optional) Override for the chat template
+- `ReturnAssistantTokensMask` - (Optional) Whether to return assistant token indices
+- `ContinueFinalMessage` - (Optional) Whether to continue from the final message
+- `AddGenerationPrompt` - (Optional) Whether to add a generation prompt
+- `TemplateVars` - (Optional) Special tokens for template rendering
+
+### Template Processing Flow
+
+The templating process (steps 1.1-1.4) handles the conversion from structured request to flattened prompt:
+
+```
+1.1. **CGO Binding**: chattemplatego.NewChatTemplateCGoWrapper()
+    └── cgo_functions.go:NewChatTemplateCGoWrapper()
+        └── Creates ChatTemplateCGoWrapper struct with initialized=false
+
+1.2. **Template Fetching**: wrapper.GetModelChatTemplate(getReq)
+    ├── cgo_functions.go:GetModelChatTemplate(req)
+    │   ├── Initialize() Python interpreter via CGO
+    │   ├── executePythonCode() - **CGO Binding** to Python
+    │   └── **Python Wrapper**: chat_template_wrapper.py:get_model_chat_template()
+    │       └── Uses Hugging Face AutoTokenizer to fetch model template
+    └── Returns: (template, template_vars)
+
+1.3. **Template Rendering**: wrapper.RenderChatTemplate(req)
+    ├── cgo_functions.go:RenderChatTemplate(req)
+    │   ├── Initialize() Python interpreter via CGO (if not already done)
+    │   ├── executePythonCode() - **CGO Binding** to Python
+    │   └── **Python Wrapper**: chat_template_wrapper.py:render_jinja_template()
+    │       └── Imports render_jinja_template from transformers.utils.chat_template_utils
+    │           └── Uses transformers library's core template rendering functionality
+    └── Returns: ChatTemplateResponse
+
+1.4. **Extract Flattened Prompt**
+    ├── prompt := resp.RenderedChats[0]
+    └── Continue with existing pipeline: Tokenize → KV Block Keys → Pod Scoring
+```
diff --git a/pkg/preprocessing/chat_completions_template/cgo_functions.go b/pkg/preprocessing/chat_completions_template/cgo_functions.go
@@ -1,10 +1,28 @@
-package chattemplatego
+//go:build exclude
+
+/*
+Copyright 2025 The llm-d Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package chat_completions_template
 
 /*
 // CGo build flags for Python 3.11
-// These are platform-specific and may need adjustment for different systems
-#cgo CFLAGS: -I/Library/Frameworks/Python.framework/Versions/3.11/include/python3.11
-#cgo LDFLAGS: -L/Library/Frameworks/Python.framework/Versions/3.11/lib -lpython3.11
+// TODO: proper setup
+// #cgo CFLAGS: -I/Library/Frameworks/Python.framework/Versions/3.11/include/python3.11
+// #cgo LDFLAGS: -L/Library/Frameworks/Python.framework/Versions/3.11/lib -lpython3.11
 #include "cgo_functions.h"
 */
 import "C"
diff --git a/pkg/preprocessing/chat_completions_template/cgo_functions.h b/pkg/preprocessing/chat_completions_template/cgo_functions.h
@@ -1,3 +1,19 @@
+/*
+Copyright 2025 The llm-d Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
 #ifndef CGO_FUNCTIONS_H
 #define CGO_FUNCTIONS_H
 
diff --git a/pkg/preprocessing/chat_completions_template/chat_template_wrapper.py b/pkg/preprocessing/chat_completions_template/chat_template_wrapper.py
@@ -1,3 +1,17 @@
+# Copyright 2025 The llm-d Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 #!/usr/bin/env python3
 """
 Standalone wrapper for render_jinja_template function from transformers.
diff --git a/pkg/preprocessing/chat_completions_template/requirements.txt b/pkg/preprocessing/chat_completions_template/requirements.txt
diff --git a/pkg/tokenization/chat_template_go/README_CHATCOMPLETIONS_KVCACHE.md b/pkg/tokenization/chat_template_go/README_CHATCOMPLETIONS_KVCACHE.md
diff --git a/tests/e2e/redis_mock/e2e_suite_test.go b/tests/e2e/redis_mock/e2e_suite_test.go
diff --git a/tests/e2e/redis_mock/e2e_test.go b/tests/e2e/redis_mock/e2e_test.go

Original file line number	Diff line number	Diff line change
`@@ -147,7 +147,7 @@ func main() {`
`147`	`147`	`return`
`148`	`148`	`}`
`149`	`149`
`150`		`- pods, err := kvCacheIndexer.GetPodScores(ctx, req.Prompt, modelName, nil, false)`
	`150`	`+ pods, err := kvCacheIndexer.GetPodScores(ctx, req.Prompt, modelName, nil)`
`151`	`151`	`if err != nil {`
`152`	`152`	`http.Error(w, fmt.Sprintf("error: %v", err), http.StatusInternalServerError)`
`153`	`153`	`return`