Skip to content

Commit 870a37f

Browse files
feat: add base64 encoded PDF support for OpenAI Chat Completions (llamastack#2881)
# What does this PR do? OpenAI Chat Completions supports passing a base64 encoded PDF file to a model, but Llama Stack currently does not allow for this behavior. This PR extends our implementation of the OpenAI API spec to change that. Closes llamastack#2129 ## Test Plan A new functional test has been added to test the validity of such a request Signed-off-by: Nathan Weinberg <[email protected]>
1 parent cf87220 commit 870a37f

File tree

6 files changed

+1551
-1237
lines changed

6 files changed

+1551
-1237
lines changed

docs/_static/llama-stack-spec.html

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9821,13 +9821,17 @@
98219821
},
98229822
{
98239823
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
9824+
},
9825+
{
9826+
"$ref": "#/components/schemas/OpenAIFile"
98249827
}
98259828
],
98269829
"discriminator": {
98279830
"propertyName": "type",
98289831
"mapping": {
98299832
"text": "#/components/schemas/OpenAIChatCompletionContentPartTextParam",
9830-
"image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
9833+
"image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam",
9834+
"file": "#/components/schemas/OpenAIFile"
98319835
}
98329836
}
98339837
},
@@ -9974,6 +9978,41 @@
99749978
"title": "OpenAIDeveloperMessageParam",
99759979
"description": "A message from the developer in an OpenAI-compatible chat completion request."
99769980
},
9981+
"OpenAIFile": {
9982+
"type": "object",
9983+
"properties": {
9984+
"type": {
9985+
"type": "string",
9986+
"const": "file",
9987+
"default": "file"
9988+
},
9989+
"file": {
9990+
"$ref": "#/components/schemas/OpenAIFileFile"
9991+
}
9992+
},
9993+
"additionalProperties": false,
9994+
"required": [
9995+
"type",
9996+
"file"
9997+
],
9998+
"title": "OpenAIFile"
9999+
},
10000+
"OpenAIFileFile": {
10001+
"type": "object",
10002+
"properties": {
10003+
"file_data": {
10004+
"type": "string"
10005+
},
10006+
"file_id": {
10007+
"type": "string"
10008+
},
10009+
"filename": {
10010+
"type": "string"
10011+
}
10012+
},
10013+
"additionalProperties": false,
10014+
"title": "OpenAIFileFile"
10015+
},
997710016
"OpenAIImageURL": {
997810017
"type": "object",
997910018
"properties": {

docs/_static/llama-stack-spec.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6934,11 +6934,13 @@ components:
69346934
oneOf:
69356935
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
69366936
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
6937+
- $ref: '#/components/schemas/OpenAIFile'
69376938
discriminator:
69386939
propertyName: type
69396940
mapping:
69406941
text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
69416942
image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
6943+
file: '#/components/schemas/OpenAIFile'
69426944
OpenAIChatCompletionContentPartTextParam:
69436945
type: object
69446946
properties:
@@ -7050,6 +7052,31 @@ components:
70507052
title: OpenAIDeveloperMessageParam
70517053
description: >-
70527054
A message from the developer in an OpenAI-compatible chat completion request.
7055+
OpenAIFile:
7056+
type: object
7057+
properties:
7058+
type:
7059+
type: string
7060+
const: file
7061+
default: file
7062+
file:
7063+
$ref: '#/components/schemas/OpenAIFileFile'
7064+
additionalProperties: false
7065+
required:
7066+
- type
7067+
- file
7068+
title: OpenAIFile
7069+
OpenAIFileFile:
7070+
type: object
7071+
properties:
7072+
file_data:
7073+
type: string
7074+
file_id:
7075+
type: string
7076+
filename:
7077+
type: string
7078+
additionalProperties: false
7079+
title: OpenAIFileFile
70537080
OpenAIImageURL:
70547081
type: object
70557082
properties:

llama_stack/apis/inference/inference.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -455,8 +455,21 @@ class OpenAIChatCompletionContentPartImageParam(BaseModel):
455455
image_url: OpenAIImageURL
456456

457457

458+
@json_schema_type
459+
class OpenAIFileFile(BaseModel):
460+
file_data: str | None = None
461+
file_id: str | None = None
462+
filename: str | None = None
463+
464+
465+
@json_schema_type
466+
class OpenAIFile(BaseModel):
467+
type: Literal["file"] = "file"
468+
file: OpenAIFileFile
469+
470+
458471
OpenAIChatCompletionContentPartParam = Annotated[
459-
OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
472+
OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam | OpenAIFile,
460473
Field(discriminator="type"),
461474
]
462475
register_schema(OpenAIChatCompletionContentPartParam, name="OpenAIChatCompletionContentPartParam")

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ test = [
114114
"sqlalchemy[asyncio]>=2.0.41",
115115
"requests",
116116
"pymilvus>=2.5.12",
117+
"reportlab",
117118
]
118119
docs = [
119120
"setuptools",

tests/integration/inference/test_openai_completion.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,14 @@
55
# the root directory of this source tree.
66

77

8+
import base64
9+
import os
10+
import tempfile
11+
812
import pytest
913
from openai import OpenAI
14+
from reportlab.lib.pagesizes import letter
15+
from reportlab.pdfgen import canvas
1016

1117
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
1218

@@ -82,6 +88,14 @@ def skip_if_provider_isnt_vllm(client_with_models, model_id):
8288
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support vllm extra_body parameters.")
8389

8490

91+
def skip_if_provider_isnt_openai(client_with_models, model_id):
92+
provider = provider_from_model(client_with_models, model_id)
93+
if provider.provider_type != "remote::openai":
94+
pytest.skip(
95+
f"Model {model_id} hosted by {provider.provider_type} doesn't support chat completion calls with base64 encoded files."
96+
)
97+
98+
8599
@pytest.fixture
86100
def openai_client(client_with_models):
87101
base_url = f"{client_with_models.base_url}/v1/openai/v1"
@@ -418,3 +432,45 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
418432
# failed tool call parses show up as a message with content, so ensure
419433
# that the retrieve response content matches the original request
420434
assert retrieved_response.choices[0].message.content == content
435+
436+
437+
def test_openai_chat_completion_non_streaming_with_file(openai_client, client_with_models, text_model_id):
438+
skip_if_provider_isnt_openai(client_with_models, text_model_id)
439+
440+
# Generate temporary PDF with "Hello World" text
441+
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
442+
c = canvas.Canvas(temp_pdf.name, pagesize=letter)
443+
c.drawString(100, 750, "Hello World")
444+
c.save()
445+
446+
# Read the PDF and encode it to base64
447+
with open(temp_pdf.name, "rb") as pdf_file:
448+
pdf_base64 = base64.b64encode(pdf_file.read()).decode("utf-8")
449+
450+
# Clean up temporary file
451+
os.unlink(temp_pdf.name)
452+
453+
response = openai_client.chat.completions.create(
454+
model=text_model_id,
455+
messages=[
456+
{
457+
"role": "user",
458+
"content": "Describe what you see in this PDF file.",
459+
},
460+
{
461+
"role": "user",
462+
"content": [
463+
{
464+
"type": "file",
465+
"file": {
466+
"filename": "my-temp-hello-world-pdf",
467+
"file_data": f"data:application/pdf;base64,{pdf_base64}",
468+
},
469+
}
470+
],
471+
},
472+
],
473+
stream=False,
474+
)
475+
message_content = response.choices[0].message.content.lower().strip()
476+
assert "hello world" in message_content

0 commit comments

Comments
 (0)