Skip to content

Commit 870a37f

Browse files
feat: add base64 encoded PDF support for OpenAI Chat Completions (llamastack#2881)
# What does this PR do? OpenAI Chat Completions supports passing a base64 encoded PDF file to a model, but Llama Stack currently does not allow for this behavior. This PR extends our implementation of the OpenAI API spec to change that. Closes llamastack#2129 ## Test Plan A new functional test has been added to test the validity of such a request Signed-off-by: Nathan Weinberg <[email protected]>
1 parent cf87220 commit 870a37f

File tree

6 files changed

+1551
-1237
lines changed

6 files changed

+1551
-1237
lines changed

docs/_static/llama-stack-spec.html

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9821,13 +9821,17 @@
98219821
},
98229822
{
98239823
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
9824+
},
9825+
{
9826+
"$ref": "#/components/schemas/OpenAIFile"
98249827
}
98259828
],
98269829
"discriminator": {
98279830
"propertyName": "type",
98289831
"mapping": {
98299832
"text": "#/components/schemas/OpenAIChatCompletionContentPartTextParam",
9830-
"image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
9833+
"image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam",
9834+
"file": "#/components/schemas/OpenAIFile"
98319835
}
98329836
}
98339837
},
@@ -9974,6 +9978,41 @@
99749978
"title": "OpenAIDeveloperMessageParam",
99759979
"description": "A message from the developer in an OpenAI-compatible chat completion request."
99769980
},
9981+
"OpenAIFile": {
9982+
"type": "object",
9983+
"properties": {
9984+
"type": {
9985+
"type": "string",
9986+
"const": "file",
9987+
"default": "file"
9988+
},
9989+
"file": {
9990+
"$ref": "#/components/schemas/OpenAIFileFile"
9991+
}
9992+
},
9993+
"additionalProperties": false,
9994+
"required": [
9995+
"type",
9996+
"file"
9997+
],
9998+
"title": "OpenAIFile"
9999+
},
10000+
"OpenAIFileFile": {
10001+
"type": "object",
10002+
"properties": {
10003+
"file_data": {
10004+
"type": "string"
10005+
},
10006+
"file_id": {
10007+
"type": "string"
10008+
},
10009+
"filename": {
10010+
"type": "string"
10011+
}
10012+
},
10013+
"additionalProperties": false,
10014+
"title": "OpenAIFileFile"
10015+
},
997710016
"OpenAIImageURL": {
997810017
"type": "object",
997910018
"properties": {

docs/_static/llama-stack-spec.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6934,11 +6934,13 @@ components:
69346934
oneOf:
69356935
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
69366936
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
6937+
- $ref: '#/components/schemas/OpenAIFile'
69376938
discriminator:
69386939
propertyName: type
69396940
mapping:
69406941
text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
69416942
image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
6943+
file: '#/components/schemas/OpenAIFile'
69426944
OpenAIChatCompletionContentPartTextParam:
69436945
type: object
69446946
properties:
@@ -7050,6 +7052,31 @@ components:
70507052
title: OpenAIDeveloperMessageParam
70517053
description: >-
70527054
A message from the developer in an OpenAI-compatible chat completion request.
7055+
OpenAIFile:
7056+
type: object
7057+
properties:
7058+
type:
7059+
type: string
7060+
const: file
7061+
default: file
7062+
file:
7063+
$ref: '#/components/schemas/OpenAIFileFile'
7064+
additionalProperties: false
7065+
required:
7066+
- type
7067+
- file
7068+
title: OpenAIFile
7069+
OpenAIFileFile:
7070+
type: object
7071+
properties:
7072+
file_data:
7073+
type: string
7074+
file_id:
7075+
type: string
7076+
filename:
7077+
type: string
7078+
additionalProperties: false
7079+
title: OpenAIFileFile
70537080
OpenAIImageURL:
70547081
type: object
70557082
properties:

llama_stack/apis/inference/inference.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -455,8 +455,21 @@ class OpenAIChatCompletionContentPartImageParam(BaseModel):
455455
image_url: OpenAIImageURL
456456

457457

458+
@json_schema_type
459+
class OpenAIFileFile(BaseModel):
460+
file_data: str | None = None
461+
file_id: str | None = None
462+
filename: str | None = None
463+
464+
465+
@json_schema_type
466+
class OpenAIFile(BaseModel):
467+
type: Literal["file"] = "file"
468+
file: OpenAIFileFile
469+
470+
458471
OpenAIChatCompletionContentPartParam = Annotated[
459-
OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
472+
OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam | OpenAIFile,
460473
Field(discriminator="type"),
461474
]
462475
register_schema(OpenAIChatCompletionContentPartParam, name="OpenAIChatCompletionContentPartParam")

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ test = [
114114
"sqlalchemy[asyncio]>=2.0.41",
115115
"requests",
116116
"pymilvus>=2.5.12",
117+
"reportlab",
117118
]
118119
docs = [
119120
"setuptools",

tests/integration/inference/test_openai_completion.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,14 @@
55
# the root directory of this source tree.
66

77

8+
import base64
9+
import os
10+
import tempfile
11+
812
import pytest
913
from openai import OpenAI
14+
from reportlab.lib.pagesizes import letter
15+
from reportlab.pdfgen import canvas
1016

1117
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
1218

@@ -82,6 +88,14 @@ def skip_if_provider_isnt_vllm(client_with_models, model_id):
8288
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support vllm extra_body parameters.")
8389

8490

91+
def skip_if_provider_isnt_openai(client_with_models, model_id):
92+
provider = provider_from_model(client_with_models, model_id)
93+
if provider.provider_type != "remote::openai":
94+
pytest.skip(
95+
f"Model {model_id} hosted by {provider.provider_type} doesn't support chat completion calls with base64 encoded files."
96+
)
97+
98+
8599
@pytest.fixture
86100
def openai_client(client_with_models):
87101
base_url = f"{client_with_models.base_url}/v1/openai/v1"
@@ -418,3 +432,45 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
418432
# failed tool call parses show up as a message with content, so ensure
419433
# that the retrieve response content matches the original request
420434
assert retrieved_response.choices[0].message.content == content
435+
436+
437+
def test_openai_chat_completion_non_streaming_with_file(openai_client, client_with_models, text_model_id):
438+
skip_if_provider_isnt_openai(client_with_models, text_model_id)
439+
440+
# Generate temporary PDF with "Hello World" text
441+
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
442+
c = canvas.Canvas(temp_pdf.name, pagesize=letter)
443+
c.drawString(100, 750, "Hello World")
444+
c.save()
445+
446+
# Read the PDF and encode it to base64
447+
with open(temp_pdf.name, "rb") as pdf_file:
448+
pdf_base64 = base64.b64encode(pdf_file.read()).decode("utf-8")
449+
450+
# Clean up temporary file
451+
os.unlink(temp_pdf.name)
452+
453+
response = openai_client.chat.completions.create(
454+
model=text_model_id,
455+
messages=[
456+
{
457+
"role": "user",
458+
"content": "Describe what you see in this PDF file.",
459+
},
460+
{
461+
"role": "user",
462+
"content": [
463+
{
464+
"type": "file",
465+
"file": {
466+
"filename": "my-temp-hello-world-pdf",
467+
"file_data": f"data:application/pdf;base64,{pdf_base64}",
468+
},
469+
}
470+
],
471+
},
472+
],
473+
stream=False,
474+
)
475+
message_content = response.choices[0].message.content.lower().strip()
476+
assert "hello world" in message_content

0 commit comments

Comments
 (0)