
Commit 579894d

feat: serve cached digest if available
1 parent a63ed9e commit 579894d
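
In short, process_query now resolves the remote commit SHA up front, checks S3 for an existing digest, and returns the cached response without cloning; only on a cache miss does it clone and ingest, then upload the digest (plus a small metadata JSON) for future hits. Below is a minimal, self-contained sketch of that cache-first flow — all names here are illustrative stubs, not the real gitingest/server helpers shown in the diff further down:

# Minimal sketch of the cache-first flow this commit introduces.
# Every helper here is a stand-in; the real code uses resolve_commit,
# check_s3_object_exists, get_metadata_from_s3, etc.
import asyncio

CACHE: dict[str, dict[str, str]] = {}  # stands in for the S3 bucket


async def resolve_commit_stub(repo_url: str) -> str:
    # The real code resolves the commit SHA via `git ls-remote`, without cloning.
    return "579894d"


async def ingest_with_cache(repo_url: str) -> dict[str, str]:
    # Key the cache on repo + commit, mirroring generate_s3_file_path in the diff.
    cache_key = f"{repo_url}@{await resolve_commit_stub(repo_url)}"
    if cache_key in CACHE:
        return CACHE[cache_key]  # cache hit: skip clone + ingestion entirely
    # Cache miss: do the expensive clone + ingestion, then store the digest.
    digest = {"summary": f"digest of {repo_url}", "tree": "(tree)", "content": "(content)"}
    CACHE[cache_key] = digest
    return digest


if __name__ == "__main__":
    asyncio.run(ingest_with_cache("https://github.com/example/repo"))   # miss: computed
    print(asyncio.run(ingest_with_cache("https://github.com/example/repo")))  # hit: cached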

File tree: 2 files changed, +404 -36 lines


src/server/query_processor.py

Lines changed: 218 additions & 36 deletions
@@ -2,19 +2,219 @@

 from __future__ import annotations

+import logging
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast

 from gitingest.clone import clone_repo
 from gitingest.ingestion import ingest_query
 from gitingest.query_parser import parse_remote_repo
-from gitingest.utils.git_utils import validate_github_token
+from gitingest.utils.git_utils import resolve_commit, validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
 from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
-from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
+from server.s3_utils import (
+    _build_s3_url,
+    check_s3_object_exists,
+    generate_s3_file_path,
+    get_metadata_from_s3,
+    is_s3_enabled,
+    upload_metadata_to_s3,
+    upload_to_s3,
+)
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import Colors

+if TYPE_CHECKING:
+    from gitingest.schemas.cloning import CloneConfig
+    from gitingest.schemas.ingestion import IngestionQuery
+
+logger = logging.getLogger(__name__)
+
+
+async def _check_s3_cache(
+    query: IngestionQuery,
+    input_text: str,
+    max_file_size: int,
+    pattern_type: str,
+    pattern: str,
+    token: str | None,
+) -> IngestSuccessResponse | None:
+    """Check if digest already exists on S3 and return response if found.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The parsed query object.
+    input_text : str
+        Original input text.
+    max_file_size : int
+        Maximum file size in KB.
+    pattern_type : str
+        Pattern type (include/exclude).
+    pattern : str
+        Pattern string.
+    token : str | None
+        GitHub token.
+
+    Returns
+    -------
+    IngestSuccessResponse | None
+        Response if file exists on S3, None otherwise.
+
+    """
+    if not is_s3_enabled():
+        return None
+
+    try:
+        # Use git ls-remote to get commit SHA without cloning
+        clone_config = query.extract_clone_config()
+        commit_sha = await resolve_commit(clone_config, token=token)
+        query.commit = commit_sha
+
+        # Generate S3 file path using the resolved commit
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=commit_sha,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+
+        # Check if file exists on S3
+        if check_s3_object_exists(s3_file_path):
+            # File exists on S3, serve it directly without cloning
+            s3_url = _build_s3_url(s3_file_path)
+            query.s3_url = s3_url
+
+            short_repo_url = f"{query.user_name}/{query.repo_name}"
+
+            # Try to get cached metadata
+            metadata = get_metadata_from_s3(s3_file_path)
+
+            if metadata:
+                # Use cached metadata if available
+                summary = metadata.get(
+                    "summary",
+                    "Digest served from cache (S3). Download the full digest to see content details.",
+                )
+                tree = metadata.get("tree", "Digest served from cache. Download the full digest to see the file tree.")
+                content = metadata.get(
+                    "content",
+                    "Digest served from cache. Download the full digest to see the content.",
+                )
+            else:
+                # Fallback to placeholder messages if metadata not available
+                summary = "Digest served from cache (S3). Download the full digest to see content details."
+                tree = "Digest served from cache. Download the full digest to see the file tree."
+                content = "Digest served from cache. Download the full digest to see the content."
+
+            return IngestSuccessResponse(
+                repo_url=input_text,
+                short_repo_url=short_repo_url,
+                summary=summary,
+                digest_url=s3_url,
+                tree=tree,
+                content=content,
+                default_max_file_size=max_file_size,
+                pattern_type=pattern_type,
+                pattern=pattern,
+            )
+    except Exception as exc:
+        # Log the exception but don't fail the entire request
+        logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
+
+    return None
+
+
+def _store_digest_content(
+    query: IngestionQuery,
+    clone_config: CloneConfig,
+    digest_content: str,
+    summary: str,
+    tree: str,
+    content: str,
+) -> None:
+    """Store digest content either to S3 or locally based on configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+    clone_config : CloneConfig
+        The clone configuration object.
+    digest_content : str
+        The complete digest content to store.
+    summary : str
+        The summary content for metadata.
+    tree : str
+        The tree content for metadata.
+    content : str
+        The file content for metadata.
+
+    """
+    if is_s3_enabled():
+        # Upload to S3 instead of storing locally
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=query.commit,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+        s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+
+        # Also upload metadata JSON for caching
+        metadata = {
+            "summary": summary,
+            "tree": tree,
+            "content": content,
+        }
+        try:
+            upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
+            logger.debug("Successfully uploaded metadata to S3")
+        except Exception as metadata_exc:
+            # Log the error but don't fail the entire request
+            logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
+
+        # Store S3 URL in query for later use
+        query.s3_url = s3_url
+    else:
+        # Store locally
+        local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+        with local_txt_file.open("w", encoding="utf-8") as f:
+            f.write(digest_content)
+
+
+def _generate_digest_url(query: IngestionQuery) -> str:
+    """Generate the digest URL based on S3 configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+
+    Returns
+    -------
+    str
+        The digest URL.
+
+    Raises
+    ------
+    RuntimeError
+        If S3 is enabled but no S3 URL was generated.
+
+    """
+    if is_s3_enabled():
+        digest_url = getattr(query, "s3_url", None)
+        if not digest_url:
+            # This should not happen if S3 upload was successful
+            msg = "S3 is enabled but no S3 URL was generated"
+            raise RuntimeError(msg)
+        return digest_url
+    return f"/api/download/file/{query.id}"
+

 async def process_query(
     input_text: str,
@@ -69,10 +269,22 @@ async def process_query(
         include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
     )

+    # Check if digest already exists on S3 before cloning
+    s3_response = await _check_s3_cache(
+        query=query,
+        input_text=input_text,
+        max_file_size=max_file_size,
+        pattern_type=pattern_type.value,
+        pattern=pattern,
+        token=token,
+    )
+    if s3_response:
+        return s3_response
+
     clone_config = query.extract_clone_config()
     await clone_repo(clone_config, token=token)

-    short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
+    short_repo_url = f"{query.user_name}/{query.repo_name}"

     # The commit hash should always be available at this point
     if not query.commit:
@@ -81,30 +293,8 @@ async def process_query(

     try:
         summary, tree, content = ingest_query(query)
-
-        # Prepare the digest content (tree + content)
         digest_content = tree + "\n" + content
-
-        # Store digest based on S3 configuration
-        if is_s3_enabled():
-            # Upload to S3 instead of storing locally
-            s3_file_path = generate_s3_file_path(
-                source=query.url,
-                user_name=cast("str", query.user_name),
-                repo_name=cast("str", query.repo_name),
-                commit=query.commit,
-                include_patterns=query.include_patterns,
-                ignore_patterns=query.ignore_patterns,
-            )
-            s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
-            # Store S3 URL in query for later use
-            query.s3_url = s3_url
-        else:
-            # Store locally
-            local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
-            with local_txt_file.open("w", encoding="utf-8") as f:
-                f.write(digest_content)
-
+        _store_digest_content(query, clone_config, digest_content, summary, tree, content)
     except Exception as exc:
         _print_error(query.url, exc, max_file_size, pattern_type, pattern)
         return IngestErrorResponse(error=str(exc))
@@ -123,15 +313,7 @@ async def process_query(
         summary=summary,
     )

-    # Generate digest_url based on S3 configuration
-    if is_s3_enabled():
-        digest_url = getattr(query, "s3_url", None)
-        if not digest_url:
-            # This should not happen if S3 upload was successful
-            msg = "S3 is enabled but no S3 URL was generated"
-            raise RuntimeError(msg)
-    else:
-        digest_url = f"/api/download/file/{query.id}"
+    digest_url = _generate_digest_url(query)

     return IngestSuccessResponse(
         repo_url=input_text,
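
The commit's second changed file (the server.s3_utils helpers imported above) is not shown in this view. For orientation only, here is one plausible boto3-based shape for check_s3_object_exists, get_metadata_from_s3, and upload_metadata_to_s3 — an assumption about how such helpers are commonly written, not the code from this commit; the bucket environment variable and the ".meta.json" key suffix are invented for the sketch:

# Hypothetical boto3-based shapes for the helpers used above; the actual
# server.s3_utils implementation in this commit may differ.
import json
import os

import boto3
from botocore.exceptions import ClientError

_s3 = boto3.client("s3")
_BUCKET = os.environ.get("S3_BUCKET_NAME", "gitingest-digests")  # assumed setting name


def check_s3_object_exists(s3_file_path: str) -> bool:
    """Return True if the digest object is already present in the bucket."""
    try:
        _s3.head_object(Bucket=_BUCKET, Key=s3_file_path)
    except ClientError:
        return False
    return True


def get_metadata_from_s3(s3_file_path: str) -> dict | None:
    """Fetch the cached summary/tree/content JSON stored next to the digest."""
    try:
        obj = _s3.get_object(Bucket=_BUCKET, Key=f"{s3_file_path}.meta.json")
        return json.loads(obj["Body"].read())
    except ClientError:
        return None


def upload_metadata_to_s3(metadata: dict, s3_file_path: str, ingest_id: str) -> None:
    """Store the metadata JSON alongside the digest object."""
    _s3.put_object(
        Bucket=_BUCKET,
        Key=f"{s3_file_path}.meta.json",
        Body=json.dumps(metadata).encode("utf-8"),
        ContentType="application/json",
        Metadata={"ingest-id": str(ingest_id)},
    )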
