Commit 74e5c11
feat: serve cached digest if available
- Added methods to upload metadata alongside digest files to S3.
- Implemented S3-based digest caching mechanism for improved efficiency.
- Refactored digest storage logic to support both S3 and local storage.
1 parent 148f171 commit 74e5c11

File tree: 2 files changed (+404, -36 lines)
src/server/query_processor.py

Lines changed: 218 additions & 36 deletions
@@ -2,19 +2,219 @@
 
 from __future__ import annotations
 
+import logging
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 from gitingest.clone import clone_repo
 from gitingest.ingestion import ingest_query
 from gitingest.query_parser import parse_remote_repo
-from gitingest.utils.git_utils import validate_github_token
+from gitingest.utils.git_utils import resolve_commit, validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
 from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
-from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
+from server.s3_utils import (
+    _build_s3_url,
+    check_s3_object_exists,
+    generate_s3_file_path,
+    get_metadata_from_s3,
+    is_s3_enabled,
+    upload_metadata_to_s3,
+    upload_to_s3,
+)
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import Colors, log_slider_to_size
 
+if TYPE_CHECKING:
+    from gitingest.schemas.cloning import CloneConfig
+    from gitingest.schemas.ingestion import IngestionQuery
+
+logger = logging.getLogger(__name__)
+
+
+async def _check_s3_cache(
+    query: IngestionQuery,
+    input_text: str,
+    slider_position: int,
+    pattern_type: str,
+    pattern: str,
+    token: str | None,
+) -> IngestSuccessResponse | None:
+    """Check if digest already exists on S3 and return response if found.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The parsed query object.
+    input_text : str
+        Original input text.
+    slider_position : int
+        Slider position for file size.
+    pattern_type : str
+        Pattern type (include/exclude).
+    pattern : str
+        Pattern string.
+    token : str | None
+        GitHub token.
+
+    Returns
+    -------
+    IngestSuccessResponse | None
+        Response if file exists on S3, None otherwise.
+
+    """
+    if not is_s3_enabled():
+        return None
+
+    try:
+        # Use git ls-remote to get commit SHA without cloning
+        clone_config = query.extract_clone_config()
+        commit_sha = await resolve_commit(clone_config, token=token)
+        query.commit = commit_sha
+
+        # Generate S3 file path using the resolved commit
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=commit_sha,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+
+        # Check if file exists on S3
+        if check_s3_object_exists(s3_file_path):
+            # File exists on S3, serve it directly without cloning
+            s3_url = _build_s3_url(s3_file_path)
+            query.s3_url = s3_url
+
+            short_repo_url = f"{query.user_name}/{query.repo_name}"
+
+            # Try to get cached metadata
+            metadata = get_metadata_from_s3(s3_file_path)
+
+            if metadata:
+                # Use cached metadata if available
+                summary = metadata.get(
+                    "summary",
+                    "Digest served from cache (S3). Download the full digest to see content details.",
+                )
+                tree = metadata.get("tree", "Digest served from cache. Download the full digest to see the file tree.")
+                content = metadata.get(
+                    "content",
+                    "Digest served from cache. Download the full digest to see the content.",
+                )
+            else:
+                # Fallback to placeholder messages if metadata not available
+                summary = "Digest served from cache (S3). Download the full digest to see content details."
+                tree = "Digest served from cache. Download the full digest to see the file tree."
+                content = "Digest served from cache. Download the full digest to see the content."
+
+            return IngestSuccessResponse(
+                repo_url=input_text,
+                short_repo_url=short_repo_url,
+                summary=summary,
+                digest_url=s3_url,
+                tree=tree,
+                content=content,
+                default_max_file_size=slider_position,
+                pattern_type=pattern_type,
+                pattern=pattern,
+            )
+    except Exception as exc:
+        # Log the exception but don't fail the entire request
+        logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
+
+    return None
+
+
+def _store_digest_content(
+    query: IngestionQuery,
+    clone_config: CloneConfig,
+    digest_content: str,
+    summary: str,
+    tree: str,
+    content: str,
+) -> None:
+    """Store digest content either to S3 or locally based on configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+    clone_config : CloneConfig
+        The clone configuration object.
+    digest_content : str
+        The complete digest content to store.
+    summary : str
+        The summary content for metadata.
+    tree : str
+        The tree content for metadata.
+    content : str
+        The file content for metadata.
+
+    """
+    if is_s3_enabled():
+        # Upload to S3 instead of storing locally
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=query.commit,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+        s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+
+        # Also upload metadata JSON for caching
+        metadata = {
+            "summary": summary,
+            "tree": tree,
+            "content": content,
+        }
+        try:
+            upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
+            logger.debug("Successfully uploaded metadata to S3")
+        except Exception as metadata_exc:
+            # Log the error but don't fail the entire request
+            logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
+
+        # Store S3 URL in query for later use
+        query.s3_url = s3_url
+    else:
+        # Store locally
+        local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+        with local_txt_file.open("w", encoding="utf-8") as f:
+            f.write(digest_content)
+
+
+def _generate_digest_url(query: IngestionQuery) -> str:
+    """Generate the digest URL based on S3 configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+
+    Returns
+    -------
+    str
+        The digest URL.
+
+    Raises
+    ------
+    RuntimeError
+        If S3 is enabled but no S3 URL was generated.
+
+    """
+    if is_s3_enabled():
+        digest_url = getattr(query, "s3_url", None)
+        if not digest_url:
+            # This should not happen if S3 upload was successful
+            msg = "S3 is enabled but no S3 URL was generated"
+            raise RuntimeError(msg)
+        return digest_url
+    return f"/api/download/file/{query.id}"
+
 
 async def process_query(
     input_text: str,
@@ -71,10 +271,22 @@ async def process_query(
         include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
     )
 
+    # Check if digest already exists on S3 before cloning
+    s3_response = await _check_s3_cache(
+        query=query,
+        input_text=input_text,
+        slider_position=slider_position,
+        pattern_type=pattern_type.value,
+        pattern=pattern,
+        token=token,
+    )
+    if s3_response:
+        return s3_response
+
     clone_config = query.extract_clone_config()
     await clone_repo(clone_config, token=token)
 
-    short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
+    short_repo_url = f"{query.user_name}/{query.repo_name}"
 
     # The commit hash should always be available at this point
    if not query.commit:
@@ -83,30 +295,8 @@ async def process_query(
 
     try:
         summary, tree, content = ingest_query(query)
-
-        # Prepare the digest content (tree + content)
         digest_content = tree + "\n" + content
-
-        # Store digest based on S3 configuration
-        if is_s3_enabled():
-            # Upload to S3 instead of storing locally
-            s3_file_path = generate_s3_file_path(
-                source=query.url,
-                user_name=cast("str", query.user_name),
-                repo_name=cast("str", query.repo_name),
-                commit=query.commit,
-                include_patterns=query.include_patterns,
-                ignore_patterns=query.ignore_patterns,
-            )
-            s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
-            # Store S3 URL in query for later use
-            query.s3_url = s3_url
-        else:
-            # Store locally
-            local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
-            with local_txt_file.open("w", encoding="utf-8") as f:
-                f.write(digest_content)
-
+        _store_digest_content(query, clone_config, digest_content, summary, tree, content)
     except Exception as exc:
         _print_error(query.url, exc, max_file_size, pattern_type, pattern)
         return IngestErrorResponse(error=str(exc))
@@ -125,15 +315,7 @@ async def process_query(
         summary=summary,
     )
 
-    # Generate digest_url based on S3 configuration
-    if is_s3_enabled():
-        digest_url = getattr(query, "s3_url", None)
-        if not digest_url:
-            # This should not happen if S3 upload was successful
-            msg = "S3 is enabled but no S3 URL was generated"
-            raise RuntimeError(msg)
-    else:
-        digest_url = f"/api/download/file/{query.id}"
+    digest_url = _generate_digest_url(query)
 
     return IngestSuccessResponse(
         repo_url=input_text,
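
The cache check added above avoids a clone by resolving the remote commit SHA first (per the "git ls-remote" comment in _check_s3_cache). As a standalone illustration of that step only, a ls-remote based resolver could look like the sketch below; the project's actual gitingest.utils.git_utils.resolve_commit takes a CloneConfig and a token and may work differently.

    # Hypothetical sketch, not the project's resolve_commit.
    import asyncio


    async def resolve_commit_sketch(repo_url: str, ref: str = "HEAD") -> str:
        """Return the commit SHA for a ref via git ls-remote, without cloning."""
        proc = await asyncio.create_subprocess_exec(
            "git", "ls-remote", repo_url, ref,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(stderr.decode().strip() or "git ls-remote failed")
        if not stdout.strip():
            raise ValueError(f"ref {ref!r} not found on {repo_url}")
        # Output format: "<sha>\t<ref>"; take the SHA of the first line.
        return stdout.decode().split()[0]

    # Example: asyncio.run(resolve_commit_sketch("https://github.com/user/repo"))

On a cache hit, process_query returns the cached response before ever reaching clone_repo; if the check fails for any reason, the warning is logged and the normal clone-and-ingest path runs as before.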
