 
 from __future__ import annotations
 
+import logging
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 from gitingest.clone import clone_repo
 from gitingest.ingestion import ingest_query
 from gitingest.query_parser import parse_remote_repo
-from gitingest.utils.git_utils import validate_github_token
+from gitingest.utils.git_utils import resolve_commit, validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
 from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
-from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
+from server.s3_utils import (
+    _build_s3_url,
+    check_s3_object_exists,
+    generate_s3_file_path,
+    get_metadata_from_s3,
+    is_s3_enabled,
+    upload_metadata_to_s3,
+    upload_to_s3,
+)
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import Colors, log_slider_to_size
 
+if TYPE_CHECKING:
+    from gitingest.schemas.cloning import CloneConfig
+    from gitingest.schemas.ingestion import IngestionQuery
+
+logger = logging.getLogger(__name__)
+
+
+async def _check_s3_cache(
+    query: IngestionQuery,
+    input_text: str,
+    slider_position: int,
+    pattern_type: str,
+    pattern: str,
+    token: str | None,
+) -> IngestSuccessResponse | None:
+    """Check whether a digest already exists on S3 and return a response if found.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The parsed query object.
+    input_text : str
+        The original input text.
+    slider_position : int
+        Slider position for the file size limit.
+    pattern_type : str
+        Pattern type (include/exclude).
+    pattern : str
+        Pattern string.
+    token : str | None
+        GitHub token.
+
+    Returns
+    -------
+    IngestSuccessResponse | None
+        The response if the digest exists on S3, None otherwise.
+
+    """
+    if not is_s3_enabled():
+        return None
+
+    try:
+        # Use git ls-remote to get the commit SHA without cloning
+        clone_config = query.extract_clone_config()
+        commit_sha = await resolve_commit(clone_config, token=token)
+        query.commit = commit_sha
+
+        # Generate the S3 file path using the resolved commit
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=commit_sha,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+
+        # Check whether the digest already exists on S3
+        if check_s3_object_exists(s3_file_path):
+            # The file exists on S3; serve it directly without cloning
+            s3_url = _build_s3_url(s3_file_path)
+            query.s3_url = s3_url
+
+            short_repo_url = f"{query.user_name}/{query.repo_name}"
+
+            # Try to fetch cached metadata
+            metadata = get_metadata_from_s3(s3_file_path)
+
+            if metadata:
+                # Use the cached metadata if available
+                summary = metadata.get(
+                    "summary",
+                    "Digest served from cache (S3). Download the full digest to see content details.",
+                )
+                tree = metadata.get("tree", "Digest served from cache. Download the full digest to see the file tree.")
+                content = metadata.get(
+                    "content",
+                    "Digest served from cache. Download the full digest to see the content.",
+                )
+            else:
+                # Fall back to placeholder messages if no metadata is available
+                summary = "Digest served from cache (S3). Download the full digest to see content details."
+                tree = "Digest served from cache. Download the full digest to see the file tree."
+                content = "Digest served from cache. Download the full digest to see the content."
+
+            return IngestSuccessResponse(
+                repo_url=input_text,
+                short_repo_url=short_repo_url,
+                summary=summary,
+                digest_url=s3_url,
+                tree=tree,
+                content=content,
+                default_max_file_size=slider_position,
+                pattern_type=pattern_type,
+                pattern=pattern,
+            )
+    except Exception as exc:
+        # Log the exception but don't fail the entire request
+        logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
+
+    return None
+
+
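The key move in `_check_s3_cache` is resolving the commit before any clone, so the S3 key can be computed up front. A minimal standalone sketch of that step, assuming `resolve_commit` ultimately shells out to `git ls-remote` as the comment suggests (the helper name `resolve_head_sha` and the example URL are illustrative, not part of this change):

```python
import asyncio


async def resolve_head_sha(repo_url: str) -> str:
    """Resolve the commit SHA of the remote HEAD without cloning."""
    proc = await asyncio.create_subprocess_exec(
        "git", "ls-remote", repo_url, "HEAD",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(stderr.decode().strip())
    # `git ls-remote <url> HEAD` prints "<sha>\tHEAD"
    return stdout.decode().split()[0]


# Example: asyncio.run(resolve_head_sha("https://github.com/coderamp-labs/gitingest"))
```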
+def _store_digest_content(
+    query: IngestionQuery,
+    clone_config: CloneConfig,
+    digest_content: str,
+    summary: str,
+    tree: str,
+    content: str,
+) -> None:
+    """Store digest content either to S3 or locally, based on configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+    clone_config : CloneConfig
+        The clone configuration object.
+    digest_content : str
+        The complete digest content to store.
+    summary : str
+        The summary content for metadata.
+    tree : str
+        The tree content for metadata.
+    content : str
+        The file content for metadata.
+
+    """
+    if is_s3_enabled():
+        # Upload to S3 instead of storing locally
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=query.commit,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+        s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+
+        # Also upload metadata JSON for caching
+        metadata = {
+            "summary": summary,
+            "tree": tree,
+            "content": content,
+        }
+        try:
+            upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
+            logger.debug("Successfully uploaded metadata to S3")
+        except Exception as metadata_exc:
+            # Log the error but don't fail the entire request
+            logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
+
+        # Store the S3 URL in the query for later use
+        query.s3_url = s3_url
+    else:
+        # Store locally
+        local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+        with local_txt_file.open("w", encoding="utf-8") as f:
+            f.write(digest_content)
+
+
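`_store_digest_content` writes two objects per digest: the digest text itself and a small JSON sidecar that `_check_s3_cache` reads back later. A sketch of that pairing with plain boto3, under an assumed bucket name and key layout (the PR's real `upload_to_s3`/`upload_metadata_to_s3` helpers live in `server.s3_utils` and may differ):

```python
import json

import boto3

s3 = boto3.client("s3")
BUCKET = "gitingest-digests"  # assumed bucket name


def store_digest_pair(key: str, digest: str, summary: str, tree: str, content: str) -> None:
    # The digest itself, later served as the downloadable .txt
    s3.put_object(Bucket=BUCKET, Key=f"{key}.txt", Body=digest.encode("utf-8"))
    # The metadata sidecar consulted by the cache check before any clone
    sidecar = {"summary": summary, "tree": tree, "content": content}
    s3.put_object(
        Bucket=BUCKET,
        Key=f"{key}.meta.json",
        Body=json.dumps(sidecar).encode("utf-8"),
        ContentType="application/json",
    )
```

Storing summary/tree/content separately from the digest keeps the cached response cheap: a cache hit needs only the tiny JSON object, not the full digest body.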
+def _generate_digest_url(query: IngestionQuery) -> str:
+    """Generate the digest URL based on S3 configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+
+    Returns
+    -------
+    str
+        The digest URL.
+
+    Raises
+    ------
+    RuntimeError
+        If S3 is enabled but no S3 URL was generated.
+
+    """
+    if is_s3_enabled():
+        digest_url = getattr(query, "s3_url", None)
+        if not digest_url:
+            # This should not happen if the S3 upload was successful
+            msg = "S3 is enabled but no S3 URL was generated"
+            raise RuntimeError(msg)
+        return digest_url
+    return f"/api/download/file/{query.id}"
+
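Because `_generate_digest_url` is now a small pure-ish helper, its local fallback path is easy to pin down in a test. A hypothetical pytest sketch, assuming the module imports as `server.query_processor` (the module path and test name are illustrative):

```python
import uuid
from unittest.mock import MagicMock

import server.query_processor as qp  # assumed module path


def test_digest_url_falls_back_to_local_route(monkeypatch):
    # Force the non-S3 branch, regardless of environment configuration
    monkeypatch.setattr(qp, "is_s3_enabled", lambda: False)
    query = MagicMock()
    query.id = uuid.uuid4()
    assert qp._generate_digest_url(query) == f"/api/download/file/{query.id}"
```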
 
 async def process_query(
     input_text: str,
@@ -71,10 +271,22 @@ async def process_query(
         include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
     )
 
+    # Check if the digest already exists on S3 before cloning
+    s3_response = await _check_s3_cache(
+        query=query,
+        input_text=input_text,
+        slider_position=slider_position,
+        pattern_type=pattern_type.value,
+        pattern=pattern,
+        token=token,
+    )
+    if s3_response:
+        return s3_response
+
     clone_config = query.extract_clone_config()
     await clone_repo(clone_config, token=token)
 
-    short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
+    short_repo_url = f"{query.user_name}/{query.repo_name}"
 
     # The commit hash should always be available at this point
     if not query.commit:
@@ -83,30 +295,8 @@ async def process_query(
 
     try:
         summary, tree, content = ingest_query(query)
-
-        # Prepare the digest content (tree + content)
         digest_content = tree + "\n" + content
-
-        # Store digest based on S3 configuration
-        if is_s3_enabled():
-            # Upload to S3 instead of storing locally
-            s3_file_path = generate_s3_file_path(
-                source=query.url,
-                user_name=cast("str", query.user_name),
-                repo_name=cast("str", query.repo_name),
-                commit=query.commit,
-                include_patterns=query.include_patterns,
-                ignore_patterns=query.ignore_patterns,
-            )
-            s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
-            # Store S3 URL in query for later use
-            query.s3_url = s3_url
-        else:
-            # Store locally
-            local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
-            with local_txt_file.open("w", encoding="utf-8") as f:
-                f.write(digest_content)
-
+        _store_digest_content(query, clone_config, digest_content, summary, tree, content)
     except Exception as exc:
         _print_error(query.url, exc, max_file_size, pattern_type, pattern)
         return IngestErrorResponse(error=str(exc))
@@ -125,15 +315,7 @@ async def process_query(
         summary=summary,
     )
 
-    # Generate digest_url based on S3 configuration
-    if is_s3_enabled():
-        digest_url = getattr(query, "s3_url", None)
-        if not digest_url:
-            # This should not happen if S3 upload was successful
-            msg = "S3 is enabled but no S3 URL was generated"
-            raise RuntimeError(msg)
-    else:
-        digest_url = f"/api/download/file/{query.id}"
+    digest_url = _generate_digest_url(query)
 
     return IngestSuccessResponse(
         repo_url=input_text,