 
 from __future__ import annotations
 
+import logging
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 from gitingest.clone import clone_repo
 from gitingest.ingestion import ingest_query
 from gitingest.query_parser import parse_remote_repo
-from gitingest.utils.git_utils import validate_github_token
+from gitingest.utils.git_utils import resolve_commit, validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
 from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
-from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
+from server.s3_utils import (
+    _build_s3_url,
+    check_s3_object_exists,
+    generate_s3_file_path,
+    get_metadata_from_s3,
+    is_s3_enabled,
+    upload_metadata_to_s3,
+    upload_to_s3,
+)
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import Colors
 
+if TYPE_CHECKING:
+    from gitingest.schemas.cloning import CloneConfig
+    from gitingest.schemas.ingestion import IngestionQuery
+
+logger = logging.getLogger(__name__)
+
+
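+# The three helpers below implement an S3-backed digest cache:
+# _check_s3_cache() tries to serve a previously uploaded digest before any
+# clone happens, _store_digest_content() uploads the digest plus a metadata
+# JSON (or writes to disk when S3 is disabled), and _generate_digest_url()
+# picks the matching download URL.
+
+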
+async def _check_s3_cache(
+    query: IngestionQuery,
+    input_text: str,
+    max_file_size: int,
+    pattern_type: str,
+    pattern: str,
+    token: str | None,
+) -> IngestSuccessResponse | None:
+    """Check whether the digest already exists on S3 and return a cached response if found.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The parsed query object.
+    input_text : str
+        The original input text.
+    max_file_size : int
+        Maximum file size in KB.
+    pattern_type : str
+        Pattern type (include/exclude).
+    pattern : str
+        Pattern string.
+    token : str | None
+        GitHub token.
+
+    Returns
+    -------
+    IngestSuccessResponse | None
+        The response if the digest exists on S3, None otherwise.
+
+    """
+    if not is_s3_enabled():
+        return None
+
+    try:
+        # Use git ls-remote to get commit SHA without cloning
+        clone_config = query.extract_clone_config()
+        commit_sha = await resolve_commit(clone_config, token=token)
+        query.commit = commit_sha
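+        # Pinning the ref to an exact SHA keeps the cache key stable: a new
+        # push to the same branch resolves to a different commit, misses the
+        # cache, and therefore produces a fresh digest.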
+
+        # Generate S3 file path using the resolved commit
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=commit_sha,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
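+        # The key derived above encodes the repo, the resolved commit, and the
+        # include/ignore patterns, so the same repository filtered differently
+        # maps to a separate cache entry.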
+
+        # Check if file exists on S3
+        if check_s3_object_exists(s3_file_path):
+            # File exists on S3, serve it directly without cloning
+            s3_url = _build_s3_url(s3_file_path)
+            query.s3_url = s3_url
+
+            short_repo_url = f"{query.user_name}/{query.repo_name}"
+
+            # Try to get cached metadata
+            metadata = get_metadata_from_s3(s3_file_path)
+
+            if metadata:
+                # Use cached metadata if available
+                summary = metadata.get(
+                    "summary",
+                    "Digest served from cache (S3). Download the full digest to see content details.",
+                )
+                tree = metadata.get(
+                    "tree",
+                    "Digest served from cache. Download the full digest to see the file tree.",
+                )
+                content = metadata.get(
+                    "content",
+                    "Digest served from cache. Download the full digest to see the content.",
+                )
+            else:
+                # Fall back to placeholder messages if metadata is not available
+                summary = "Digest served from cache (S3). Download the full digest to see content details."
+                tree = "Digest served from cache. Download the full digest to see the file tree."
+                content = "Digest served from cache. Download the full digest to see the content."
+
+            return IngestSuccessResponse(
+                repo_url=input_text,
+                short_repo_url=short_repo_url,
+                summary=summary,
+                digest_url=s3_url,
+                tree=tree,
+                content=content,
+                default_max_file_size=max_file_size,
+                pattern_type=pattern_type,
+                pattern=pattern,
+            )
+    except Exception as exc:
+        # Log the exception but don't fail the entire request
+        logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
+
+    return None
+
+
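+# Usage sketch for the cache check (hypothetical call site; argument values
+# are illustrative, not taken from this diff):
+#
+#     cached = await _check_s3_cache(
+#         query=query,
+#         input_text="https://github.com/user/repo",
+#         max_file_size=50,
+#         pattern_type="exclude",
+#         pattern="*.lock",
+#         token=None,
+#     )
+#     if cached is not None:
+#         return cached  # digest served from S3; no clone performed
+
+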
+def _store_digest_content(
+    query: IngestionQuery,
+    clone_config: CloneConfig,
+    digest_content: str,
+    summary: str,
+    tree: str,
+    content: str,
+) -> None:
+    """Store digest content either to S3 or locally based on configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+    clone_config : CloneConfig
+        The clone configuration object.
+    digest_content : str
+        The complete digest content to store.
+    summary : str
+        The summary content for metadata.
+    tree : str
+        The tree content for metadata.
+    content : str
+        The file content for metadata.
+
+    """
+    if is_s3_enabled():
+        # Upload to S3 instead of storing locally
+        s3_file_path = generate_s3_file_path(
+            source=query.url,
+            user_name=cast("str", query.user_name),
+            repo_name=cast("str", query.repo_name),
+            commit=query.commit,
+            include_patterns=query.include_patterns,
+            ignore_patterns=query.ignore_patterns,
+        )
+        s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+
+        # Also upload metadata JSON for caching
+        metadata = {
+            "summary": summary,
+            "tree": tree,
+            "content": content,
+        }
+        try:
+            upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
+            logger.debug("Successfully uploaded metadata to S3")
+        except Exception as metadata_exc:
+            # Log the error but don't fail the entire request
+            logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
+
+        # Store S3 URL in query for later use
+        query.s3_url = s3_url
+    else:
+        # Store locally
+        local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
+        with local_txt_file.open("w", encoding="utf-8") as f:
+            f.write(digest_content)
+
+
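+# Cache payload sketch: the metadata JSON uploaded alongside the digest
+# mirrors the three response sections, e.g. (illustrative values):
+#
+#     {"summary": "...", "tree": "...", "content": "..."}
+#
+# so a later hit in _check_s3_cache() can rebuild an IngestSuccessResponse
+# without cloning the repository.
+
+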
+def _generate_digest_url(query: IngestionQuery) -> str:
+    """Generate the digest URL based on S3 configuration.
+
+    Parameters
+    ----------
+    query : IngestionQuery
+        The query object containing repository information.
+
+    Returns
+    -------
+    str
+        The digest URL.
+
+    Raises
+    ------
+    RuntimeError
+        If S3 is enabled but no S3 URL was generated.
+
+    """
+    if is_s3_enabled():
+        digest_url = getattr(query, "s3_url", None)
+        if not digest_url:
+            # This should not happen if S3 upload was successful
+            msg = "S3 is enabled but no S3 URL was generated"
+            raise RuntimeError(msg)
+        return digest_url
+    return f"/api/download/file/{query.id}"
+
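+# Design note: `s3_url` is attached to the query only after a successful
+# upload in _store_digest_content(), so its absence while S3 is enabled is
+# treated as a hard error here rather than silently falling back to a local
+# download path.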
 
 async def process_query(
     input_text: str,
@@ -69,10 +269,22 @@ async def process_query(
         include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
     )
 
+    # Check if digest already exists on S3 before cloning
+    s3_response = await _check_s3_cache(
+        query=query,
+        input_text=input_text,
+        max_file_size=max_file_size,
+        pattern_type=pattern_type.value,
+        pattern=pattern,
+        token=token,
+    )
+    if s3_response:
+        return s3_response
+
     clone_config = query.extract_clone_config()
     await clone_repo(clone_config, token=token)
 
-    short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
+    short_repo_url = f"{query.user_name}/{query.repo_name}"
 
     # The commit hash should always be available at this point
     if not query.commit:
@@ -81,30 +293,8 @@ async def process_query(
 
     try:
         summary, tree, content = ingest_query(query)
-
-        # Prepare the digest content (tree + content)
         digest_content = tree + "\n" + content
-
-        # Store digest based on S3 configuration
-        if is_s3_enabled():
-            # Upload to S3 instead of storing locally
-            s3_file_path = generate_s3_file_path(
-                source=query.url,
-                user_name=cast("str", query.user_name),
-                repo_name=cast("str", query.repo_name),
-                commit=query.commit,
-                include_patterns=query.include_patterns,
-                ignore_patterns=query.ignore_patterns,
-            )
-            s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
-            # Store S3 URL in query for later use
-            query.s3_url = s3_url
-        else:
-            # Store locally
-            local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
-            with local_txt_file.open("w", encoding="utf-8") as f:
-                f.write(digest_content)
-
+        _store_digest_content(query, clone_config, digest_content, summary, tree, content)
     except Exception as exc:
         _print_error(query.url, exc, max_file_size, pattern_type, pattern)
         return IngestErrorResponse(error=str(exc))
@@ -123,15 +313,7 @@ async def process_query(
         summary=summary,
     )
 
-    # Generate digest_url based on S3 configuration
-    if is_s3_enabled():
-        digest_url = getattr(query, "s3_url", None)
-        if not digest_url:
-            # This should not happen if S3 upload was successful
-            msg = "S3 is enabled but no S3 URL was generated"
-            raise RuntimeError(msg)
-    else:
-        digest_url = f"/api/download/file/{query.id}"
+    digest_url = _generate_digest_url(query)
 
     return IngestSuccessResponse(
         repo_url=input_text,