diff --git a/crates/forge_api/src/api.rs b/crates/forge_api/src/api.rs index c254e6c5ef..2a58651881 100644 --- a/crates/forge_api/src/api.rs +++ b/crates/forge_api/src/api.rs @@ -207,7 +207,6 @@ pub trait API: Sync + Send { async fn sync_workspace( &self, path: PathBuf, - batch_size: usize, ) -> Result>>; /// Query the indexed workspace diff --git a/crates/forge_api/src/forge_api.rs b/crates/forge_api/src/forge_api.rs index 1aa57c769a..037ae8565f 100644 --- a/crates/forge_api/src/forge_api.rs +++ b/crates/forge_api/src/forge_api.rs @@ -360,9 +360,8 @@ impl< async fn sync_workspace( &self, path: PathBuf, - batch_size: usize, ) -> Result>> { - self.services.sync_workspace(path, batch_size).await + self.services.sync_workspace(path).await } async fn query_workspace( diff --git a/crates/forge_app/src/services.rs b/crates/forge_app/src/services.rs index 49ab6e61c8..0e1525af5f 100644 --- a/crates/forge_app/src/services.rs +++ b/crates/forge_app/src/services.rs @@ -305,7 +305,6 @@ pub trait WorkspaceService: Send + Sync { async fn sync_workspace( &self, path: PathBuf, - batch_size: usize, ) -> anyhow::Result>>; /// Query the indexed workspace with semantic search @@ -1110,11 +1109,8 @@ impl WorkspaceService for I { async fn sync_workspace( &self, path: PathBuf, - batch_size: usize, ) -> anyhow::Result>> { - self.workspace_service() - .sync_workspace(path, batch_size) - .await + self.workspace_service().sync_workspace(path).await } async fn query_workspace( diff --git a/crates/forge_main/src/built_in_commands.json b/crates/forge_main/src/built_in_commands.json index 459c7028c6..f5714f07ed 100644 --- a/crates/forge_main/src/built_in_commands.json +++ b/crates/forge_main/src/built_in_commands.json @@ -103,6 +103,10 @@ "command": "sync-info", "description": "Show workspace information with sync details" }, + { + "command": "sync-init", + "description": "Initialize a new workspace without syncing files" + }, { "command": "clone", "description": "Clone and manage conversation 
context" diff --git a/crates/forge_main/src/cli.rs b/crates/forge_main/src/cli.rs index a4a914c1c4..252a81c04b 100644 --- a/crates/forge_main/src/cli.rs +++ b/crates/forge_main/src/cli.rs @@ -242,9 +242,10 @@ pub enum WorkspaceCommand { #[arg(default_value = ".")] path: PathBuf, - /// Number of files to process concurrently - #[arg(long, default_value = "100")] - batch_size: usize, + /// Automatically initialize the workspace before syncing if it has not + /// been initialized yet. + #[arg(long)] + init: bool, }, /// List all workspaces. List { diff --git a/crates/forge_main/src/ui.rs b/crates/forge_main/src/ui.rs index 6dac57b508..ad3231cdf1 100644 --- a/crates/forge_main/src/ui.rs +++ b/crates/forge_main/src/ui.rs @@ -600,8 +600,8 @@ impl A + Send + Sync> UI { } TopLevelCommand::Workspace(index_group) => { match index_group.command { - crate::cli::WorkspaceCommand::Sync { path, batch_size } => { - self.on_index(path, batch_size).await?; + crate::cli::WorkspaceCommand::Sync { path, init } => { + self.on_index(path, init).await?; } crate::cli::WorkspaceCommand::List { porcelain } => { self.on_list_workspaces(porcelain).await?; @@ -1980,8 +1980,7 @@ impl A + Send + Sync> UI { } SlashCommand::Index => { let working_dir = self.state.cwd.clone(); - // Use default batch size of 100 for slash command - self.on_index(working_dir, 100).await?; + self.on_index(working_dir, false).await?; } SlashCommand::AgentSwitch(agent_id) => { // Validate that the agent exists by checking against loaded agents @@ -3634,11 +3633,7 @@ impl A + Send + Sync> UI { Ok(()) } - async fn on_index( - &mut self, - path: std::path::PathBuf, - batch_size: usize, - ) -> anyhow::Result<()> { + async fn on_index(&mut self, path: std::path::PathBuf, init: bool) -> anyhow::Result<()> { use forge_domain::SyncProgress; use forge_spinner::ProgressBarManager; @@ -3647,7 +3642,17 @@ impl A + Send + Sync> UI { self.init_forge_services().await?; } - let mut stream = self.api.sync_workspace(path.clone(), 
batch_size).await?; + // When init is set, check if the workspace is already initialized + // via get_workspace_info before calling init, so we only initialize + // when a workspace does not yet exist for the given path. + if init { + let workspace_info = self.api.get_workspace_info(path.clone()).await?; + if workspace_info.is_none() { + self.on_workspace_init(path.clone()).await?; + } + } + + let mut stream = self.api.sync_workspace(path.clone()).await?; let mut progress_bar = ProgressBarManager::default(); while let Some(event) = stream.next().await { @@ -3998,19 +4003,20 @@ impl A + Send + Sync> UI { /// Initialize workspace for a directory without syncing files async fn on_workspace_init(&mut self, path: std::path::PathBuf) -> anyhow::Result<()> { + // Check if auth already exists and create if needed + if !self.api.is_authenticated().await? { + self.init_forge_services().await?; + } + self.spinner.start(Some("Initializing workspace"))?; let workspace_id = self.api.init_workspace(path.clone()).await?; self.spinner.stop(None)?; - // Resolve and display the path - let canonical_path = path.canonicalize().unwrap_or_else(|_| path.clone()); - self.writeln_title( TitleFormat::info("Workspace initialized successfully") - .sub_title(format!("Path: {}", canonical_path.display())) - .sub_title(format!("Workspace ID: {}", workspace_id)), + .sub_title(format!("{}", workspace_id)), )?; Ok(()) diff --git a/crates/forge_services/src/context_engine.rs b/crates/forge_services/src/context_engine.rs index 51299ca6bf..a194350754 100644 --- a/crates/forge_services/src/context_engine.rs +++ b/crates/forge_services/src/context_engine.rs @@ -1,12 +1,12 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::path::{Path, PathBuf}; -use std::sync::{Arc, LazyLock}; +use std::sync::Arc; use anyhow::{Context, Result}; use async_trait::async_trait; use forge_app::{ - CommandInfra, EnvironmentInfra, FileReaderInfra, SyncProgressCounter, WalkedFile, Walker, - 
WalkerInfra, WorkspaceService, WorkspaceStatus, compute_hash, + CommandInfra, EnvironmentInfra, FileReaderInfra, SyncProgressCounter, WalkerInfra, + WorkspaceService, WorkspaceStatus, compute_hash, }; use forge_domain::{ AuthCredential, AuthDetails, FileHash, FileNode, ProviderId, ProviderRepository, SyncProgress, @@ -17,7 +17,7 @@ use futures::future::join_all; use futures::stream::{Stream, StreamExt}; use tracing::{info, warn}; -use crate::error::Error as ServiceError; +use crate::fd::{FileDiscovery, discover_sync_file_paths}; /// Error type for a single file that could not be read during workspace /// operations, carrying the file path for downstream reporting. @@ -29,58 +29,74 @@ struct FileReadError { source: anyhow::Error, } -static ALLOWED_EXTENSIONS: LazyLock> = LazyLock::new(|| { - let extensions_str = include_str!("allowed_extensions.txt"); - extensions_str - .lines() - .map(|line| line.trim().to_lowercase()) - .filter(|line| !line.is_empty()) - .collect() -}); - -/// Loads allowed file extensions from allowed_extensions.txt into a HashSet -fn allowed_extensions() -> &'static HashSet { - &ALLOWED_EXTENSIONS +/// Canonicalizes `path`, attaching a context message that includes the original +/// path on failure. +fn canonicalize_path(path: PathBuf) -> Result { + path.canonicalize() + .with_context(|| format!("Failed to resolve path: {}", path.display())) } -/// Checks if a file has an allowed extension for workspace syncing (O(1) -/// lookup) -fn has_allowed_extension(path: &Path) -> bool { - if let Some(extension) = path.extension() { - let ext = extension.to_string_lossy().to_lowercase(); - allowed_extensions().contains(&ext) - } else { - false - } +/// Extracts [`forge_domain::FileStatus`] entries with +/// [`forge_domain::SyncStatus::Failed`] from a slice of file-read results by +/// downcasting errors to [`FileReadError`]. 
+fn extract_failed_statuses(results: &[Result]) -> Vec { + results + .iter() + .filter_map(|r| r.as_ref().err()) + .filter_map(|e| e.downcast_ref::()) + .map(|e| { + forge_domain::FileStatus::new( + e.path.to_string_lossy().into_owned(), + forge_domain::SyncStatus::Failed, + ) + }) + .collect() } -/// Service for indexing workspaces and performing semantic search -pub struct ForgeWorkspaceService { +/// Service for indexing workspaces and performing semantic search. +/// +/// `F` provides infrastructure capabilities (file I/O, environment, etc.) and +/// `D` is the file-discovery strategy used to enumerate workspace files. +pub struct ForgeWorkspaceService { infra: Arc, + discovery: Arc, } -impl Clone for ForgeWorkspaceService { +impl Clone for ForgeWorkspaceService { fn clone(&self) -> Self { - Self { infra: Arc::clone(&self.infra) } + Self { + infra: Arc::clone(&self.infra), + discovery: Arc::clone(&self.discovery), + } } } -impl ForgeWorkspaceService { - /// Creates a new indexing service with the provided infrastructure. - pub fn new(infra: Arc) -> Self { - Self { infra } +impl ForgeWorkspaceService { + /// Creates a new workspace service with the provided infrastructure and + /// file-discovery strategy. + pub fn new(infra: Arc, discovery: Arc) -> Self { + Self { infra, discovery } } +} +impl< + F: 'static + + ProviderRepository + + WorkspaceIndexRepository + + FileReaderInfra + + EnvironmentInfra + + CommandInfra + + WalkerInfra, + D: FileDiscovery + 'static, +> ForgeWorkspaceService +{ /// Fetches remote file hashes from the server. 
async fn fetch_remote_hashes( &self, user_id: &UserId, workspace_id: &WorkspaceId, auth_token: &forge_domain::ApiKey, - ) -> anyhow::Result> - where - F: WorkspaceIndexRepository, - { + ) -> anyhow::Result> { info!(workspace_id = %workspace_id, "Fetching existing file hashes from server to detect changes..."); let workspace_files = forge_domain::CodeBase::new(user_id.clone(), workspace_id.clone(), ()); @@ -90,25 +106,6 @@ impl ForgeWorkspaceS .await } - /// Deletes a batch of files from the server. - async fn delete( - &self, - user_id: &UserId, - workspace_id: &WorkspaceId, - token: &forge_domain::ApiKey, - paths: Vec, - ) -> Result<()> - where - F: WorkspaceIndexRepository, - { - let deletion = forge_domain::CodeBase::new(user_id.clone(), workspace_id.clone(), paths); - - self.infra - .delete_files(&deletion, token) - .await - .context("Failed to delete files") - } - /// Deletes files from the workspace and updates the progress counter. /// /// Returns the number of files that were successfully deleted. @@ -118,16 +115,20 @@ impl ForgeWorkspaceS workspace_id: &WorkspaceId, token: &forge_domain::ApiKey, files_to_delete: Vec, - ) -> Result - where - F: WorkspaceIndexRepository, - { + ) -> Result { if files_to_delete.is_empty() { return Ok(0); } - self.delete(user_id, workspace_id, token, files_to_delete.clone()) - .await?; + let deletion = forge_domain::CodeBase::new( + user_id.clone(), + workspace_id.clone(), + files_to_delete.clone(), + ); + self.infra + .delete_files(&deletion, token) + .await + .context("Failed to delete files")?; for path in &files_to_delete { info!(workspace_id = %workspace_id, path = %path, "File deleted successfully"); @@ -136,26 +137,6 @@ impl ForgeWorkspaceS Ok(files_to_delete.len()) } - /// Uploads a batch of files to the server. 
- async fn upload( - &self, - user_id: &UserId, - workspace_id: &WorkspaceId, - token: &forge_domain::ApiKey, - files: Vec, - ) -> Result<()> - where - F: WorkspaceIndexRepository, - { - let upload = forge_domain::CodeBase::new(user_id.clone(), workspace_id.clone(), files); - - self.infra - .upload_files(&upload, token) - .await - .context("Failed to upload files")?; - Ok(()) - } - /// Uploads files in parallel, returning a stream of results. /// /// The caller is responsible for processing the stream and tracking @@ -167,10 +148,7 @@ impl ForgeWorkspaceS token: &forge_domain::ApiKey, files: Vec, batch_size: usize, - ) -> impl Stream> + Send - where - F: WorkspaceIndexRepository, - { + ) -> impl Stream> + Send { let user_id = user_id.clone(); let workspace_id = workspace_id.clone(); let token = token.clone(); @@ -188,8 +166,15 @@ impl ForgeWorkspaceS let file_path = file.path.clone(); async move { info!(workspace_id = %workspace_id, path = %file_path, "File sync started"); - self.upload(&user_id, &workspace_id, &token, vec![file]) - .await?; + let upload = forge_domain::CodeBase::new( + user_id.clone(), + workspace_id.clone(), + vec![file], + ); + self.infra + .upload_files(&upload, &token) + .await + .context("Failed to upload files")?; info!(workspace_id = %workspace_id, path = %file_path, "File sync completed"); Ok::<_, anyhow::Error>(1) } @@ -198,19 +183,8 @@ impl ForgeWorkspaceS } /// Internal sync implementation that emits progress events. 
- async fn sync_codebase_internal( - &self, - path: PathBuf, - batch_size: usize, - emit: E, - ) -> Result<()> + async fn sync_codebase_internal(&self, path: PathBuf, emit: E) -> Result<()> where - F: ProviderRepository - + WorkspaceIndexRepository - + FileReaderInfra - + EnvironmentInfra - + CommandInfra - + WalkerInfra, E: Fn(SyncProgress) -> Fut + Send + Sync, Fut: std::future::Future + Send, { @@ -219,46 +193,39 @@ impl ForgeWorkspaceS emit(SyncProgress::Starting).await; let (token, user_id) = self.get_workspace_credentials().await?; - let path = path - .canonicalize() - .with_context(|| format!("Failed to resolve path: {}", path.display()))?; + let batch_size = self.infra.get_environment().max_file_read_batch_size; + let path = canonicalize_path(path)?; + + // Find existing workspace - do NOT auto-create + let workspace = self.get_workspace_by_path(path, &token).await?; + + let workspace_id = workspace.workspace_id.clone(); - // Initialize workspace (finds existing or creates new) - let (is_new_workspace, workspace_id) = self._init_workspace(path.clone()).await?; + // Use the canonical root stored in the workspace record so that file + // discovery and remote-hash comparison are always relative to the same + // base, even when `path` is a subdirectory of an ancestor workspace. 
+ let workspace_root = PathBuf::from(&workspace.working_dir); // Read all files and compute hashes from the workspace root path emit(SyncProgress::DiscoveringFiles { - path: path.clone(), + path: workspace_root.clone(), workspace_id: workspace_id.clone(), }) .await; let results: Vec> = self - .read_files(batch_size, &path, &workspace_id) + .read_files(batch_size, &workspace_root, &workspace_id) .collect() .await; - let failed_statuses: Vec = results - .iter() - .filter_map(|r| r.as_ref().err()) - .filter_map(|e| e.downcast_ref::()) - .map(|e| { - forge_domain::FileStatus::new( - e.path.to_string_lossy().into_owned(), - forge_domain::SyncStatus::Failed, - ) - }) - .collect(); + let failed_statuses = extract_failed_statuses(&results); let local_files: Vec = results.into_iter().flatten().collect(); let total_file_count = local_files.len() + failed_statuses.len(); emit(SyncProgress::FilesDiscovered { count: total_file_count }).await; - let remote_files = if is_new_workspace { - Vec::new() - } else { - self.fetch_remote_hashes(&user_id, &workspace_id, &token) - .await? 
- }; + let remote_files = self + .fetch_remote_hashes(&user_id, &workspace_id, &token) + .await?; emit(SyncProgress::ComparingFiles { remote_files: remote_files.len(), @@ -266,7 +233,7 @@ impl ForgeWorkspaceS }) .await; - let plan = WorkspaceStatus::new(path.clone(), remote_files); + let plan = WorkspaceStatus::new(workspace_root.clone(), remote_files); let local_file_hashes: Vec = local_files.iter().cloned().map(Into::into).collect(); let mut statuses = plan.file_statuses(local_file_hashes); @@ -366,10 +333,7 @@ impl ForgeWorkspaceS /// # Errors /// Returns an error if the credential is not found, if there's a database /// error, or if the credential format is invalid - async fn get_workspace_credentials(&self) -> Result<(forge_domain::ApiKey, UserId)> - where - F: ProviderRepository, - { + async fn get_workspace_credentials(&self) -> Result<(forge_domain::ApiKey, UserId)> { let credential = self .infra .get_credential(&ProviderId::FORGE_SERVICES) @@ -408,13 +372,8 @@ impl ForgeWorkspaceS &self, path: PathBuf, token: &forge_domain::ApiKey, - ) -> Result> - where - F: WorkspaceIndexRepository, - { - let canonical_path = path - .canonicalize() - .with_context(|| format!("Failed to resolve path: {}", path.display()))?; + ) -> Result> { + let canonical_path = canonicalize_path(path)?; // Get all workspaces from remote server let workspaces = self.infra.list_workspaces(token).await?; @@ -442,133 +401,23 @@ impl ForgeWorkspaceS Ok(best_match.map(|(w, _)| w.clone())) } - /// Runs `git ls-files` in `dir_path` and returns the tracked files as - /// `WalkedFile` entries. + + /// Looks up the workspace for `path` and returns it, or an error if no + /// workspace has been indexed for that path. /// /// # Errors /// - /// Returns an error when the command fails to execute or exits with a - /// non-zero status code (e.g. when the directory is not a git repository). 
- async fn git_ls_files(&self, dir_path: &Path) -> anyhow::Result> - where - F: CommandInfra, - { - let output = self - .infra - .execute_command( - "git ls-files".to_string(), - dir_path.to_path_buf(), - true, - None, - ) - .await?; - - if output.exit_code != Some(0) { - let err = anyhow::anyhow!(output.stderr); - return Err(match output.exit_code { - Some(code) => err.context(format!("'git ls-files' exited with code {}", code)), - None => err, - }); - } - - let files = output - .stdout - .lines() - .filter(|line| !line.is_empty()) - .map(|line| { - let path = line.trim().to_string(); - let file_name = std::path::Path::new(&path) - .file_name() - .map(|n| n.to_string_lossy().to_string()); - WalkedFile { path, file_name, size: 0 } - }) - .collect(); - - Ok(files) - } - - /// Walks a directory using the file-system walker and returns all - /// non-directory entries. - async fn walk_directory( + /// Returns an error when the underlying repository lookup fails, or when no + /// matching workspace is found (i.e. the workspace has not been indexed + /// yet). + async fn get_workspace_by_path( &self, - dir_path: &Path, - workspace_id: &WorkspaceId, - ) -> anyhow::Result> - where - F: WalkerInfra, - { - let walker_config = Walker::unlimited() - .cwd(dir_path.to_path_buf()) - .skip_binary(true); - match self - .infra - .walk(walker_config) - .await - .context("Failed to walk directory") - { - Ok(files) => { - let files: Vec<_> = files.into_iter().filter(|f| !f.is_dir()).collect(); - info!(workspace_id = %workspace_id, file_count = files.len(), "Discovered files via walker fallback"); - Ok(files) - } - Err(err) => { - warn!(workspace_id = %workspace_id, error = ?err, "Failed to get files via walker fallback"); - Err(err) - } - } - } - - /// Discovers workspace files and filters them by allowed source extensions. 
- async fn discover_sync_file_paths( - &self, - dir_path: &Path, - workspace_id: &WorkspaceId, - ) -> anyhow::Result> - where - F: CommandInfra + WalkerInfra, - { - info!(workspace_id = %workspace_id, "Discovering files for sync via git ls-files"); - // `git ls-files` can succeed yet return an empty list (e.g. a freshly - // initialized repo with no commits, or a directory outside the working - // tree). Treat that the same as a failure and fall back to the walker so - // we still discover files on disk. - let walked_files: Vec = match self.git_ls_files(dir_path).await { - Ok(walked_files) if !walked_files.is_empty() => { - info!(workspace_id = %workspace_id, file_count = walked_files.len(), "Discovered files via git ls-files"); - walked_files - } - Ok(_) => { - warn!(workspace_id = %workspace_id, "git ls-files returned no files, falling back to walker"); - self.walk_directory(dir_path, workspace_id).await? - } - Err(err) => { - warn!(workspace_id = %workspace_id, error = ?err, "Failed to get files via git ls-files, falling back to walker"); - self.walk_directory(dir_path, workspace_id).await? - } - }; - - let filtered_files: Vec<_> = walked_files - .into_iter() - .filter(|walked| { - let file_path = dir_path.join(&walked.path); - has_allowed_extension(&file_path) - }) - .collect(); - - info!( - workspace_id = %workspace_id, - filtered_count = filtered_files.len(), - "Files after extension filtering" - ); - - if filtered_files.is_empty() { - return Err(ServiceError::NoSourceFilesFound.into()); - } - - Ok(filtered_files - .iter() - .map(|walked| dir_path.join(&walked.path)) - .collect()) + path: PathBuf, + token: &forge_domain::ApiKey, + ) -> Result { + self.find_workspace_by_path(path, token) + .await? + .context("Workspace not indexed. Please run `forge workspace init` first.") } /// Only includes files with allowed extensions. 
@@ -577,17 +426,18 @@ impl ForgeWorkspaceS batch_size: usize, dir_path: &Path, workspace_id: &WorkspaceId, - ) -> impl Stream> + Send - where - F: FileReaderInfra + EnvironmentInfra + CommandInfra + WalkerInfra, - { + ) -> impl Stream> + Send { let dir_path = dir_path.to_path_buf(); let infra = self.infra.clone(); - let service = self.clone(); + let discovery = self.discovery.clone(); let workspace_id = workspace_id.clone(); async_stream::stream! { - let file_paths: Vec = match service.discover_sync_file_paths(&dir_path, &workspace_id).await { + let file_paths: Vec = match discover_sync_file_paths( + discovery.as_ref(), + &dir_path, + &workspace_id, + ).await { Ok(file_paths) => file_paths, Err(err) => { yield Err(err); @@ -595,8 +445,8 @@ impl ForgeWorkspaceS } }; - // Use read_batch_utf8 with streaming for better memory efficiency with large - // file sets + // Use read_batch_utf8 with streaming for better memory efficiency + // with large file sets let stream = infra.read_batch_utf8(batch_size, file_paths); futures::pin_mut!(stream); @@ -618,9 +468,7 @@ impl ForgeWorkspaceS async fn _init_workspace(&self, path: PathBuf) -> Result<(bool, WorkspaceId)> { let (token, _user_id) = self.get_workspace_credentials().await?; - let path = path - .canonicalize() - .with_context(|| format!("Failed to resolve path: {}", path.display()))?; + let path = canonicalize_path(path)?; // Find workspace by exact match or ancestor from remote server let workspace = self.find_workspace_by_path(path.clone(), &token).await?; @@ -659,13 +507,10 @@ impl< + CommandInfra + WalkerInfra + 'static, -> WorkspaceService for ForgeWorkspaceService + D: FileDiscovery + 'static, +> WorkspaceService for ForgeWorkspaceService { - async fn sync_workspace( - &self, - path: PathBuf, - batch_size: usize, - ) -> Result>> { + async fn sync_workspace(&self, path: PathBuf) -> Result>> { let service = Clone::clone(self); let stream = MpscStream::spawn(move |tx| async move { @@ -678,7 +523,7 @@ impl< }; // Run 
the sync and emit progress events - let result = service.sync_codebase_internal(path, batch_size, emit).await; + let result = service.sync_codebase_internal(path, emit).await; // If there was an error, send it through the channel if let Err(e) = result { @@ -726,10 +571,10 @@ impl< } /// Retrieves workspace information for a specific path. - async fn get_workspace_info(&self, path: PathBuf) -> Result> - where - F: WorkspaceIndexRepository + ProviderRepository, - { + async fn get_workspace_info( + &self, + path: PathBuf, + ) -> Result> { let (token, _user_id) = self.get_workspace_credentials().await?; let workspace = self.find_workspace_by_path(path, &token).await?; @@ -792,10 +637,7 @@ impl< async fn get_workspace_status(&self, path: PathBuf) -> Result> { let (token, user_id) = self.get_workspace_credentials().await?; - let workspace = self - .find_workspace_by_path(path, &token) - .await? - .context("Workspace not indexed. Please run `workspace sync` first.")?; + let workspace = self.get_workspace_by_path(path, &token).await?; // Reuse the canonical path already stored in the workspace (resolved during // sync), avoiding a redundant canonicalize() IO call. 
@@ -807,18 +649,7 @@ impl< .collect() .await; - let mut failed_statuses: Vec = results - .iter() - .filter_map(|r| r.as_ref().err()) - .filter_map(|e| e.downcast_ref::()) - .map(|e| { - forge_domain::FileStatus::new( - e.path.to_string_lossy().into_owned(), - forge_domain::SyncStatus::Failed, - ) - }) - .collect(); - + let mut failed_statuses = extract_failed_statuses(&results); let local_files: Vec = results.into_iter().flatten().collect(); let remote_files = self diff --git a/crates/forge_services/src/fd.rs b/crates/forge_services/src/fd.rs new file mode 100644 index 0000000000..ff9bb80550 --- /dev/null +++ b/crates/forge_services/src/fd.rs @@ -0,0 +1,118 @@ +use std::collections::HashSet; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, LazyLock}; + +use async_trait::async_trait; +use forge_app::{CommandInfra, WalkerInfra}; +use forge_domain::WorkspaceId; +use tracing::{info, warn}; + +use crate::error::Error as ServiceError; +use crate::fd_git::FsGit; +use crate::fd_walker::FdWalker; + +pub(crate) static ALLOWED_EXTENSIONS: LazyLock> = LazyLock::new(|| { + let extensions_str = include_str!("allowed_extensions.txt"); + extensions_str + .lines() + .map(|line| line.trim().to_lowercase()) + .filter(|line| !line.is_empty()) + .collect() +}); + +/// Returns `true` if `path` carries an extension present in the allowed +/// extensions list. +pub(crate) fn has_allowed_extension(path: &Path) -> bool { + if let Some(ext) = path.extension() { + ALLOWED_EXTENSIONS.contains(&ext.to_string_lossy().to_lowercase() as &str) + } else { + false + } +} + +/// Filters relative path strings down to those with an allowed extension, +/// resolves each against `dir_path`, and returns them as absolute `PathBuf`s. +/// +/// Returns an error when the filtered list is empty, indicating no indexable +/// source files exist in the workspace. 
+pub(crate) fn filter_and_resolve( + dir_path: &Path, + paths: impl IntoIterator, +) -> anyhow::Result> { + let filtered: Vec = paths + .into_iter() + .map(|p| dir_path.join(&p)) + .filter(|p| has_allowed_extension(p)) + .collect(); + + if filtered.is_empty() { + return Err(ServiceError::NoSourceFilesFound.into()); + } + + Ok(filtered) +} + +/// Trait for discovering the list of files in a workspace directory that +/// should be considered for synchronisation. +/// +/// Implementations may use different strategies (e.g. `git ls-files` or a +/// plain filesystem walk) to enumerate files. The returned paths are absolute. +#[async_trait] +pub trait FileDiscovery: Send + Sync { + /// Returns the absolute paths of all files to be indexed under `dir_path`. + /// + /// # Errors + /// + /// Returns an error if the discovery strategy fails and no files can be + /// enumerated. + async fn discover(&self, dir_path: &Path) -> anyhow::Result>; +} + +/// Discovers workspace files using a `FileDiscovery` implementation and logs +/// progress associated with `workspace_id`. +pub async fn discover_sync_file_paths( + discovery: &impl FileDiscovery, + dir_path: &Path, + workspace_id: &WorkspaceId, +) -> anyhow::Result> { + info!(workspace_id = %workspace_id, "Discovering files for sync"); + let files = discovery.discover(dir_path).await?; + info!( + workspace_id = %workspace_id, + count = files.len(), + "Files discovered and filtered for sync" + ); + Ok(files) +} + +/// A `FileDiscovery` implementation that routes between [`FsGit`] and +/// [`FdWalker`]. +/// +/// It first attempts git-based discovery. If git is unavailable, returns no +/// files, or fails for any reason, it transparently falls back to the filesystem +/// walker so that workspaces without git history are still indexed correctly.
+pub struct FdDefault { + git: FsGit, + walker: FdWalker, +} + +impl FdDefault { + /// Creates a new `FdDefault` using the provided infrastructure + /// for both the git and walker strategies. + pub fn new(infra: Arc) -> Self { + Self { git: FsGit::new(infra.clone()), walker: FdWalker::new(infra) } + } +} + +#[async_trait] +impl FileDiscovery for FdDefault { + async fn discover(&self, dir_path: &Path) -> anyhow::Result> { + match self.git.discover(dir_path).await { + Ok(files) => Ok(files), + Err(err) => { + warn!(error = ?err, "git-based file discovery failed, falling back to walker"); + self.walker.discover(dir_path).await + } + } + } +} diff --git a/crates/forge_services/src/fd_git.rs b/crates/forge_services/src/fd_git.rs new file mode 100644 index 0000000000..30178f0bac --- /dev/null +++ b/crates/forge_services/src/fd_git.rs @@ -0,0 +1,76 @@ +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use async_trait::async_trait; +use forge_app::CommandInfra; +use tracing::info; + +use crate::fd::{FileDiscovery, filter_and_resolve}; + +/// File discovery implementation backed by `git ls-files`. +/// +/// Returns the files tracked by git in the repository rooted at `dir_path`. +/// This is the preferred strategy when the workspace is a git repository with +/// at least one commit. +pub struct FsGit { + infra: Arc, +} + +impl FsGit { + /// Creates a new `FsGit` using the provided infrastructure. + pub fn new(infra: Arc) -> Self { + Self { infra } + } +} + +impl FsGit { + /// Runs `git ls-files` in `dir_path` and returns the resulting file paths. + /// + /// # Errors + /// + /// Returns an error when the command cannot be executed or exits with a + /// non-zero status (e.g. the directory is not a git repository).
+ async fn git_ls_files(&self, dir_path: &Path) -> anyhow::Result> { + let output = self + .infra + .execute_command( + "git ls-files".to_string(), + dir_path.to_path_buf(), + true, + None, + ) + .await?; + + if output.exit_code != Some(0) { + let err = anyhow::anyhow!(output.stderr); + return Err(match output.exit_code { + Some(code) => err.context(format!("'git ls-files' exited with code {}", code)), + None => err, + }); + } + + let paths = output + .stdout + .lines() + .filter(|line| !line.is_empty()) + .map(|line| line.trim().to_string()) + .collect(); + + Ok(paths) + } +} + +#[async_trait] +impl FileDiscovery for FsGit { + async fn discover(&self, dir_path: &Path) -> anyhow::Result> { + let paths = self.git_ls_files(dir_path).await?; + if paths.is_empty() { + return Err(anyhow::anyhow!("git ls-files returned no files")); + } + info!( + file_count = paths.len(), + "Discovered files via git ls-files" + ); + filter_and_resolve(dir_path, paths) + } +} diff --git a/crates/forge_services/src/fd_walker.rs b/crates/forge_services/src/fd_walker.rs new file mode 100644 index 0000000000..792d2edc05 --- /dev/null +++ b/crates/forge_services/src/fd_walker.rs @@ -0,0 +1,49 @@ +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use anyhow::Context; +use async_trait::async_trait; +use forge_app::{Walker, WalkerInfra}; +use tracing::info; + +use crate::fd::{FileDiscovery, filter_and_resolve}; + +/// File discovery implementation backed by the filesystem walker. +/// +/// Walks the directory tree under `dir_path` using the configured `WalkerInfra` +/// implementation. This is used as a fallback when git-based discovery is +/// unavailable or returns no files. +pub struct FdWalker { + infra: Arc, +} + +impl FdWalker { + /// Creates a new `FdWalker` using the provided infrastructure.
+ pub fn new(infra: Arc) -> Self { + Self { infra } + } +} + +#[async_trait] +impl FileDiscovery for FdWalker { + async fn discover(&self, dir_path: &Path) -> anyhow::Result> { + let walker_config = Walker::unlimited() + .cwd(dir_path.to_path_buf()) + .skip_binary(true); + + let files = self + .infra + .walk(walker_config) + .await + .context("Failed to walk directory")?; + + let paths: Vec = files + .into_iter() + .filter(|f| !f.is_dir()) + .map(|f| f.path) + .collect(); + + info!(file_count = paths.len(), "Discovered files via walker"); + filter_and_resolve(dir_path, paths) + } +} diff --git a/crates/forge_services/src/forge_services.rs b/crates/forge_services/src/forge_services.rs index c6d94adf42..7a7a4c9865 100644 --- a/crates/forge_services/src/forge_services.rs +++ b/crates/forge_services/src/forge_services.rs @@ -20,6 +20,7 @@ use crate::command::CommandLoaderService as ForgeCommandLoaderService; use crate::conversation::ForgeConversationService; use crate::discovery::ForgeDiscoveryService; use crate::env::ForgeEnvironmentService; +use crate::fd::FdDefault; use crate::instructions::ForgeCustomInstructionsService; use crate::mcp::{ForgeMcpManager, ForgeMcpService}; use crate::policy::ForgePolicyService; @@ -85,7 +86,7 @@ pub struct ForgeServices< command_loader_service: Arc>, policy_service: ForgePolicyService, provider_auth_service: ForgeProviderAuthService, - workspace_service: Arc>, + workspace_service: Arc>>, skill_service: Arc>, } @@ -141,8 +142,10 @@ impl< let command_loader_service = Arc::new(ForgeCommandLoaderService::new(infra.clone())); let policy_service = ForgePolicyService::new(infra.clone()); let provider_auth_service = ForgeProviderAuthService::new(infra.clone()); + let discovery = Arc::new(FdDefault::new(infra.clone())); let workspace_service = Arc::new(crate::context_engine::ForgeWorkspaceService::new( infra.clone(), + discovery, )); let skill_service = Arc::new(ForgeSkillFetch::new(infra.clone())); @@ -241,7 +244,7 @@ impl< type 
CommandLoaderService = ForgeCommandLoaderService; type PolicyService = ForgePolicyService; type ProviderService = ForgeProviderService; - type WorkspaceService = crate::context_engine::ForgeWorkspaceService; + type WorkspaceService = crate::context_engine::ForgeWorkspaceService>; type SkillFetchService = ForgeSkillFetch; fn config_service(&self) -> &Self::AppConfigService { diff --git a/crates/forge_services/src/lib.rs b/crates/forge_services/src/lib.rs index 67d2b55844..925b68e19b 100644 --- a/crates/forge_services/src/lib.rs +++ b/crates/forge_services/src/lib.rs @@ -9,6 +9,9 @@ mod conversation; mod discovery; mod env; mod error; +mod fd; +mod fd_git; +mod fd_walker; mod forge_services; mod instructions; mod mcp; diff --git a/shell-plugin/lib/actions/config.zsh b/shell-plugin/lib/actions/config.zsh index 59fa4e17aa..324cac1b24 100644 --- a/shell-plugin/lib/actions/config.zsh +++ b/shell-plugin/lib/actions/config.zsh @@ -245,7 +245,14 @@ function _forge_action_sync() { echo # Execute sync with stdin redirected to prevent hanging # Sync doesn't need interactive input, so close stdin immediately - _forge_exec workspace sync