Skip to content

Commit 2cc7ae7

Browse files
committed
cache refactor
1 parent 27ec42d commit 2cc7ae7

File tree

4 files changed

+72
-101
lines changed

4 files changed

+72
-101
lines changed

augmenta/augmenta.py

Lines changed: 2 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from augmenta.utils.prompt_formatter import format_examples, format_prompt
1111
from augmenta.agent import AugmentaAgent
1212
from augmenta.cache import CacheManager
13-
from augmenta.cache.process import handle_process_resumption, setup_caching, apply_cached_results
13+
from augmenta.cache.process import setup_cache_handling, apply_cached_results
1414
from augmenta.config.read_config import load_config, get_config_values
1515
import logfire
1616

@@ -73,52 +73,6 @@ def load_input_data(config_data: Dict[str, Any]) -> pd.DataFrame:
7373
raise ValueError(f"Failed to read input CSV file '{input_csv}': {str(e)}")
7474

7575

76-
def setup_cache_handling(
    config_data: Dict[str, Any],
    config_path: Path,
    cache_enabled: bool,
    process_id: Optional[str],
    auto_resume: bool,
    df: pd.DataFrame
) -> Tuple[Optional[str], Optional[CacheManager], Dict[int, Any]]:
    """Prepare the caching layer for a run.

    First resolves whether an earlier unfinished process should be
    resumed, then initialises (or re-attaches to) the cache for the
    current run.

    Args:
        config_data: Parsed configuration values (must contain "input_csv")
        config_path: Location of the configuration file on disk
        cache_enabled: False disables caching entirely
        process_id: Explicit process ID to resume, if any
        auto_resume: Whether to offer resuming a matching unfinished process
        df: Input data already loaded from CSV

    Returns:
        Tuple of (process ID, cache manager, cached results); all-empty
        values when caching is disabled.
    """
    # Caching switched off: nothing to resume, nothing to record.
    if not cache_enabled:
        return None, None, {}

    csv_path = config_data["input_csv"]

    # Decide whether a previous unfinished process should be resumed.
    process_id = handle_process_resumption(
        config_data=config_data,
        config_path=config_path,
        csv_path=csv_path,
        no_cache=not cache_enabled,
        resume=process_id,
        no_auto_resume=not auto_resume,
    )

    # Create (or re-attach to) the cache-backed process record.
    cache_manager, process_id, cached_results = setup_caching(
        config_data=config_data,
        csv_path=csv_path,
        cache_enabled=cache_enabled,
        df_length=len(df),
        process_id=process_id,
    )

    return process_id, cache_manager, cached_results
120-
121-
12276
async def process_augmenta(
12377
config_path: Union[str, Path],
12478
cache_enabled: bool = True,
@@ -152,8 +106,7 @@ async def process_augmenta(
152106

153107
# Set up agent
154108
agent = setup_agent(config_data)
155-
156-
# Load input data
109+
# Load input data
157110
df = load_input_data(config_data)
158111

159112
# Handle caching setup

augmenta/cache/__init__.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,20 @@
66
from .exceptions import CacheError, DatabaseError, ValidationError
77
from .models import ProcessStatus
88
from .process import (
9-
handle_process_resumption,
9+
get_cache_manager,
1010
handle_cache_cleanup,
11-
setup_caching,
11+
setup_cache_handling,
1212
apply_cached_results
1313
)
1414
from .manager import CacheManager
1515

1616
__all__ = [
1717
# Core cache management
1818
'CacheManager',
19-
20-
# Process handling
21-
'handle_process_resumption',
19+
'get_cache_manager',
20+
# Process handling
2221
'handle_cache_cleanup',
23-
'setup_caching',
22+
'setup_cache_handling',
2423
'apply_cached_results',
2524

2625
# Models and exceptions

augmenta/cache/manager.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,20 +33,22 @@ def __new__(cls, *args, **kwargs) -> 'CacheManager':
3333
cls._instance = super().__new__(cls)
3434
return cls._instance
3535

36-
    def __init__(self, cache_dir: Optional[Path] = None, auto_cleanup_days: int = 30) -> None:
        # Guarded by the class-level lock so two threads constructing the
        # singleton concurrently cannot both run the initialisation body.
        with self._lock:
            # Singleton: __new__ returns the shared instance, so __init__
            # runs on every CacheManager() call — bail out after the first.
            if hasattr(self, 'initialized'):
                return

            # Default cache location: .augmenta/cache under the current
            # working directory.
            self.cache_dir = cache_dir or Path(os.getcwd()) / '.augmenta' / 'cache'
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            self.db_path = self.cache_dir / 'cache.db'
            # Retention window: process rows older than this many days are
            # purged by _cleanup_old_processes() below.
            self.auto_cleanup_days = auto_cleanup_days

            # Pending writes are queued here; presumably drained by the
            # thread started in _start_writer_thread() — confirm there.
            self.write_queue = Queue()
            self.is_running = True

            self.db = DatabaseConnection(self.db_path)
            self._start_writer_thread()
            self._cleanup_old_processes()  # Auto-cleanup on startup
            # Flush/stop the writer machinery on interpreter exit.
            atexit.register(self.cleanup)
            self.initialized = True
5254

@@ -181,6 +183,17 @@ def mark_process_completed(self, process_id: str) -> None:
181183
(datetime.now(), process_id)
182184
))
183185

186+
def _cleanup_old_processes(self) -> None:
187+
"""Clean up processes older than the specified days."""
188+
try:
189+
cutoff = datetime.now() - timedelta(days=self.auto_cleanup_days)
190+
with self.db.get_connection() as conn:
191+
result = conn.execute("DELETE FROM processes WHERE last_updated < ?", (cutoff,))
192+
if result.rowcount > 0:
193+
logger.info(f"Cleaned up {result.rowcount} old processes from cache")
194+
except Exception as e:
195+
logger.error(f"Error during automatic cache cleanup: {e}")
196+
184197
def cleanup_old_processes(self, days: int = 30) -> None:
185198
"""Clean up processes older than specified days."""
186199
validate_int(days, "Days")

augmenta/cache/process.py

Lines changed: 51 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -9,73 +9,79 @@
99
from augmenta.utils.get_hash import get_hash
1010
from .manager import CacheManager
1111

12-
def handle_process_resumption(
    config_data: Dict[str, Any],
    config_path: Path,
    csv_path: Path,
    no_cache: bool = False,
    resume: Optional[str] = None,
    no_auto_resume: bool = False,
    cache_manager: Optional[Any] = None
) -> Optional[str]:
    """Handle process resumption logic.

    Returns the process ID to resume, or None to start fresh. An
    explicit ``resume`` ID — or caching / auto-resume being disabled —
    short-circuits the interactive prompt; otherwise the user is asked
    whether to resume any unfinished process matching this run.
    """
    # An explicit answer (or a disabled feature) ends the decision here.
    if resume or no_cache or no_auto_resume:
        return resume

    manager = cache_manager if cache_manager is not None else CacheManager()

    # Fingerprint the run by hashing both the config and the input CSV.
    fingerprint = get_hash({
        'config': get_hash(config_data),
        'csv': get_hash(csv_path),
    })

    candidate = manager.find_unfinished_process(fingerprint)
    if not candidate:
        return None

    # Show the user what would be resumed before asking.
    click.echo(manager.get_process_summary(candidate))
    if click.confirm("Would you like to resume this process?"):
        return candidate.process_id
    return None
12+
def get_cache_manager() -> CacheManager:
    """Return the shared CacheManager instance.

    CacheManager enforces singleton semantics in ``__new__``, so
    constructing it here always yields the one process-wide instance.
    """
    return CacheManager()
3915

def _fingerprint_run(config_data: Dict[str, Any]) -> str:
    """Hash the config together with its input CSV to identify a run."""
    config_hash = get_hash(config_data)
    csv_hash = get_hash(config_data["input_csv"])
    return get_hash({'config': config_hash, 'csv': csv_hash})


def setup_cache_handling(
    config_data: Dict[str, Any],
    config_path: Path,
    cache_enabled: bool,
    process_id: Optional[str],
    auto_resume: bool,
    df: pd.DataFrame
) -> Tuple[Optional[str], Optional[CacheManager], Dict[int, Any]]:
    """Set up caching configuration.

    Args:
        config_data: Configuration dictionary (must contain "input_csv")
        config_path: Path to configuration file (unused here; kept for
            interface stability)
        cache_enabled: Whether caching is enabled
        process_id: Optional process ID for resuming
        auto_resume: Whether to auto-resume previous processes
        df: Loaded DataFrame

    Returns:
        Tuple of (process ID, cache manager, cached results)
    """
    if not cache_enabled:
        return None, None, {}

    # Initialize cache manager once (singleton)
    cache_manager = get_cache_manager()

    # Only look for a resumable process when no explicit ID was given
    # and auto-resume is allowed.
    if not process_id and auto_resume:
        combined_hash = _fingerprint_run(config_data)

        # Offer to resume a matching unfinished process
        if unfinished_process := cache_manager.find_unfinished_process(combined_hash):
            summary = cache_manager.get_process_summary(unfinished_process)
            click.echo(summary)
            if click.confirm("Would you like to resume this process?"):
                process_id = unfinished_process.process_id

    # Set up or resume process
    if not process_id:
        # Start new process (hash computation shared via _fingerprint_run
        # instead of being duplicated inline)
        process_id = cache_manager.start_process(_fingerprint_run(config_data), len(df))
    else:
        # Update existing process
        with cache_manager.db.get_connection() as conn:
            conn.execute(
                "UPDATE processes SET status = 'running', last_updated = ? WHERE process_id = ?",
                (datetime.now(), process_id)
            )

    # Get cached results
    cached_results = cache_manager.get_cached_results(process_id)

    return process_id, cache_manager, cached_results
7076

7177
def apply_cached_results(
7278
df: pd.DataFrame,
7379
process_id: str,
74-
cache_manager: Optional[Any] = None
80+
cache_manager: Optional[CacheManager] = None
7581
) -> pd.DataFrame:
7682
"""Apply cached results to a DataFrame."""
7783
if cache_manager is None:
78-
cache_manager = CacheManager()
84+
cache_manager = get_cache_manager()
7985

8086
cached_results = cache_manager.get_cached_results(process_id)
8187
for row_index, result in cached_results.items():
@@ -86,7 +92,7 @@ def apply_cached_results(
8692
def handle_cache_cleanup(cache_manager: Optional[Any] = None) -> None:
8793
"""Clean up cache by removing the cache database file."""
8894
if cache_manager is None:
89-
cache_manager = CacheManager()
95+
cache_manager = get_cache_manager()
9096

9197
try:
9298
# Ensure all pending writes are processed

0 commit comments

Comments
 (0)