Commit a44deb7

Merge pull request #428 from MODSetter/dev
feat: added periodic indexing for indexable search source connectors
2 parents 70808eb + aed8163 commit a44deb7

File tree: 17 files changed, +998 −51 lines

README.md

Lines changed: 0 additions & 8 deletions
```diff
@@ -136,14 +136,6 @@ Check out our public roadmap and contribute your ideas or feedback:
 
 **View the Roadmap:** [SurfSense Roadmap on GitHub Projects](https://github.com/users/MODSetter/projects/2)
 
-## ⚠️ Important Announcement
-
-**AWS and Vercel are currently experiencing outages.** We deployed a major update to SurfSense last night and have updated our documentation accordingly with important setup and configuration changes. Unfortunately, these documentation updates cannot be deployed to our main site (surfsense.com) due to the ongoing outages.
-
-**Please view our documentation directly on GitHub:**
-📚 [SurfSense Documentation](https://github.com/MODSetter/SurfSense/tree/main/surfsense_web/content/docs)
-
-We apologize for any inconvenience and appreciate your patience!
 
 ## How to get started?
 
```

docker-compose.yml

Lines changed: 19 additions & 0 deletions
```diff
@@ -74,6 +74,25 @@ services:
       - redis
       - backend
 
+  celery_beat:
+    build: ./surfsense_backend
+    # image: ghcr.io/modsetter/surfsense_backend:latest
+    command: celery -A app.celery_app beat --loglevel=info
+    volumes:
+      - ./surfsense_backend:/app
+      - shared_temp:/tmp
+    env_file:
+      - ./surfsense_backend/.env
+    environment:
+      - DATABASE_URL=postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@db:5432/${POSTGRES_DB:-surfsense}
+      - CELERY_BROKER_URL=redis://redis:${REDIS_PORT:-6379}/0
+      - CELERY_RESULT_BACKEND=redis://redis:${REDIS_PORT:-6379}/0
+      - PYTHONPATH=/app
+    depends_on:
+      - db
+      - redis
+      - celery_worker
+
   # flower:
   #   build: ./surfsense_backend
   #   # image: ghcr.io/modsetter/surfsense_backend:latest
```
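Once the stack is up, one quick sanity check is to print the Beat schedule that `app/celery_app.py` (changed later in this commit) registers — a minimal check run inside the container, e.g. via `docker compose exec celery_beat python`; this snippet is not part of the commit itself:

```python
# The "check-periodic-connector-schedules" entry added by this commit
# should appear here once the celery_beat service is running.
from app.celery_app import celery_app

for name, entry in celery_app.conf.beat_schedule.items():
    print(name, entry["task"], entry["schedule"])
```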

surfsense_backend/.env.example

Lines changed: 17 additions & 0 deletions
```diff
@@ -3,6 +3,23 @@ DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
 #Celery Config
 CELERY_BROKER_URL=redis://localhost:6379/0
 CELERY_RESULT_BACKEND=redis://localhost:6379/0
+# Periodic task interval
+# # Run every minute (default)
+# SCHEDULE_CHECKER_INTERVAL=1m
+
+# # Run every 5 minutes
+# SCHEDULE_CHECKER_INTERVAL=5m
+
+# # Run every 10 minutes
+# SCHEDULE_CHECKER_INTERVAL=10m
+
+# # Run every hour
+# SCHEDULE_CHECKER_INTERVAL=1h
+
+# # Run every 2 hours
+# SCHEDULE_CHECKER_INTERVAL=2h
+
+SCHEDULE_CHECKER_INTERVAL=5m
 
 SECRET_KEY=SECRET
 NEXT_FRONTEND_URL=http://localhost:3000
```

surfsense_backend/.gitignore

Lines changed: 5 additions & 1 deletion
```diff
@@ -6,4 +6,8 @@ __pycache__/
 .flashrank_cache
 surf_new_backend.egg-info/
 podcasts/
-temp_audio/
+temp_audio/
+celerybeat-schedule*
+celerybeat-schedule.*
+celerybeat-schedule.dir
+celerybeat-schedule.bak
```

surfsense_backend/alembic/versions/25_migrate_llm_configs_to_search_spaces.py

Lines changed: 33 additions & 18 deletions
```diff
@@ -55,30 +55,45 @@ def upgrade() -> None:
 
     # ===== STEP 2: Populate search_space_id with user's first search space =====
     # This ensures existing LLM configs are assigned to a valid search space
-    op.execute(
-        """
-        UPDATE llm_configs lc
-        SET search_space_id = (
-            SELECT id
-            FROM searchspaces ss
-            WHERE ss.user_id = lc.user_id
-            ORDER BY ss.created_at ASC
-            LIMIT 1
+    # Only run this if user_id column exists on llm_configs
+    if "user_id" in llm_config_columns:
+        op.execute(
+            """
+            UPDATE llm_configs lc
+            SET search_space_id = (
+                SELECT id
+                FROM searchspaces ss
+                WHERE ss.user_id = lc.user_id
+                ORDER BY ss.created_at ASC
+                LIMIT 1
+            )
+            WHERE search_space_id IS NULL AND user_id IS NOT NULL
+            """
     )
-        WHERE search_space_id IS NULL AND user_id IS NOT NULL
-        """
-    )
 
     # ===== STEP 3: Make search_space_id NOT NULL and add FK constraint =====
-    op.alter_column(
-        "llm_configs",
-        "search_space_id",
-        nullable=False,
+    # Check if there are any rows with NULL search_space_id
+    # If llm_configs table is empty or all rows have search_space_id, we can proceed
+    result = conn.execute(
+        sa.text("SELECT COUNT(*) FROM llm_configs WHERE search_space_id IS NULL")
     )
+    null_count = result.scalar()
+
+    if null_count == 0 or "user_id" in llm_config_columns:
+        # Safe to make NOT NULL
+        op.alter_column(
+            "llm_configs",
+            "search_space_id",
+            nullable=False,
+        )
+    else:
+        # If there are NULL values and no user_id to migrate from, skip making it NOT NULL
+        # This would happen if llm_configs already exists without user_id
+        pass
 
-    # Add foreign key constraint
+    # Add foreign key constraint only if search_space_id is NOT NULL
     foreign_keys = [fk["name"] for fk in inspector.get_foreign_keys("llm_configs")]
-    if "fk_llm_configs_search_space_id" not in foreign_keys:
+    if "fk_llm_configs_search_space_id" not in foreign_keys and null_count == 0:
         op.create_foreign_key(
             "fk_llm_configs_search_space_id",
             "llm_configs",
```
surfsense_backend/alembic/versions/ (new migration file, revision 32)

Lines changed: 94 additions & 0 deletions

```python
"""Add periodic indexing fields to search_source_connectors

Revision ID: 32
Revises: 31

Changes:
1. Add periodic_indexing_enabled column (Boolean, default False)
2. Add indexing_frequency_minutes column (Integer, nullable)
3. Add next_scheduled_at column (TIMESTAMP with timezone, nullable)
"""

from collections.abc import Sequence

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "32"
down_revision: str | None = "31"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


def upgrade() -> None:
    """Add periodic indexing fields to search_source_connectors table."""

    from sqlalchemy import inspect

    conn = op.get_bind()
    inspector = inspect(conn)

    # Get existing columns
    connector_columns = [
        col["name"] for col in inspector.get_columns("search_source_connectors")
    ]

    # Add periodic_indexing_enabled column if it doesn't exist
    if "periodic_indexing_enabled" not in connector_columns:
        op.add_column(
            "search_source_connectors",
            sa.Column(
                "periodic_indexing_enabled",
                sa.Boolean(),
                nullable=False,
                server_default="false",
            ),
        )

    # Add indexing_frequency_minutes column if it doesn't exist
    if "indexing_frequency_minutes" not in connector_columns:
        op.add_column(
            "search_source_connectors",
            sa.Column(
                "indexing_frequency_minutes",
                sa.Integer(),
                nullable=True,
            ),
        )

    # Add next_scheduled_at column if it doesn't exist
    if "next_scheduled_at" not in connector_columns:
        op.add_column(
            "search_source_connectors",
            sa.Column(
                "next_scheduled_at",
                sa.TIMESTAMP(timezone=True),
                nullable=True,
            ),
        )


def downgrade() -> None:
    """Remove periodic indexing fields from search_source_connectors table."""

    from sqlalchemy import inspect

    conn = op.get_bind()
    inspector = inspect(conn)

    # Get existing columns
    connector_columns = [
        col["name"] for col in inspector.get_columns("search_source_connectors")
    ]

    # Drop columns if they exist
    if "next_scheduled_at" in connector_columns:
        op.drop_column("search_source_connectors", "next_scheduled_at")

    if "indexing_frequency_minutes" in connector_columns:
        op.drop_column("search_source_connectors", "indexing_frequency_minutes")

    if "periodic_indexing_enabled" in connector_columns:
        op.drop_column("search_source_connectors", "periodic_indexing_enabled")
```

surfsense_backend/app/celery_app.py

Lines changed: 55 additions & 6 deletions
```diff
@@ -3,6 +3,7 @@
 import os
 
 from celery import Celery
+from celery.schedules import crontab
 from dotenv import load_dotenv
 
 # Load environment variables
@@ -12,6 +13,46 @@
 CELERY_BROKER_URL = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
 CELERY_RESULT_BACKEND = os.getenv("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
 
+# Get schedule checker interval from environment
+# Format: "<number><unit>" where unit is 'm' (minutes) or 'h' (hours)
+# Examples: "1m" (every minute), "5m" (every 5 minutes), "1h" (every hour)
+SCHEDULE_CHECKER_INTERVAL = os.getenv("SCHEDULE_CHECKER_INTERVAL", "2m")
+
+
+def parse_schedule_interval(interval: str) -> dict:
+    """Parse interval string into crontab parameters.
+
+    Args:
+        interval: String like "1m", "5m", "1h", etc.
+
+    Returns:
+        Dict with crontab parameters (minute, hour)
+    """
+    interval = interval.strip().lower()
+
+    # Extract number and unit
+    if interval.endswith("m") or interval.endswith("min"):
+        # Minutes
+        num = int(interval.rstrip("min"))
+        if num == 1:
+            return {"minute": "*", "hour": "*"}
+        else:
+            return {"minute": f"*/{num}", "hour": "*"}
+    elif interval.endswith("h") or interval.endswith("hour"):
+        # Hours
+        num = int(interval.rstrip("hour"))
+        if num == 1:
+            return {"minute": "0", "hour": "*"}
+        else:
+            return {"minute": "0", "hour": f"*/{num}"}
+    else:
+        # Default to every minute if parsing fails
+        return {"minute": "*", "hour": "*"}
+
+
+# Parse the schedule interval
+schedule_params = parse_schedule_interval(SCHEDULE_CHECKER_INTERVAL)
+
 # Create Celery app
 celery_app = Celery(
     "surfsense",
```
```diff
@@ -21,6 +62,7 @@
         "app.tasks.celery_tasks.document_tasks",
         "app.tasks.celery_tasks.podcast_tasks",
         "app.tasks.celery_tasks.connector_tasks",
+        "app.tasks.celery_tasks.schedule_checker_task",
     ],
 )
 
@@ -47,13 +89,20 @@
     task_reject_on_worker_lost=True,
     # Broker settings
     broker_connection_retry_on_startup=True,
+    # Beat scheduler settings
+    beat_max_loop_interval=60,  # Check every minute
 )
 
-# Optional: Configure Celery Beat for periodic tasks
+# Configure Celery Beat schedule
+# This uses a meta-scheduler pattern: instead of creating individual Beat schedules
+# for each connector, we have ONE schedule that checks the database at the configured interval
+# for connectors that need indexing. This provides dynamic scheduling without restarts.
 celery_app.conf.beat_schedule = {
-    # Example: Add periodic tasks here if needed
-    # "periodic-task-name": {
-    #     "task": "app.tasks.celery_tasks.some_task",
-    #     "schedule": crontab(minute=0, hour=0),  # Run daily at midnight
-    # },
+    "check-periodic-connector-schedules": {
+        "task": "check_periodic_schedules",
+        "schedule": crontab(**schedule_params),
+        "options": {
+            "expires": 30,  # Task expires after 30 seconds if not picked up
+        },
+    },
 }
```
surfsense_backend/app/db.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -285,6 +285,11 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
     last_indexed_at = Column(TIMESTAMP(timezone=True), nullable=True)
     config = Column(JSON, nullable=False)
 
+    # Periodic indexing fields
+    periodic_indexing_enabled = Column(Boolean, nullable=False, default=False)
+    indexing_frequency_minutes = Column(Integer, nullable=True)
+    next_scheduled_at = Column(TIMESTAMP(timezone=True), nullable=True)
+
     search_space_id = Column(
         Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
     )
```
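With these columns in place, computing the next run is simple arithmetic on the model. A hedged sketch (the commit's own update logic lives in the scheduler task, not this diff):

```python
from datetime import datetime, timedelta, timezone


def compute_next_scheduled_at(connector) -> datetime | None:
    """Next run time for a SearchSourceConnector, or None when periodic
    indexing is disabled or no frequency is configured (sketch only)."""
    if not connector.periodic_indexing_enabled:
        return None
    if not connector.indexing_frequency_minutes:
        return None
    return datetime.now(timezone.utc) + timedelta(
        minutes=connector.indexing_frequency_minutes
    )
```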
