Commit a44deb7

Merge pull request #428 from MODSetter/dev
feat: added periodic indexing for indexable search source connectors
2 parents 70808eb + aed8163 commit a44deb7

File tree: 17 files changed, +998 −51 lines

README.md

Lines changed: 0 additions & 8 deletions
```diff
@@ -136,14 +136,6 @@ Check out our public roadmap and contribute your ideas or feedback:
 
 **View the Roadmap:** [SurfSense Roadmap on GitHub Projects](https://github.com/users/MODSetter/projects/2)
 
-## ⚠️ Important Announcement
-
-**AWS and Vercel are currently experiencing outages.** We deployed a major update to SurfSense last night and have updated our documentation accordingly with important setup and configuration changes. Unfortunately, these documentation updates cannot be deployed to our main site (surfsense.com) due to the ongoing outages.
-
-**Please view our documentation directly on GitHub:**
-📚 [SurfSense Documentation](https://github.com/MODSetter/SurfSense/tree/main/surfsense_web/content/docs)
-
-We apologize for any inconvenience and appreciate your patience!
 
 ## How to get started?
 
```

docker-compose.yml

Lines changed: 19 additions & 0 deletions
```diff
@@ -74,6 +74,25 @@ services:
       - redis
       - backend
 
+  celery_beat:
+    build: ./surfsense_backend
+    # image: ghcr.io/modsetter/surfsense_backend:latest
+    command: celery -A app.celery_app beat --loglevel=info
+    volumes:
+      - ./surfsense_backend:/app
+      - shared_temp:/tmp
+    env_file:
+      - ./surfsense_backend/.env
+    environment:
+      - DATABASE_URL=postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-postgres}@db:5432/${POSTGRES_DB:-surfsense}
+      - CELERY_BROKER_URL=redis://redis:${REDIS_PORT:-6379}/0
+      - CELERY_RESULT_BACKEND=redis://redis:${REDIS_PORT:-6379}/0
+      - PYTHONPATH=/app
+    depends_on:
+      - db
+      - redis
+      - celery_worker
+
   # flower:
   #   build: ./surfsense_backend
   #   # image: ghcr.io/modsetter/surfsense_backend:latest
```
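Once the stack is up, one quick sanity check is to print the Beat schedule that `app/celery_app.py` (changed later in this commit) registers — a minimal check run inside the container, e.g. via `docker compose exec celery_beat python`; this snippet is not part of the commit itself:

```python
# The "check-periodic-connector-schedules" entry added by this commit
# should appear here once the celery_beat service is running.
from app.celery_app import celery_app

for name, entry in celery_app.conf.beat_schedule.items():
    print(name, entry["task"], entry["schedule"])
```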

surfsense_backend/.env.example

Lines changed: 17 additions & 0 deletions
```diff
@@ -3,6 +3,23 @@ DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
 #Celery Config
 CELERY_BROKER_URL=redis://localhost:6379/0
 CELERY_RESULT_BACKEND=redis://localhost:6379/0
+# Periodic task interval
+# # Run every minute (default)
+# SCHEDULE_CHECKER_INTERVAL=1m
+
+# # Run every 5 minutes
+# SCHEDULE_CHECKER_INTERVAL=5m
+
+# # Run every 10 minutes
+# SCHEDULE_CHECKER_INTERVAL=10m
+
+# # Run every hour
+# SCHEDULE_CHECKER_INTERVAL=1h
+
+# # Run every 2 hours
+# SCHEDULE_CHECKER_INTERVAL=2h
+
+SCHEDULE_CHECKER_INTERVAL=5m
 
 SECRET_KEY=SECRET
 NEXT_FRONTEND_URL=http://localhost:3000
```

surfsense_backend/.gitignore

Lines changed: 5 additions & 1 deletion
```diff
@@ -6,4 +6,8 @@ __pycache__/
 .flashrank_cache
 surf_new_backend.egg-info/
 podcasts/
-temp_audio/
+temp_audio/
+celerybeat-schedule*
+celerybeat-schedule.*
+celerybeat-schedule.dir
+celerybeat-schedule.bak
```

surfsense_backend/alembic/versions/25_migrate_llm_configs_to_search_spaces.py

Lines changed: 33 additions & 18 deletions
```diff
@@ -55,30 +55,45 @@ def upgrade() -> None:
 
     # ===== STEP 2: Populate search_space_id with user's first search space =====
     # This ensures existing LLM configs are assigned to a valid search space
-    op.execute(
-        """
-        UPDATE llm_configs lc
-        SET search_space_id = (
-            SELECT id
-            FROM searchspaces ss
-            WHERE ss.user_id = lc.user_id
-            ORDER BY ss.created_at ASC
-            LIMIT 1
+    # Only run this if user_id column exists on llm_configs
+    if "user_id" in llm_config_columns:
+        op.execute(
+            """
+            UPDATE llm_configs lc
+            SET search_space_id = (
+                SELECT id
+                FROM searchspaces ss
+                WHERE ss.user_id = lc.user_id
+                ORDER BY ss.created_at ASC
+                LIMIT 1
+            )
+            WHERE search_space_id IS NULL AND user_id IS NOT NULL
+            """
     )
-        WHERE search_space_id IS NULL AND user_id IS NOT NULL
-        """
-    )
 
     # ===== STEP 3: Make search_space_id NOT NULL and add FK constraint =====
-    op.alter_column(
-        "llm_configs",
-        "search_space_id",
-        nullable=False,
+    # Check if there are any rows with NULL search_space_id
+    # If llm_configs table is empty or all rows have search_space_id, we can proceed
+    result = conn.execute(
+        sa.text("SELECT COUNT(*) FROM llm_configs WHERE search_space_id IS NULL")
     )
+    null_count = result.scalar()
+
+    if null_count == 0 or "user_id" in llm_config_columns:
+        # Safe to make NOT NULL
+        op.alter_column(
+            "llm_configs",
+            "search_space_id",
+            nullable=False,
+        )
+    else:
+        # If there are NULL values and no user_id to migrate from, skip making it NOT NULL
+        # This would happen if llm_configs already exists without user_id
+        pass
 
-    # Add foreign key constraint
+    # Add foreign key constraint only if search_space_id is NOT NULL
     foreign_keys = [fk["name"] for fk in inspector.get_foreign_keys("llm_configs")]
-    if "fk_llm_configs_search_space_id" not in foreign_keys:
+    if "fk_llm_configs_search_space_id" not in foreign_keys and null_count == 0:
         op.create_foreign_key(
             "fk_llm_configs_search_space_id",
             "llm_configs",
```
surfsense_backend/alembic/versions/ (new migration file, revision 32)

Lines changed: 94 additions & 0 deletions

```python
"""Add periodic indexing fields to search_source_connectors

Revision ID: 32
Revises: 31

Changes:
1. Add periodic_indexing_enabled column (Boolean, default False)
2. Add indexing_frequency_minutes column (Integer, nullable)
3. Add next_scheduled_at column (TIMESTAMP with timezone, nullable)
"""

from collections.abc import Sequence

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "32"
down_revision: str | None = "31"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


def upgrade() -> None:
    """Add periodic indexing fields to search_source_connectors table."""

    from sqlalchemy import inspect

    conn = op.get_bind()
    inspector = inspect(conn)

    # Get existing columns
    connector_columns = [
        col["name"] for col in inspector.get_columns("search_source_connectors")
    ]

    # Add periodic_indexing_enabled column if it doesn't exist
    if "periodic_indexing_enabled" not in connector_columns:
        op.add_column(
            "search_source_connectors",
            sa.Column(
                "periodic_indexing_enabled",
                sa.Boolean(),
                nullable=False,
                server_default="false",
            ),
        )

    # Add indexing_frequency_minutes column if it doesn't exist
    if "indexing_frequency_minutes" not in connector_columns:
        op.add_column(
            "search_source_connectors",
            sa.Column(
                "indexing_frequency_minutes",
                sa.Integer(),
                nullable=True,
            ),
        )

    # Add next_scheduled_at column if it doesn't exist
    if "next_scheduled_at" not in connector_columns:
        op.add_column(
            "search_source_connectors",
            sa.Column(
                "next_scheduled_at",
                sa.TIMESTAMP(timezone=True),
                nullable=True,
            ),
        )


def downgrade() -> None:
    """Remove periodic indexing fields from search_source_connectors table."""

    from sqlalchemy import inspect

    conn = op.get_bind()
    inspector = inspect(conn)

    # Get existing columns
    connector_columns = [
        col["name"] for col in inspector.get_columns("search_source_connectors")
    ]

    # Drop columns if they exist
    if "next_scheduled_at" in connector_columns:
        op.drop_column("search_source_connectors", "next_scheduled_at")

    if "indexing_frequency_minutes" in connector_columns:
        op.drop_column("search_source_connectors", "indexing_frequency_minutes")

    if "periodic_indexing_enabled" in connector_columns:
        op.drop_column("search_source_connectors", "periodic_indexing_enabled")
```

surfsense_backend/app/celery_app.py

Lines changed: 55 additions & 6 deletions
```diff
@@ -3,6 +3,7 @@
 import os
 
 from celery import Celery
+from celery.schedules import crontab
 from dotenv import load_dotenv
 
 # Load environment variables
@@ -12,6 +13,46 @@
 CELERY_BROKER_URL = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
 CELERY_RESULT_BACKEND = os.getenv("CELERY_RESULT_BACKEND", "redis://localhost:6379/0")
 
+# Get schedule checker interval from environment
+# Format: "<number><unit>" where unit is 'm' (minutes) or 'h' (hours)
+# Examples: "1m" (every minute), "5m" (every 5 minutes), "1h" (every hour)
+SCHEDULE_CHECKER_INTERVAL = os.getenv("SCHEDULE_CHECKER_INTERVAL", "2m")
+
+
+def parse_schedule_interval(interval: str) -> dict:
+    """Parse interval string into crontab parameters.
+
+    Args:
+        interval: String like "1m", "5m", "1h", etc.
+
+    Returns:
+        Dict with crontab parameters (minute, hour)
+    """
+    interval = interval.strip().lower()
+
+    # Extract number and unit
+    if interval.endswith("m") or interval.endswith("min"):
+        # Minutes
+        num = int(interval.rstrip("min"))
+        if num == 1:
+            return {"minute": "*", "hour": "*"}
+        else:
+            return {"minute": f"*/{num}", "hour": "*"}
+    elif interval.endswith("h") or interval.endswith("hour"):
+        # Hours
+        num = int(interval.rstrip("hour"))
+        if num == 1:
+            return {"minute": "0", "hour": "*"}
+        else:
+            return {"minute": "0", "hour": f"*/{num}"}
+    else:
+        # Default to every minute if parsing fails
+        return {"minute": "*", "hour": "*"}
+
+
+# Parse the schedule interval
+schedule_params = parse_schedule_interval(SCHEDULE_CHECKER_INTERVAL)
+
 # Create Celery app
 celery_app = Celery(
     "surfsense",
```
```diff
@@ -21,6 +62,7 @@
         "app.tasks.celery_tasks.document_tasks",
         "app.tasks.celery_tasks.podcast_tasks",
         "app.tasks.celery_tasks.connector_tasks",
+        "app.tasks.celery_tasks.schedule_checker_task",
     ],
 )
 
@@ -47,13 +89,20 @@
     task_reject_on_worker_lost=True,
     # Broker settings
     broker_connection_retry_on_startup=True,
+    # Beat scheduler settings
+    beat_max_loop_interval=60,  # Check every minute
 )
 
-# Optional: Configure Celery Beat for periodic tasks
+# Configure Celery Beat schedule
+# This uses a meta-scheduler pattern: instead of creating individual Beat schedules
+# for each connector, we have ONE schedule that checks the database at the configured interval
+# for connectors that need indexing. This provides dynamic scheduling without restarts.
 celery_app.conf.beat_schedule = {
-    # Example: Add periodic tasks here if needed
-    # "periodic-task-name": {
-    #     "task": "app.tasks.celery_tasks.some_task",
-    #     "schedule": crontab(minute=0, hour=0),  # Run daily at midnight
-    # },
+    "check-periodic-connector-schedules": {
+        "task": "check_periodic_schedules",
+        "schedule": crontab(**schedule_params),
+        "options": {
+            "expires": 30,  # Task expires after 30 seconds if not picked up
+        },
+    },
 }
```
surfsense_backend/app/db.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -285,6 +285,11 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
     last_indexed_at = Column(TIMESTAMP(timezone=True), nullable=True)
     config = Column(JSON, nullable=False)
 
+    # Periodic indexing fields
+    periodic_indexing_enabled = Column(Boolean, nullable=False, default=False)
+    indexing_frequency_minutes = Column(Integer, nullable=True)
+    next_scheduled_at = Column(TIMESTAMP(timezone=True), nullable=True)
+
     search_space_id = Column(
         Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False
     )
```
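With these columns in place, computing the next run is simple arithmetic on the model. A hedged sketch (the commit's own update logic lives in the scheduler task, not this diff):

```python
from datetime import datetime, timedelta, timezone


def compute_next_scheduled_at(connector) -> datetime | None:
    """Next run time for a SearchSourceConnector, or None when periodic
    indexing is disabled or no frequency is configured (sketch only)."""
    if not connector.periodic_indexing_enabled:
        return None
    if not connector.indexing_frequency_minutes:
        return None
    return datetime.now(timezone.utc) + timedelta(
        minutes=connector.indexing_frequency_minutes
    )
```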
