Changes from all commits

22 commits
fdddd11  Pause crawls instead of stopping when quotas are reached (tw4l, Nov 5, 2025)
584cabe  Update nightly tests (tw4l, Nov 6, 2025)
c771d9c  Update frontend for new paused states (tw4l, Nov 11, 2025)
9970f26  Fix comments (tw4l, Nov 11, 2025)
5c8df2d  Fix status.stopReason handling for paused states (tw4l, Nov 11, 2025)
2061ae2  Fix datetime deprecation in nightly test fixture (tw4l, Nov 11, 2025)
be4c622  WIP: Mark current issues with some TODOs (tw4l, Nov 11, 2025)
61c931c  WIP: Add debug logging to beginning of sync_crawls (tw4l, Nov 12, 2025)
11dfbbc  Modify execution time test to account for pausing (tw4l, Nov 12, 2025)
076827c  WIP: Add email notification (tw4l, Nov 12, 2025)
120ce92  Inc org bytes stored when crawl files are added, not at end of crawl (tw4l, Nov 12, 2025)
77d5a6b  More incremental storage work (tw4l, Nov 12, 2025)
e5b89b8  One more TODO (tw4l, Nov 12, 2025)
054cb55  Move paused with no stop reason condition below quota checks (tw4l, Nov 13, 2025)
4f3dc0b  WIP: Don't double-count already-uploaded WACZs in check (tw4l, Nov 13, 2025)
c6fe30a  Decrement org in delete_failed_crawl_files (tw4l, Nov 18, 2025)
262b135  Shorten docstring (tw4l, Nov 18, 2025)
c6a4765  Fix email sending (but still not yet idempotent) (tw4l, Nov 18, 2025)
1472129  Only send auto-paused emails once (tw4l, Nov 18, 2025)
c15c1ec  Add TODO to address already-existing bug that now matters more (tw4l, Nov 18, 2025)
7a0a515  TEMP: Add print logging to help figure out bug (tw4l, Nov 24, 2025)
6432f42  Semi-solution with comments describing why it's not perfect (tw4l, Nov 24, 2025)
3 changes: 2 additions & 1 deletion backend/btrixcloud/basecrawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID):
"""Delete crawl files for failed crawl"""
crawl = await self.get_base_crawl(crawl_id)
org = await self.orgs.get_org_by_id(oid)
await self._delete_crawl_files(crawl, org)
deleted_file_size = await self._delete_crawl_files(crawl, org)
await self.crawls.find_one_and_update(
{"_id": crawl_id, "oid": oid},
{
Expand All @@ -441,6 +441,7 @@ async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID):
}
},
)
await self.orgs.inc_org_bytes_stored(oid, -deleted_file_size, "crawl")

async def delete_all_crawl_qa_files(self, crawl_id: str, org: Organization):
"""Delete files for all qa runs in a crawl"""
Expand Down
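Taken together, the two hunks make failed-crawl cleanup self-accounting: _delete_crawl_files now reports the bytes it freed, and the org's stored-bytes counter is decremented in the same pass rather than reconciled at end of crawl. A minimal sketch of that contract, with made-up stand-ins for the real ops classes (OrgStore and both function bodies below are illustrative assumptions, not Btrix code):

import asyncio

class OrgStore:
    """Illustrative stand-in for org ops; tracks a per-org byte counter."""

    def __init__(self):
        self.bytes_stored = {}

    async def inc_org_bytes_stored(self, oid, size, kind):
        # Negative size decrements, mirroring the -deleted_file_size call above
        self.bytes_stored[oid] = self.bytes_stored.get(oid, 0) + size

async def delete_failed_crawl_files(orgs, oid, file_sizes):
    # Stand-in for _delete_crawl_files: returns the total bytes it removed
    deleted_file_size = sum(file_sizes)
    await orgs.inc_org_bytes_stored(oid, -deleted_file_size, "crawl")

orgs = OrgStore()
orgs.bytes_stored["org-1"] = 10_000
asyncio.run(delete_failed_crawl_files(orgs, "org-1", [1024, 2048]))
print(orgs.bytes_stored["org-1"])  # 6928: counter drops by exactly the deleted size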
64 changes: 58 additions & 6 deletions backend/btrixcloud/crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import urllib.parse
from datetime import datetime
from uuid import UUID
import asyncio

from typing import (
Annotated,
Expand Down Expand Up @@ -79,6 +80,8 @@
MatchCrawlQueueResponse,
CrawlLogLine,
TagsResponse,
TYPE_AUTO_PAUSED_STATES,
UserRole,
)


Expand All @@ -93,7 +96,12 @@ class CrawlOps(BaseCrawlOps):

crawl_manager: CrawlManager

def __init__(self, crawl_manager: CrawlManager, log_ops: CrawlLogOps, *args):
def __init__(
self,
crawl_manager: CrawlManager,
log_ops: CrawlLogOps,
*args,
):
super().__init__(*args)
self.crawl_manager = crawl_manager
self.log_ops = log_ops
Expand Down Expand Up @@ -371,6 +379,20 @@ async def get_active_crawls_size(self, oid: UUID) -> int:

return results[0].get("totalSum") or 0

async def get_active_crawls_uploaded_wacz_size(self, oid: UUID) -> int:
"""get size of all waczs already uploaded for running/paused crawls"""
cursor = self.crawls.aggregate(
[
{"$match": {"state": {"$in": RUNNING_AND_WAITING_STATES}, "oid": oid}},
{"$group": {"_id": None, "totalSum": {"$sum": "$fileSize"}}},
]
)
results = await cursor.to_list(length=1)
if not results:
return 0

return results[0].get("totalSum") or 0

async def delete_crawls(
self,
org: Organization,
Expand Down Expand Up @@ -812,7 +834,11 @@ async def get_crawl_stats(
return crawls_data

async def pause_crawl(
self, crawl_id: str, org: Organization, pause: bool
self,
crawl_id: str,
org: Organization,
pause: bool,
paused_at: Optional[datetime] = None,
) -> Dict[str, bool]:
"""pause or resume a crawl temporarily"""
crawl = await self.get_base_crawl(crawl_id, org)
Expand All @@ -821,10 +847,8 @@ async def pause_crawl(

result = None

if pause:
if pause and not paused_at:
paused_at = dt_now()
else:
paused_at = None

try:
result = await self.crawl_manager.pause_resume_crawl(
Expand Down Expand Up @@ -1195,6 +1219,30 @@ async def get_crawl_logs(
qa_run_id=qa_run_id,
)

async def notify_org_admins_of_auto_paused_crawl(
self,
paused_reason: TYPE_AUTO_PAUSED_STATES,
cid: UUID,
org: Organization,
):
"""Send email to all org admins about automatically paused crawl"""
users = await self.orgs.get_users_for_org(org, UserRole.OWNER)
workflow = await self.crawl_configs.get_crawl_config_out(cid, org)

await asyncio.gather(
*[
self.user_manager.email.send_crawl_auto_paused(
user.name,
user.email,
paused_reason,
workflow.lastCrawlPausedExpiry,
cid,
org,
)
for user in users
]
)


# ============================================================================
async def recompute_crawl_file_count_and_size(crawls, crawl_id: str):
Expand All @@ -1217,7 +1265,11 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str):
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, too-many-statements
def init_crawls_api(
crawl_manager: CrawlManager, crawl_log_ops: CrawlLogOps, app, user_dep, *args
crawl_manager: CrawlManager,
crawl_log_ops: CrawlLogOps,
app,
user_dep,
*args,
):
"""API for crawl management, including crawl done callback"""
# pylint: disable=invalid-name, duplicate-code
Expand Down
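The new get_active_crawls_uploaded_wacz_size method is a two-stage aggregation: $match selects running or waiting crawls in the org, and $group sums their fileSize. A rough pure-Python equivalent of that pipeline, assuming crawl documents are plain dicts and abbreviating the real state list (both are simplifications for illustration):

RUNNING_AND_WAITING_STATES = [
    "starting", "waiting_capacity", "waiting_org_limit",
    "paused", "paused_storage_quota_reached", "running",
]  # abbreviated; the real list is composed in models.py

def active_crawls_uploaded_wacz_size(crawls, oid):
    # $match stage: active (running or waiting) crawls belonging to this org
    matched = [
        c for c in crawls
        if c.get("oid") == oid and c.get("state") in RUNNING_AND_WAITING_STATES
    ]
    # $group stage: sum fileSize, treating missing values as 0
    return sum(c.get("fileSize") or 0 for c in matched)

crawls = [
    {"oid": "o1", "state": "running", "fileSize": 500},
    {"oid": "o1", "state": "paused_storage_quota_reached", "fileSize": 700},
    {"oid": "o1", "state": "complete", "fileSize": 900},  # not active, excluded
]
print(active_crawls_uploaded_wacz_size(crawls, "o1"))  # 1200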
29 changes: 29 additions & 0 deletions backend/btrixcloud/emailsender.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
Organization,
InvitePending,
Subscription,
TYPE_AUTO_PAUSED_STATES,
)
from .utils import is_bool, get_origin

Expand Down Expand Up @@ -250,3 +251,31 @@ async def send_subscription_trial_ending_soon(
behavior_on_trial_end=behavior_on_trial_end,
support_email=self.support_email,
)

async def send_crawl_auto_paused(
self,
user_name: str,
receiver_email: str,
paused_reason: TYPE_AUTO_PAUSED_STATES,
paused_expiry: datetime,
cid: UUID,
org: Organization,
headers=None,
):
"""Send email indicating crawl was paused due to quota or disabled crawling"""

origin = get_origin(headers)
org_url = f"{origin}/orgs/{org.slug}"
workflow_url = f"{org_url}/workflows/{cid}/latest"

await self._send_encrypted(
receiver_email,
"crawlAutoPaused",
org_name=org.name,
user_name=user_name,
paused_reason=paused_reason,
paused_expiry=paused_expiry.isoformat(),
org_url=org_url,
workflow_url=workflow_url,
support_email=self.support_email,
)
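send_crawl_auto_paused derives deep links back to the org and the workflow's latest crawl before handing everything to the templated sender. A quick sketch of that URL construction with invented placeholder values (the origin and slug here are assumptions; in the real call the origin comes from get_origin(headers) and the slug from org.slug):

from uuid import uuid4

origin = "https://btrix.example.org"  # placeholder; derived from request headers in Btrix
org_slug = "my-org"                   # placeholder for org.slug
cid = uuid4()                         # workflow (crawlconfig) id

org_url = f"{origin}/orgs/{org_slug}"
workflow_url = f"{org_url}/workflows/{cid}/latest"
print(workflow_url)  # https://btrix.example.org/orgs/my-org/workflows/<cid>/latest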
28 changes: 24 additions & 4 deletions backend/btrixcloud/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,30 @@ class UserOrgInfoOut(BaseModel):
]
RUNNING_STATES = get_args(TYPE_RUNNING_STATES)

TYPE_WAITING_STATES = Literal[
"starting", "waiting_capacity", "waiting_org_limit", "paused"
TYPE_MANUALLY_PAUSED_STATES = Literal["paused"]

TYPE_AUTO_PAUSED_STATES = Literal[
"paused_storage_quota_reached",
"paused_time_quota_reached",
"paused_org_readonly",
]
AUTO_PAUSED_STATES = get_args(TYPE_AUTO_PAUSED_STATES)

TYPE_PAUSED_STATES = Literal[
TYPE_MANUALLY_PAUSED_STATES,
TYPE_AUTO_PAUSED_STATES,
]
WAITING_STATES = get_args(TYPE_WAITING_STATES)
PAUSED_STATES = get_args(TYPE_PAUSED_STATES)

TYPE_WAITING_NOT_PAUSED_STATES = Literal[
"starting",
"waiting_capacity",
"waiting_org_limit",
]
WAITING_NOT_PAUSED_STATES = get_args(TYPE_WAITING_NOT_PAUSED_STATES)

TYPE_WAITING_STATES = Literal[TYPE_PAUSED_STATES, TYPE_WAITING_NOT_PAUSED_STATES]
WAITING_STATES = [*PAUSED_STATES, *WAITING_NOT_PAUSED_STATES]

TYPE_FAILED_STATES = Literal[
"canceled",
Expand All @@ -260,7 +280,7 @@ class UserOrgInfoOut(BaseModel):
"stopped_org_readonly",
]
SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES)
SUCCESSFUL_AND_PAUSED_STATES = ["paused", *SUCCESSFUL_STATES]
SUCCESSFUL_AND_PAUSED_STATES = [*PAUSED_STATES, *SUCCESSFUL_STATES]

TYPE_RUNNING_AND_WAITING_STATES = Literal[TYPE_WAITING_STATES, TYPE_RUNNING_STATES]
RUNNING_AND_WAITING_STATES = [*WAITING_STATES, *RUNNING_STATES]
Expand Down
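One property this refactor leans on: typing.Literal flattens nested Literal members, so composing TYPE_PAUSED_STATES out of the manual and auto variants still lets get_args return a flat tuple of state strings. A small self-check of that standard typing behavior:

from typing import Literal, get_args

TYPE_MANUALLY_PAUSED_STATES = Literal["paused"]
TYPE_AUTO_PAUSED_STATES = Literal[
    "paused_storage_quota_reached",
    "paused_time_quota_reached",
    "paused_org_readonly",
]
TYPE_PAUSED_STATES = Literal[TYPE_MANUALLY_PAUSED_STATES, TYPE_AUTO_PAUSED_STATES]

# Nested Literals flatten, so all four paused state strings come back:
assert get_args(TYPE_PAUSED_STATES) == (
    "paused",
    "paused_storage_quota_reached",
    "paused_time_quota_reached",
    "paused_org_readonly",
)
print(get_args(TYPE_PAUSED_STATES))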