Skip to content

Commit 885870b

Browse files
authored
Merge pull request #405 from unitagain/feature/baidu-search-integration
feat: add Baidu AI Search integration
2 parents fa39176 + a479fea commit 885870b

File tree

12 files changed

+669
-1
lines changed

12 files changed

+669
-1
lines changed
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
"""Add BAIDU_SEARCH_API to searchsourceconnectortype enum
2+
3+
Revision ID: 30
4+
Revises: 29
5+
6+
Changes:
7+
1. Add BAIDU_SEARCH_API value to searchsourceconnectortype enum
8+
2. Add BAIDU_SEARCH_API value to documenttype enum for consistency
9+
"""
10+
11+
from collections.abc import Sequence
12+
13+
from alembic import op
14+
15+
# revision identifiers, used by Alembic.
revision: str = "30"  # this migration's identifier
down_revision: str | None = "29"  # migration applied immediately before this one
branch_labels: str | Sequence[str] | None = None  # no branch labels for this revision
depends_on: str | Sequence[str] | None = None  # no cross-branch dependencies
20+
21+
22+
def upgrade() -> None:
    """Add BAIDU_SEARCH_API to searchsourceconnectortype and documenttype enums.

    Both enums get the same new label so connector type and document type
    stay consistent. The guarded ``ALTER TYPE ... ADD VALUE`` makes the
    migration idempotent: re-running it against a database that already has
    the label is a no-op instead of an error.
    """
    for enum_type in ("searchsourceconnectortype", "documenttype"):
        _add_enum_value_if_missing(enum_type, "BAIDU_SEARCH_API")


def _add_enum_value_if_missing(enum_type: str, label: str) -> None:
    """Append *label* to the PostgreSQL enum *enum_type* unless already present.

    Uses a DO block because ``ALTER TYPE ... ADD VALUE`` has no native
    ``IF NOT EXISTS`` guard on older PostgreSQL versions; existence is
    checked via pg_type/pg_enum first.
    """
    op.execute(
        f"""
        DO $$
        BEGIN
            IF NOT EXISTS (
                SELECT 1 FROM pg_type t
                JOIN pg_enum e ON t.oid = e.enumtypid
                WHERE t.typname = '{enum_type}' AND e.enumlabel = '{label}'
            ) THEN
                ALTER TYPE {enum_type} ADD VALUE '{label}';
            END IF;
        END
        $$;
        """
    )
58+
59+
60+
def downgrade() -> None:
    """No-op: removing PostgreSQL enum values is unsafe, so nothing is undone.

    Dropping a label from a PostgreSQL enum would require deleting every
    row that references BAIDU_SEARCH_API, recreating the enum type without
    the label, and re-adding all other labels. Because that can break or
    destroy existing data, this downgrade is intentionally left empty.
    """
    return None
73+

surfsense_backend/app/agents/researcher/nodes.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,6 +1037,32 @@ async def fetch_relevant_documents(
10371037
}
10381038
)
10391039

1040+
elif connector == "BAIDU_SEARCH_API":
1041+
(
1042+
source_object,
1043+
baidu_chunks,
1044+
) = await connector_service.search_baidu(
1045+
user_query=reformulated_query,
1046+
user_id=user_id,
1047+
search_space_id=search_space_id,
1048+
top_k=top_k,
1049+
)
1050+
1051+
# Add to sources and raw documents
1052+
if source_object:
1053+
all_sources.append(source_object)
1054+
all_raw_documents.extend(baidu_chunks)
1055+
1056+
# Stream found document count
1057+
if streaming_service and writer:
1058+
writer(
1059+
{
1060+
"yield_value": streaming_service.format_terminal_info_delta(
1061+
f"🇨🇳 Found {len(baidu_chunks)} Baidu Search results related to your query"
1062+
)
1063+
}
1064+
)
1065+
10401066
elif connector == "DISCORD_CONNECTOR":
10411067
(
10421068
source_object,

surfsense_backend/app/agents/researcher/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ def get_connector_emoji(connector_name: str) -> str:
4848
"DISCORD_CONNECTOR": "🗨️",
4949
"TAVILY_API": "🔍",
5050
"LINKUP_API": "🔗",
51+
"BAIDU_SEARCH_API": "🇨🇳",
5152
"GOOGLE_CALENDAR_CONNECTOR": "📅",
5253
"AIRTABLE_CONNECTOR": "🗃️",
5354
"LUMA_CONNECTOR": "✨",
@@ -72,6 +73,7 @@ def get_connector_friendly_name(connector_name: str) -> str:
7273
"DISCORD_CONNECTOR": "Discord",
7374
"TAVILY_API": "Tavily Search",
7475
"LINKUP_API": "Linkup Search",
76+
"BAIDU_SEARCH_API": "Baidu Search",
7577
"AIRTABLE_CONNECTOR": "Airtable",
7678
"LUMA_CONNECTOR": "Luma",
7779
}

surfsense_backend/app/db.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class SearchSourceConnectorType(str, Enum):
5757
TAVILY_API = "TAVILY_API"
5858
SEARXNG_API = "SEARXNG_API"
5959
LINKUP_API = "LINKUP_API"
60+
BAIDU_SEARCH_API = "BAIDU_SEARCH_API" # Baidu AI Search API for Chinese web search
6061
SLACK_CONNECTOR = "SLACK_CONNECTOR"
6162
NOTION_CONNECTOR = "NOTION_CONNECTOR"
6263
GITHUB_CONNECTOR = "GITHUB_CONNECTOR"

surfsense_backend/app/services/connector_service.py

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,226 @@ def _format_list(value: Any) -> str | None:
560560

561561
return result_object, documents
562562

563+
async def search_baidu(
    self,
    user_query: str,
    user_id: str,
    search_space_id: int,
    top_k: int = 20,
) -> tuple:
    """
    Search using Baidu AI Search API and return both sources and documents.

    Baidu AI Search performs a web search plus automatic summarization; this
    method extracts the raw search results ("references") from the API
    response rather than the generated summary text.

    Args:
        user_query: User's search query.
        user_id: ID of the user who owns the connector.
        search_space_id: Search space the connector belongs to.
        top_k: Maximum number of results to request (Baidu v2 caps at 20
            per resource type).

    Returns:
        tuple: (sources_info_dict, documents_list). On any failure the
        sources dict carries an empty "sources" list and documents_list
        is empty, so callers never need special-case error handling.
    """

    def _empty_result() -> tuple:
        # Single fallback used by every error path so the returned shape
        # stays identical everywhere.
        return {
            "id": 12,
            "name": "Baidu Search",
            "type": "BAIDU_SEARCH_API",
            "sources": [],
        }, []

    # Get Baidu connector configuration
    baidu_connector = await self.get_connector_by_type(
        user_id, SearchSourceConnectorType.BAIDU_SEARCH_API, search_space_id
    )
    if not baidu_connector:
        return _empty_result()

    config = baidu_connector.config or {}
    api_key = config.get("BAIDU_API_KEY")
    if not api_key:
        # Log only the key NAMES, never the values: connector configs can
        # hold credentials and must not leak into logs.
        print("ERROR: Baidu connector is missing BAIDU_API_KEY configuration")
        print(f"Connector config keys: {sorted(config)}")
        return _empty_result()

    # Optional configuration parameters with Baidu's documented defaults.
    model = config.get("BAIDU_MODEL", "ernie-3.5-8k")
    search_source = config.get("BAIDU_SEARCH_SOURCE", "baidu_search_v2")
    enable_deep_search = config.get("BAIDU_ENABLE_DEEP_SEARCH", False)

    # Baidu AI Search API endpoint
    baidu_endpoint = "https://qianfan.baidubce.com/v2/ai_search/chat/completions"

    # Baidu uses X-Appbuilder-Authorization instead of the standard
    # Authorization header.
    headers = {
        "X-Appbuilder-Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # Baidu v2 supports at most 20 results per resource type.
    max_per_type = min(top_k, 20)
    payload = {
        "messages": [{"role": "user", "content": user_query}],
        "model": model,
        "search_source": search_source,
        "resource_type_filter": [
            {"type": "web", "top_k": max_per_type},
            {"type": "video", "top_k": max(1, max_per_type // 4)},  # fewer videos
        ],
        "stream": False,  # non-streaming for simpler processing
        "enable_deep_search": enable_deep_search,
        "enable_corner_markers": True,  # enable reference markers
    }

    try:
        # Search + summarization can be slow; allow up to 90 seconds.
        async with httpx.AsyncClient(timeout=90.0) as client:
            response = await client.post(
                baidu_endpoint,
                headers=headers,
                json=payload,
            )
            response.raise_for_status()
    except httpx.TimeoutException as exc:
        print(f"ERROR: Baidu API request timeout after 90s: {exc!r}")
        print(f"Endpoint: {baidu_endpoint}")
        return _empty_result()
    except httpx.HTTPStatusError as exc:
        print(f"ERROR: Baidu API HTTP Status Error: {exc.response.status_code}")
        print(f"Response text: {exc.response.text[:500]}")
        print(f"Request URL: {exc.request.url}")
        return _empty_result()
    except httpx.RequestError as exc:
        print(f"ERROR: Baidu API Request Error: {type(exc).__name__}: {exc!r}")
        print(f"Endpoint: {baidu_endpoint}")
        return _empty_result()
    except Exception as exc:
        # Last-resort boundary catch: log context and degrade gracefully.
        print(f"ERROR: Unexpected error calling Baidu API: {type(exc).__name__}: {exc!r}")
        print(f"Endpoint: {baidu_endpoint}")
        print(f"Payload: {payload}")
        return _empty_result()

    try:
        data = response.json()
    except ValueError as e:
        print(f"ERROR: Failed to decode JSON response from Baidu AI Search: {e}")
        print(f"Response status: {response.status_code}")
        print(f"Response text: {response.text[:500]}")  # first 500 chars
        return _empty_result()

    # Extract references (search results) from the response.
    baidu_references = data.get("references", [])

    # Surface any embedded API error; the response may still carry results.
    if "code" in data or "message" in data:
        print(
            f"WARNING: Baidu API returned error - "
            f"Code: {data.get('code')}, Message: {data.get('message')}"
        )

    if not baidu_references:
        print("WARNING: No references found in Baidu API response")
        print(f"Response keys: {list(data.keys())}")
        return _empty_result()

    sources_list: list[dict[str, Any]] = []
    documents: list[dict[str, Any]] = []

    # source_id_counter is shared across connector searches — hold the lock
    # while assigning IDs so concurrent searches never collide.
    async with self.counter_lock:
        for reference in baidu_references:
            # Extract basic fields with safe defaults.
            title = reference.get("title", "Baidu Search Result")
            url = reference.get("url", "")
            content = reference.get("content", "")
            date = reference.get("date", "")
            ref_type = reference.get("type", "web")  # web, image, or video

            sources_list.append(
                {
                    "id": self.source_id_counter,
                    "title": title,
                    "description": content[:300] if content else "",  # cap length
                    "url": url,
                }
            )

            metadata = {
                "url": url,
                "date": date,
                "type": ref_type,
                "source": "BAIDU_SEARCH_API",
                "web_anchor": reference.get("web_anchor", ""),
                "website": reference.get("website", ""),
            }
            # Attach type-specific payloads when present.
            if ref_type == "image" and reference.get("image"):
                metadata["image"] = reference["image"]
            elif ref_type == "video" and reference.get("video"):
                metadata["video"] = reference["video"]

            documents.append(
                {
                    "chunk_id": self.source_id_counter,
                    "content": content,
                    "score": 1.0,  # Baidu does not provide relevance scores
                    "document": {
                        "id": self.source_id_counter,
                        "title": title,
                        "document_type": "BAIDU_SEARCH_API",
                        "metadata": metadata,
                    },
                }
            )
            self.source_id_counter += 1

    result_object = {
        "id": 12,
        "name": "Baidu Search",
        "type": "BAIDU_SEARCH_API",
        "sources": sources_list,
    }

    return result_object, documents
782+
563783
async def search_slack(
564784
self,
565785
user_query: str,

surfsense_backend/app/utils/validators.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,15 @@ def validate_list_field(key: str, field_name: str) -> None:
434434
},
435435
},
436436
"LINKUP_API": {"required": ["LINKUP_API_KEY"], "validators": {}},
437+
"BAIDU_SEARCH_API": {
438+
"required": ["BAIDU_API_KEY"],
439+
"optional": [
440+
"BAIDU_MODEL",
441+
"BAIDU_SEARCH_SOURCE",
442+
"BAIDU_ENABLE_DEEP_SEARCH",
443+
],
444+
"validators": {},
445+
},
437446
"SLACK_CONNECTOR": {"required": ["SLACK_BOT_TOKEN"], "validators": {}},
438447
"NOTION_CONNECTOR": {
439448
"required": ["NOTION_INTEGRATION_TOKEN"],

surfsense_web/.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ yarn-error.log*
3232

3333
# env files (can opt-in for committing if needed)
3434
.env
35+
.env.local
36+
.env*.local
37+
.env.development.local
38+
.env.test.local
39+
.env.production.local
3540

3641
# vercel
3742
.vercel

0 commit comments

Comments
 (0)