
Commit 49695ee

feat(loader): Modify the dump file format to support metadata
This will be useful for implementing incremental loading: we will know when the previous dump was taken and can search for changes since then. Other metadata about the loaded information can be added later to the new "meta" property. Also, although it is not implemented yet, and looking forward to adding support for multiple mediawiki sites, all the site-specific information now lives in a new "sites" list. For now it will contain only 1 element (1 site). All the page information that previously made up the whole file now goes into a "sites[]/pages" property, together with the mediawiki url and the number of pages. Finally, note that the indexer, when loading dump files, is able to handle both the old and the new formats transparently, at least for a good while, so all existing dumps can be updated without trouble.
1 parent 811cd88 commit 49695ee
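
For reference, a dump in the new format looks roughly like the following sketch (key names are taken from the diffs below; the timestamp, url and page entries are placeholder values only):

# Illustrative sketch of the new dump layout; all values are placeholders.
dump = {
    "meta": {
        "timestamp": "2025-01-01T00:00:00+00:00",  # when the dump was taken (ISO 8601, UTC)
        "num_sites": 1,                            # always 1 until multi-site support is implemented
    },
    "sites": [
        {
            "site_url": "https://wiki.example.org",  # placeholder; the configured mediawiki url goes here
            "num_pages": 2,
            "pages": [
                # the same page dicts that used to be the whole (old-format) file
                {"title": "Example page 1"},
                {"title": "Example page 2"},
            ],
        }
    ],
}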

5 files changed: +70 -12 lines changed


CHANGELOG.md

Lines changed: 6 additions & 1 deletion
@@ -8,9 +8,14 @@ and commits should be formatted using [Conventional Commits](https://www.convent
 
 ## [Unreleased]
 
+### Added
+
+- Loader: Modify the dump file format to accommodate metadata.
+
 ### Changed
 
-- Dependencies: Bump dependencies, most noticeably FastMCP.
+- Dependencies: Bump dependencies, most noticeably FastMCP by @stronk7 ([38bdbe6](https://github.com/moodlehq/wiki-rag/commit/38bdbe6709365408e1bfb78f0d17d147a164aafc))
+- Dependencies: Bump langchain and langgraph libs to 1.0.x
 
 ### Fixed
 

wiki_rag/index/main.py

Lines changed: 3 additions & 1 deletion
@@ -108,7 +108,9 @@ def main():
     input_file = loader_dump_path / input_candidate
 
     logger.info(f"Loading parsed pages from json: {input_file}, namespaces: {mediawiki_namespaces}")
-    pages = load_parsed_information(input_file)
+    information = load_parsed_information(input_file)
+    # TODO: Multiple site information handling should be implemented here.
+    pages = information["sites"][0]["pages"]
     logger.info(f"Loaded {len(pages)} pages from json file")
 
     temp_collection_name = f"{collection_name}_temp"
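
Once several sites are supported, the TODO above suggests the indexer would iterate over the "sites" list instead of picking the first entry; a hypothetical sketch of that loop (not part of this commit):

# Hypothetical future handling of several mediawiki sites; not implemented in this commit.
pages = []
for site in information["sites"]:
    logger.info(f"Adding {site['num_pages']} pages from {site['site_url']}")
    pages.extend(site["pages"])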

wiki_rag/index/util.py

Lines changed: 31 additions & 4 deletions
@@ -6,6 +6,7 @@
 import json
 import logging
 
+from datetime import UTC, datetime, timedelta
 from pathlib import Path
 
 from langchain_openai import OpenAIEmbeddings
@@ -24,16 +25,42 @@
 logger = logging.getLogger(__name__)
 
 
-def load_parsed_information(input_file: Path) -> list[dict]:
+def load_parsed_information(input_file: Path) -> dict:
     """Load the parsed information from the file."""
-    pages = []
+    information = []
     try:
         with open(input_file) as f:
-            pages = json.load(f)
+            information = json.load(f)
     except Exception as e:
         logger.error(f"Error loading the parsed information from {input_file}: {e}")
 
-    return pages
+    # If the old format (array of pages) is detected, let's convert it to the new format,
+    # (basic information in "meta" and pages in "sites").
+    if isinstance(information, list):
+        logger.warning(f"Old format detected in {input_file}, converting to new format.")
+        file_mod_time = datetime.fromtimestamp(input_file.stat().st_mtime, UTC)
+        two_days_ago = file_mod_time - timedelta(days=2)  # ftime -48h so we don't miss anything on incremental index.
+        information = {
+            "meta": {
+                "timestamp": two_days_ago.isoformat(),
+                "num_sites": 1,
+            },
+            "sites": [
+                {
+                    "site_url": "unknown",
+                    "num_pages": len(information),
+                    "pages": information,
+                }
+            ]
+        }
+
+    # If the loaded information is not a dictionary or is missing "meta" and "sites" properties,
+    # this is wrong and we should error. Raise an exception.
+    if not isinstance(information, dict) or "meta" not in information or "sites" not in information:
+        msg = f"Error with the format from {input_file}: missing 'meta' or 'sites' properties."
+        raise ValueError(msg)
+
+    return information
 
 
 def create_temp_collection_schema(collection_name: str, embedding_dimension: int) -> None:
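
A minimal usage sketch of the reader side, assuming a legacy dump file (the file name is hypothetical) that is still a plain JSON array of pages:

from pathlib import Path

from wiki_rag.index.util import load_parsed_information

# "old-dump.json" is assumed to be a legacy dump: a plain JSON array of page dicts.
information = load_parsed_information(Path("old-dump.json"))

# The legacy array is wrapped transparently into the new structure on load.
assert "meta" in information and "sites" in information
pages = information["sites"][0]["pages"]    # the same list the old code used directly
print(information["meta"]["timestamp"])     # file mtime minus 48h, so incremental runs miss nothing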

wiki_rag/load/main.py

Lines changed: 4 additions & 3 deletions
@@ -89,8 +89,9 @@ def main():
     if not collection_name:
         logger.error("Collection name not found in environment. Exiting.")
         sys.exit(1)
-    # File name is the collection name + toady's date and time (hours and minutes) + .json
-    dump_filename = loader_dump_path / f"{collection_name}-{datetime.now(UTC).strftime('%Y-%m-%d-%H-%M')}.json"
+    # The dump datetime is now, before starting the loading. We use also for the filename.
+    dump_datetime = datetime.now(UTC).replace(microsecond=0)
+    dump_filename = loader_dump_path / f"{collection_name}-{dump_datetime.strftime('%Y-%m-%d-%H-%M')}.json"
 
     user_agent = os.getenv("USER_AGENT")
     if not user_agent:
@@ -128,7 +129,7 @@ def main():
     logger.info(f"Parsed {len(parsed_pages)} pages.")
 
     logger.info(f"Saving parsed pages to {dump_filename}")
-    save_parsed_pages(parsed_pages, dump_filename)
+    save_parsed_pages(parsed_pages, dump_filename, dump_datetime, mediawiki_url)
 
     logger.info("wiki_rag-load finished.")
 
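
The dump file name keeps the same pattern as before (collection name plus UTC date and time down to minutes), only now built from the stored dump_datetime; a quick sketch with a hypothetical collection name:

from datetime import UTC, datetime

# Hypothetical values: collection "wikidocs", dump taken at 2025-01-01 10:30 UTC.
dump_datetime = datetime(2025, 1, 1, 10, 30, tzinfo=UTC)
print(f"wikidocs-{dump_datetime.strftime('%Y-%m-%d-%H-%M')}.json")  # wikidocs-2025-01-01-10-30.json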

wiki_rag/load/util.py

Lines changed: 26 additions & 3 deletions
@@ -10,6 +10,7 @@
 import time
 import uuid
 
+from datetime import datetime
 from pathlib import Path
 
 import requests
@@ -50,6 +51,7 @@ def get_mediawiki_pages_list(
 
     session = requests.Session()
 
+    # TODO: Check response code (200) and handle errors.
     result = session.get(url=api_url, params=params, headers=headers)
     articles = result.json()["query"]["statistics"]["articles"]
     max_chunks = (articles * len(namespaces) // chunk) + 1
@@ -72,6 +74,7 @@ def get_mediawiki_pages_list(
         if next_page:
            params["apcontinue"] = next_page
 
+        # TODO: Check response code (200) and handle errors.
        result = session.get(url=api_url, params=params, headers=headers)
        data = result.json()
        pages.extend(data["query"]["allpages"])
@@ -158,6 +161,7 @@ def fetch_and_parse_page(mediawiki_url: str, page_id: int, user_agent: str, excl
     }
 
     session = requests.Session()
+    # TODO: Check response code (200) and handle errors.
     result = session.get(url=api_url, params=params, headers=headers)
 
     id = result.json()["parse"]["pageid"]
@@ -363,8 +367,14 @@ def convert_internal_links(pages: list[dict]):
                section["relations"].append(target[0]["id"])
 
 
-def save_parsed_pages(parsed_pages: list[dict], output_file: Path) -> None:
-    """Save the whole parsed information to a JSON file for later processing."""
+def save_parsed_pages(parsed_pages: list[dict], output_file: Path, timestamp: datetime, url: str) -> None:
+    """Save the whole parsed information to a JSON file for later processing.
+
+    We also add some metadata, apart from the pages that can be useful to check dates and
+    modifications. It will be a dictionary with at least these keys:
+    - meta: A dictionary with metadata about the dump.
+    - sites: The list of mediawiki sites, each being a dict with url, num_pages and pages info list.
+    """
     class CustomEncoder(json.JSONEncoder):
         def default(self, o):
             if isinstance(o, uuid.UUID):
@@ -373,4 +383,17 @@ def default(self, o):
            return json.JSONEncoder.default(self, o)
 
     with open(output_file, "w") as f:
-        json.dump(parsed_pages, f, cls=CustomEncoder)  # Ignore, PyCharm bug, PY-73050. Works ok.
+        info = {
+            "meta": {
+                "timestamp": timestamp.isoformat(),
+                "num_sites": 1,  # TODO: Change when multiple sites are supported.
+            },
+            "sites": [
+                {
+                    "site_url": url,
+                    "num_pages": len(parsed_pages),
+                    "pages": parsed_pages,
+                }
+            ]
+        }
+        json.dump(info, f, cls=CustomEncoder)
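
Putting the writer and the reader together, a quick round-trip sketch (the pages, path and url are placeholders, and the wiki_rag package is assumed to be importable):

from datetime import UTC, datetime
from pathlib import Path

from wiki_rag.index.util import load_parsed_information
from wiki_rag.load.util import save_parsed_pages

parsed_pages = [{"title": "Example page"}]   # stand-in page dicts
out = Path("/tmp/example-dump.json")         # placeholder output path
ts = datetime.now(UTC).replace(microsecond=0)

save_parsed_pages(parsed_pages, out, ts, "https://wiki.example.org")
information = load_parsed_information(out)

assert information["meta"]["timestamp"] == ts.isoformat()
assert information["sites"][0]["num_pages"] == len(parsed_pages)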
