
Commit 49695ee

feat(loader): Modify the dump file format to support metadata
This will be useful for implementing incremental loading: we will know when the previous dump was taken and can search for changes since then. Other metadata about the loaded information can be added later to the new "meta" property. Also, although it is not implemented yet, and looking forward to adding support for multiple mediawiki sites, all the site-specific information now lives in a new "sites" list. For now it will contain only 1 element (1 site). All the page information that previously made up the whole file now goes into a "sites[]/pages" property, together with the mediawiki url and the number of pages. Finally, note that the indexer, when loading dump files, is able to handle both the old and the new formats transparently, at least for a good while, so all existing dumps can be updated without trouble.
1 parent 811cd88 commit 49695ee
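
For reference, a dump in the new format looks roughly like the following sketch (key names are taken from the diffs below; the timestamp, url and page entries are placeholder values only):

# Illustrative sketch of the new dump layout; all values are placeholders.
dump = {
    "meta": {
        "timestamp": "2025-01-01T00:00:00+00:00",  # when the dump was taken (ISO 8601, UTC)
        "num_sites": 1,                            # always 1 until multi-site support is implemented
    },
    "sites": [
        {
            "site_url": "https://wiki.example.org",  # placeholder; the configured mediawiki url goes here
            "num_pages": 2,
            "pages": [
                # the same page dicts that used to be the whole (old-format) file
                {"title": "Example page 1"},
                {"title": "Example page 2"},
            ],
        }
    ],
}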

5 files changed: +70 -12 lines changed


CHANGELOG.md

Lines changed: 6 additions & 1 deletion
@@ -8,9 +8,14 @@ and commits should be formatted using [Conventional Commits](https://www.convent
 
 ## [Unreleased]
 
+### Added
+
+- Loader: Modify the dump file format to accommodate metadata.
+
 ### Changed
 
-- Dependencies: Bump dependencies, most noticeably FastMCP.
+- Dependencies: Bump dependencies, most noticeably FastMCP by @stronk7 ([38bdbe6](https://github.com/moodlehq/wiki-rag/commit/38bdbe6709365408e1bfb78f0d17d147a164aafc))
+- Dependencies: Bump langchain and langgraph libs to 1.0.x
 
 ### Fixed
 

wiki_rag/index/main.py

Lines changed: 3 additions & 1 deletion
@@ -108,7 +108,9 @@ def main():
     input_file = loader_dump_path / input_candidate
 
     logger.info(f"Loading parsed pages from json: {input_file}, namespaces: {mediawiki_namespaces}")
-    pages = load_parsed_information(input_file)
+    information = load_parsed_information(input_file)
+    # TODO: Multiple site information handling should be implemented here.
+    pages = information["sites"][0]["pages"]
     logger.info(f"Loaded {len(pages)} pages from json file")
 
     temp_collection_name = f"{collection_name}_temp"
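
Once several sites are supported, the TODO above suggests the indexer would iterate over the "sites" list instead of picking the first entry; a hypothetical sketch of that loop (not part of this commit):

# Hypothetical future handling of several mediawiki sites; not implemented in this commit.
pages = []
for site in information["sites"]:
    logger.info(f"Adding {site['num_pages']} pages from {site['site_url']}")
    pages.extend(site["pages"])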

wiki_rag/index/util.py

Lines changed: 31 additions & 4 deletions
@@ -6,6 +6,7 @@
 import json
 import logging
 
+from datetime import UTC, datetime, timedelta
 from pathlib import Path
 
 from langchain_openai import OpenAIEmbeddings
@@ -24,16 +25,42 @@
 logger = logging.getLogger(__name__)
 
 
-def load_parsed_information(input_file: Path) -> list[dict]:
+def load_parsed_information(input_file: Path) -> dict:
     """Load the parsed information from the file."""
-    pages = []
+    information = []
     try:
         with open(input_file) as f:
-            pages = json.load(f)
+            information = json.load(f)
     except Exception as e:
         logger.error(f"Error loading the parsed information from {input_file}: {e}")
 
-    return pages
+    # If the old format (array of pages) is detected, let's convert it to the new format,
+    # (basic information in "meta" and pages in "sites").
+    if isinstance(information, list):
+        logger.warning(f"Old format detected in {input_file}, converting to new format.")
+        file_mod_time = datetime.fromtimestamp(input_file.stat().st_mtime, UTC)
+        two_days_ago = file_mod_time - timedelta(days=2)  # ftime -48h so we don't miss anything on incremental index.
+        information = {
+            "meta": {
+                "timestamp": two_days_ago.isoformat(),
+                "num_sites": 1,
+            },
+            "sites": [
+                {
+                    "site_url": "unknown",
+                    "num_pages": len(information),
+                    "pages": information,
+                }
+            ]
+        }
+
+    # If the loaded information is not a dictionary or is missing "meta" and "sites" properties,
+    # this is wrong and we should error. Raise an exception.
+    if not isinstance(information, dict) or "meta" not in information or "sites" not in information:
+        msg = f"Error with the format from {input_file}: missing 'meta' or 'sites' properties."
+        raise ValueError(msg)
+
+    return information
 
 
 def create_temp_collection_schema(collection_name: str, embedding_dimension: int) -> None:
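
A minimal usage sketch of the reader side, assuming a legacy dump file (the file name is hypothetical) that is still a plain JSON array of pages:

from pathlib import Path

from wiki_rag.index.util import load_parsed_information

# "old-dump.json" is assumed to be a legacy dump: a plain JSON array of page dicts.
information = load_parsed_information(Path("old-dump.json"))

# The legacy array is wrapped transparently into the new structure on load.
assert "meta" in information and "sites" in information
pages = information["sites"][0]["pages"]    # the same list the old code used directly
print(information["meta"]["timestamp"])     # file mtime minus 48h, so incremental runs miss nothing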

wiki_rag/load/main.py

Lines changed: 4 additions & 3 deletions
@@ -89,8 +89,9 @@ def main():
     if not collection_name:
         logger.error("Collection name not found in environment. Exiting.")
         sys.exit(1)
-    # File name is the collection name + toady's date and time (hours and minutes) + .json
-    dump_filename = loader_dump_path / f"{collection_name}-{datetime.now(UTC).strftime('%Y-%m-%d-%H-%M')}.json"
+    # The dump datetime is now, before starting the loading. We use also for the filename.
+    dump_datetime = datetime.now(UTC).replace(microsecond=0)
+    dump_filename = loader_dump_path / f"{collection_name}-{dump_datetime.strftime('%Y-%m-%d-%H-%M')}.json"
 
     user_agent = os.getenv("USER_AGENT")
     if not user_agent:
@@ -128,7 +129,7 @@ def main():
     logger.info(f"Parsed {len(parsed_pages)} pages.")
 
     logger.info(f"Saving parsed pages to {dump_filename}")
-    save_parsed_pages(parsed_pages, dump_filename)
+    save_parsed_pages(parsed_pages, dump_filename, dump_datetime, mediawiki_url)
 
     logger.info("wiki_rag-load finished.")
 
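
The dump file name keeps the same pattern as before (collection name plus UTC date and time down to minutes), only now built from the stored dump_datetime; a quick sketch with a hypothetical collection name:

from datetime import UTC, datetime

# Hypothetical values: collection "wikidocs", dump taken at 2025-01-01 10:30 UTC.
dump_datetime = datetime(2025, 1, 1, 10, 30, tzinfo=UTC)
print(f"wikidocs-{dump_datetime.strftime('%Y-%m-%d-%H-%M')}.json")  # wikidocs-2025-01-01-10-30.json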

wiki_rag/load/util.py

Lines changed: 26 additions & 3 deletions
@@ -10,6 +10,7 @@
 import time
 import uuid
 
+from datetime import datetime
 from pathlib import Path
 
 import requests
@@ -50,6 +51,7 @@ def get_mediawiki_pages_list(
 
     session = requests.Session()
 
+    # TODO: Check response code (200) and handle errors.
     result = session.get(url=api_url, params=params, headers=headers)
     articles = result.json()["query"]["statistics"]["articles"]
     max_chunks = (articles * len(namespaces) // chunk) + 1
@@ -72,6 +74,7 @@ def get_mediawiki_pages_list(
         if next_page:
            params["apcontinue"] = next_page
 
+        # TODO: Check response code (200) and handle errors.
        result = session.get(url=api_url, params=params, headers=headers)
        data = result.json()
        pages.extend(data["query"]["allpages"])
@@ -158,6 +161,7 @@ def fetch_and_parse_page(mediawiki_url: str, page_id: int, user_agent: str, excl
     }
 
     session = requests.Session()
+    # TODO: Check response code (200) and handle errors.
     result = session.get(url=api_url, params=params, headers=headers)
 
     id = result.json()["parse"]["pageid"]
@@ -363,8 +367,14 @@ def convert_internal_links(pages: list[dict]):
                section["relations"].append(target[0]["id"])
 
 
-def save_parsed_pages(parsed_pages: list[dict], output_file: Path) -> None:
-    """Save the whole parsed information to a JSON file for later processing."""
+def save_parsed_pages(parsed_pages: list[dict], output_file: Path, timestamp: datetime, url: str) -> None:
+    """Save the whole parsed information to a JSON file for later processing.
+
+    We also add some metadata, apart from the pages that can be useful to check dates and
+    modifications. It will be a dictionary with at least these keys:
+    - meta: A dictionary with metadata about the dump.
+    - sites: The list of mediawiki sites, each being a dict with url, num_pages and pages info list.
+    """
     class CustomEncoder(json.JSONEncoder):
         def default(self, o):
             if isinstance(o, uuid.UUID):
@@ -373,4 +383,17 @@ def default(self, o):
            return json.JSONEncoder.default(self, o)
 
     with open(output_file, "w") as f:
-        json.dump(parsed_pages, f, cls=CustomEncoder)  # Ignore, PyCharm bug, PY-73050. Works ok.
+        info = {
+            "meta": {
+                "timestamp": timestamp.isoformat(),
+                "num_sites": 1,  # TODO: Change when multiple sites are supported.
+            },
+            "sites": [
+                {
+                    "site_url": url,
+                    "num_pages": len(parsed_pages),
+                    "pages": parsed_pages,
+                }
+            ]
+        }
+        json.dump(info, f, cls=CustomEncoder)
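
Putting the writer and the reader together, a quick round-trip sketch (the pages, path and url are placeholders, and the wiki_rag package is assumed to be importable):

from datetime import UTC, datetime
from pathlib import Path

from wiki_rag.index.util import load_parsed_information
from wiki_rag.load.util import save_parsed_pages

parsed_pages = [{"title": "Example page"}]   # stand-in page dicts
out = Path("/tmp/example-dump.json")         # placeholder output path
ts = datetime.now(UTC).replace(microsecond=0)

save_parsed_pages(parsed_pages, out, ts, "https://wiki.example.org")
information = load_parsed_information(out)

assert information["meta"]["timestamp"] == ts.isoformat()
assert information["sites"][0]["num_pages"] == len(parsed_pages)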
