Skip to content

Commit 81b49e3

Browse files
committed
feat(indexer): Add JSON schema support and apply for it before indexing
1 parent 5e84ba5 commit 81b49e3

File tree

4 files changed

+112
-10
lines changed

4 files changed

+112
-10
lines changed

CHANGELOG.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,16 @@ and commits should be formatted using [Conventional Commits](https://www.convent
88

99
## [Unreleased]
1010

11+
### Added
12+
13+
- Indexer: Add JSON schema support and validate the loaded data against it before indexing
14+
1115
### Changed
1216

1317
- GitHub: Run workflows also with Python 3.14 (aka, πthon) by @stronk7 ([ffc9b17](https://github.com/moodlehq/wiki-rag/commit/ffc9b17312ee7e190050585d674d3f11510fa8f2))
14-
- Loader: Improve the generation of page head sections
15-
- Indexer: Better handling of preamble and contents on indexing
16-
- Searcher: Improve the "popularity" optimisation
18+
- Loader: Improve the generation of page head sections by @stronk7 ([a228019](https://github.com/moodlehq/wiki-rag/commit/a228019a03284aa21da8661adfd2c23dac9eee8d))
19+
- Indexer: Better handling of preamble and contents on indexing by @stronk7 ([996eeec](https://github.com/moodlehq/wiki-rag/commit/996eeecfc9d3f49ee6b9264ae77cea9fa5bb0f34))
20+
- Searcher: Improve the "popularity" optimisation by @stronk7 ([5e84ba5](https://github.com/moodlehq/wiki-rag/commit/5e84ba5039fdb02a8b5549297036cbe7c3931d03))
1721
## [0.11.2] - 2025-10-22
1822

1923
### Added

wiki_rag/index/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,11 +107,11 @@ def main():
107107
# TODO: Make this accept a CLI argument or, by default, use the last file in the directory.
108108
input_file = loader_dump_path / input_candidate
109109

110-
logger.info(f"Loading parsed pages from json: {input_file}, namespaces: {mediawiki_namespaces}")
110+
logger.info(f"Loading parsed pages from JSON: {input_file}, namespaces: {mediawiki_namespaces}")
111111
information = load_parsed_information(input_file)
112112
# TODO: Multiple site information handling should be implemented here.
113113
pages = information["sites"][0]["pages"]
114-
logger.info(f"Loaded {len(pages)} pages from json file")
114+
logger.info(f"Loaded {len(pages)} pages from JSON file")
115115

116116
temp_collection_name = f"{collection_name}_temp"
117117
logger.info(f'Preparing new temp collection "{temp_collection_name}" schema')

wiki_rag/index/util.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from datetime import UTC, datetime, timedelta
1010
from pathlib import Path
1111

12+
from jsonschema import ValidationError, validate
1213
from langchain_openai import OpenAIEmbeddings
1314
from pymilvus import (
1415
CollectionSchema,
@@ -22,6 +23,8 @@
2223

2324
import wiki_rag.index as index
2425

26+
from wiki_rag import ROOT_DIR
27+
2528
logger = logging.getLogger(__name__)
2629

2730

@@ -54,11 +57,15 @@ def load_parsed_information(input_file: Path) -> dict:
5457
]
5558
}
5659

57-
# If the loaded information is not a dictionary or is missing "meta" and "sites" properties,
58-
# this is wrong and we should error. Raise an exception.
59-
if not isinstance(information, dict) or "meta" not in information or "sites" not in information:
60-
msg = f"Error with the format from {input_file}: missing 'meta' or 'sites' properties."
61-
raise ValueError(msg)
60+
# Validate the loaded information against the JSON schema as strictly as we can.
61+
schema = json.load(open(ROOT_DIR / "wiki_rag/schema.json"))
62+
try:
63+
validate(information, schema)
64+
logger.debug("Successfully parsed the JSON information")
65+
except ValidationError as e:
66+
msg = f"Error validating the JSON information from {input_file}: {e}"
67+
logger.error(msg)
68+
exit(1)
6269

6370
return information
6471

wiki_rag/schema.json

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
{
2+
"$schema": "https://json-schema.org/draft/2020-12/schema",
3+
"$id": "https://github.com/moodlehq/wiki-rag/raw/refs/heads/main/wiki_rag/schema.json",
4+
"title": "Wiki-RAG data files schema",
5+
"type": "object",
6+
"required": ["meta", "sites"],
7+
"unevaluatedProperties": false,
8+
"properties": {
9+
"meta": {
10+
"type": "object",
11+
"required": ["timestamp", "num_sites"],
12+
"unevaluatedProperties": false,
13+
"properties": {
14+
"timestamp": { "type": "string", "format": "date-time" },
15+
"num_sites": { "type": "integer", "minimum": 0 }
16+
}
17+
},
18+
"sites": {
19+
"type": "array",
20+
"items": { "$ref": "#/$defs/site" }
21+
}
22+
},
23+
24+
"$defs": {
25+
"uuid": {
26+
"type": "string",
27+
"pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"
28+
},
29+
"uuidOrNull": {
30+
"type": ["string", "null"],
31+
"pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"
32+
},
33+
34+
"site": {
35+
"type": "object",
36+
"required": ["site_url", "num_pages", "pages"],
37+
"unevaluatedProperties": false,
38+
"properties": {
39+
"site_url": { "type": "string", "format": "uri" },
40+
"num_pages": { "type": "integer", "minimum": 0 },
41+
"pages": { "type": "array", "items": { "$ref": "#/$defs/page" } }
42+
}
43+
},
44+
45+
"page": {
46+
"type": "object",
47+
"required": ["id", "title", "sections"],
48+
"unevaluatedProperties": false,
49+
"properties": {
50+
"id": { "type": "integer" },
51+
"title": { "type": "string" },
52+
"sections": { "type": "array", "items": { "$ref": "#/$defs/section" } },
53+
"categories": { "type": "array", "items": { "type": "string" } },
54+
"templates": { "type": "array", "items": { "type": "string" } },
55+
"internal_links": { "type": "array", "items": { "type": "string" } },
56+
"external_links": { "type": "array", "items": { "type": "string" } },
57+
"language_links": { "type": "array", "items": { "type": "string" } }
58+
}
59+
},
60+
61+
"section": {
62+
"type": "object",
63+
"required": [
64+
"id", "anchor", "title", "source", "text", "all_links", "wiki_links",
65+
"index", "level", "page_id", "doc_id", "doc_title", "doc_hash",
66+
"parent", "children", "previous", "next", "relations"
67+
],
68+
"unevaluatedProperties": false,
69+
"properties": {
70+
"id": { "$ref": "#/$defs/uuid" },
71+
"anchor": { "type": "string" },
72+
"title": { "type": "string" },
73+
"source": { "type": "string", "format": "uri" },
74+
"text": { "type": "string" },
75+
"all_links": { "type": "array", "items": { "type": "string" } },
76+
"wiki_links": { "type": "array", "items": { "type": "string" } },
77+
"index": { "type": "integer", "minimum": 0 },
78+
"level": { "type": "integer", "minimum": 0 },
79+
"page_id": { "type": "integer" },
80+
"doc_id": { "$ref": "#/$defs/uuid" },
81+
"doc_title": { "type": "string" },
82+
"doc_hash": { "$ref": "#/$defs/uuid" },
83+
"parent": { "$ref": "#/$defs/uuidOrNull" },
84+
"children": { "type": "array", "items": { "$ref": "#/$defs/uuidOrNull" } },
85+
"previous": { "type": "array", "items": { "$ref": "#/$defs/uuidOrNull" } },
86+
"next": { "type": "array", "items": { "$ref": "#/$defs/uuidOrNull" } },
87+
"relations": { "type": "array", "items": { "$ref": "#/$defs/uuidOrNull" } }
88+
}
89+
}
90+
}
91+
}

0 commit comments

Comments
 (0)