diff --git a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py index d56c830e8..bf42d3865 100644 --- a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py +++ b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py @@ -47,62 +47,52 @@ def __init__(self, llm_config_object): # Note: Users should modify the prompt(examples) according to the real schema and text (property_graph_extract.py) extract_graph_prompt_EN: str = """## Main Task -Given the following graph schema and a piece of text, your task is to analyze the text and extract information that fits into the schema's structure, formatting the information into vertices and edges as specified. +Extract only the vertices and edges that are supported by the given graph schema and input text. Return valid JSON only. -## Basic Rules: -### Schema Format: -Graph Schema: -- "vertices": [List of vertex labels and their properties] -- "edges": [List of edge labels, their source and target vertex labels, and properties] - -### Content Rule: -Please read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema. -You are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword. 
-For each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures: - -#### Vertex Format: -{"id":"vertexLabelID:entityName","label":"vertexLabel","type":"vertex","properties":{"propertyName":"propertyValue", ...}} - -where: - - "vertexLabelID": int - - "vertexLabel": str - - "entityName": str - - "type": "vertex" - - "properties": dict - -#### Edge Format: -{"id":"vertexlabelID:pk1!pk2!pk3", label":"edgeLabel","type":"edge","outV":"sourceVertexId","outVLabel":"sourceVertexLabel","inV":"targetVertexId","inVLabel":"targetVertexLabel","properties":{"propertyName":"propertyValue",...}} - -where: - - "id": int or str (conditional) (optional) - - "edgeLabel": str - - "type": "edge" - - "outV": str - - "outVLabel": str - - "inV": str - - "inVLabel": str - - "properties": dict - - "sourceVertexId": "vertexLabelID:entityName" - - "targetVertexId": "vertexLabelID:entityName" - -Strictly follow these rules: -1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information. -2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean). -3. If there are multiple primary keys, the strategy for generating VID is: vertexlabelID:pk1!pk2!pk3 (pk means primary key, and '!' is the separator). This id must be generated ONLY if there are multiple primary keys. If there is only one primary key, the strategy for generating VID is: int (sequencially increasing). -4. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema. -5. 
Translate the schema fields into Chinese if the given text input is Chinese (Optional) - -Refer to the following baseline example to understand the output generation requirements: -## Example: -### Input example: -#### text: -Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, in his professional life, works as a journalist. - -#### graph schema example: -{"vertices":[{"vertex_label":"person","properties":["name","age","occupation"]}], "edges":[{"edge_label":"roommate", "source_vertex_label":"person","target_vertex_label":"person","properties":["date"]]} - -### Output example: -{"vertices":[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"attorney"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"journalist"}}], "edges":[{"id": 1, "label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]}""" +## Schema Contract +The Graph schema uses this shape: +- vertexlabels[]: each vertex label has "id", "name", "primary_keys", "properties", and optional "nullable_keys". +- edgelabels[]: each edge label has "name", "source_label", "target_label", and "properties". +- propertykeys[]: each property key has "name", "data_type", and "cardinality". + +## Output Contract +Return exactly one JSON object: {"vertices": [...], "edges": [...]} + +Vertex object: +{"id":"vertex id","label":"vertex label","properties":{"propertyName":"propertyValue", ...}} + +Edge object: +{"label":"edge label","outV":"source vertex id","outVLabel":"source vertex label","inV":"target vertex id","inVLabel":"target vertex label","properties":{"propertyName":"propertyValue", ...}} + +## Deterministic Vertex ID Rules +For every vertex, first find the schema entry where vertexlabels[].name equals the output label. 
+- vertexLabelID must be taken from that schema entry's vertexlabels[].id. Never invent it from the label text. +- If primary_keys has exactly one key: id = "{vertexLabelID}:{properties.<primary_key>}". +- If primary_keys has multiple keys: id = "{vertexLabelID}:{properties.<pk1>}!{properties.<pk2>}" in the same order as schema primary_keys. +- Never use label names such as "person:Sarah" as vertex ids when schema gives a numeric vertex label id. + +## Edge Reference Rules +- outV and inV must exactly match the id of vertices in the same output. +- outVLabel/inVLabel must match the corresponding source/target vertex label. +- Only output an edge if both endpoint vertices are also present in vertices. +- Do not create an edge label that is not present in edgelabels[]. + +## Extraction Rules +1. Do not extract labels or properties that are absent from the schema. +2. Do not translate schema field names, labels, or property keys. Keep schema names exactly as provided. +3. Preserve property data types according to propertykeys[]; for example, INT stays number and BOOLEAN stays boolean. +4. Remove empty properties. Do not invent missing facts. +5. Output JSON only; no Markdown fences, prose, comments, or trailing text. + +## Example +Input text: +Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James works as a journalist. 
+ +Graph schema example: +{"vertexlabels":[{"id":1,"name":"person","primary_keys":["name"],"properties":["name","age","occupation"],"nullable_keys":["age","occupation"]}],"edgelabels":[{"name":"roommate","source_label":"person","target_label":"person","properties":["date"]}],"propertykeys":[{"name":"name","data_type":"TEXT","cardinality":"SINGLE"},{"name":"age","data_type":"INT","cardinality":"SINGLE"},{"name":"occupation","data_type":"TEXT","cardinality":"SINGLE"},{"name":"date","data_type":"TEXT","cardinality":"SINGLE"}]} + +Output: +{"vertices":[{"id":"1:Sarah","label":"person","properties":{"name":"Sarah","age":30,"occupation":"attorney"}},{"id":"1:James","label":"person","properties":{"name":"James","occupation":"journalist"}}],"edges":[{"label":"roommate","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]}""" graph_schema: str = """{ "vertexlabels": [ @@ -275,40 +265,52 @@ def __init__(self, llm_config_object): """ extract_graph_prompt_CN: str = """## 主要任务 -根据以下图谱和一段文本,你的任务是分析文本并提取符合模式结构的信息,将信息格式化为顶点和边。 - -## 基本规则 -### 模式格式 -图谱模式: -- 顶点:[顶点标签及其属性列表] -- 边:[边标签、源顶点标签、目标顶点标签及其属性列表] - -### 内容规则 -请仔细阅读提供的文本,识别与模式中定义的顶点和边相对应的信息。对于每一条匹配顶点或边的信息,按以下 JSON 结构格式化: - -#### 顶点格式: -{"id":"顶点标签 ID:实体名称","label":"顶点标签","type":"vertex","properties":{"属性名":"属性值", ...}} - -#### 边格式: -{"label":"边标签","type":"edge","outV":"源顶点 ID","outVLabel":"源顶点标签","inV":"目标顶点 ID","inVLabel":"目标顶点标签","properties":{"属性名":"属性值",...}} - -同时遵循以下规则: -1. 不要提取给定模式中不存在的属性字段或标签 -2. 确保提取的属性集与给定模式类型一致(如'age'应为数字,'select'应为布尔值) -3. 如果有多个主键,生成 VID 的策略是:顶点标签 ID:pk1!pk2!pk3(pk 表示主键,'!'是分隔符) -4. 以 JSON 格式输出,仅包含顶点和边,移除空属性,基于文本/规则和模式提取和格式化 -5. 
如果给定文本为中文但模式为英文,则将模式字段翻译成中文(可选) +只抽取输入文本和给定图谱 schema 共同支持的顶点与边。只返回合法 JSON。 + +## Schema 契约 +图谱 schema 使用以下结构: +- vertexlabels[]:每个顶点标签包含 "id"、"name"、"primary_keys"、"properties",以及可选的 "nullable_keys"。 +- edgelabels[]:每个边标签包含 "name"、"source_label"、"target_label"、"properties"。 +- propertykeys[]:每个属性包含 "name"、"data_type"、"cardinality"。 + +## 输出契约 +必须返回唯一 JSON 对象:{"vertices": [...], "edges": [...]} + +顶点对象: +{"id":"顶点 id","label":"顶点标签","properties":{"属性名":"属性值", ...}} + +边对象: +{"label":"边标签","outV":"源顶点 id","outVLabel":"源顶点标签","inV":"目标顶点 id","inVLabel":"目标顶点标签","properties":{"属性名":"属性值", ...}} + +## 确定性顶点 ID 规则 +对每个顶点,先找到 schema 中 vertexlabels[].name 等于输出 label 的条目。 +- vertexLabelID 必须取自该 schema 条目的 vertexlabels[].id,不能从标签文本猜测。 +- 如果 primary_keys 只有一个字段:id = "{vertexLabelID}:{properties.<primary_key>}"。 +- 如果 primary_keys 有多个字段:id = "{vertexLabelID}:{properties.<pk1>}!{properties.<pk2>}",顺序必须与 schema primary_keys 一致。 +- 当 schema 提供数字顶点标签 id 时,不要使用 "person:Sarah" 这样的标签名作为顶点 id。 + +## 边引用规则 +- outV 和 inV 必须严格等于本次输出 vertices 中的 id。 +- outVLabel/inVLabel 必须分别匹配对应源/目标顶点标签。 +- 只有当两个端点顶点都出现在 vertices 中时,才输出该边。 +- 不要输出 edgelabels[] 中不存在的边标签。 + +## 抽取规则 +1. 不要抽取 schema 中不存在的标签或属性。 +2. 不要翻译 schema 字段名、标签名或属性 key,必须与 schema 原文完全一致。 +3. 根据 propertykeys[] 保持属性类型,例如 INT 保持数字,BOOLEAN 保持布尔值。 +4. 移除空属性。不要编造缺失事实。 +5. 
只输出 JSON;不要输出 Markdown 代码块、解释文本、注释或尾随文本。 ## 示例 -### 输入示例: -#### 文本 -认识 Sarah,一位 30 岁的律师,和她的室友 James,他们从 2010 年开始合住。James 在职业生活中是一名记者。 +输入文本: +认识 Sarah,一位 30 岁的律师,和她的室友 James,他们从 2010 年开始合住。James 是一名记者。 -#### 图谱模式 -{"vertices":[{"vertex_label":"person","properties":["name","age","occupation"]}], "edges":[{"edge_label":"roommate", "source_vertex_label":"person","target_vertex_label":"person","properties":["date"]]} +图谱 schema 示例: +{"vertexlabels":[{"id":1,"name":"person","primary_keys":["name"],"properties":["name","age","occupation"],"nullable_keys":["age","occupation"]}],"edgelabels":[{"name":"roommate","source_label":"person","target_label":"person","properties":["date"]}],"propertykeys":[{"name":"name","data_type":"TEXT","cardinality":"SINGLE"},{"name":"age","data_type":"INT","cardinality":"SINGLE"},{"name":"occupation","data_type":"TEXT","cardinality":"SINGLE"},{"name":"date","data_type":"TEXT","cardinality":"SINGLE"}]} -### 输出示例: -[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"律师"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"记者"}},{"label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}] +输出: +{"vertices":[{"id":"1:Sarah","label":"person","properties":{"name":"Sarah","age":30,"occupation":"律师"}},{"id":"1:James","label":"person","properties":{"name":"James","occupation":"记者"}}],"edges":[{"label":"roommate","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]} """ gremlin_generate_prompt_CN: str = """ diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py index d464b80ef..daade304d 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py +++ 
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py @@ -81,6 +81,7 @@ def load_into_graph(self, vertices, edges, schema): # pylint: disable=too-many- vertex_label_map = {v_label["name"]: v_label for v_label in schema["vertexlabels"]} edge_label_map = {e_label["name"]: e_label for e_label in schema["edgelabels"]} property_label_map = {p_label["name"]: p_label for p_label in schema["propertykeys"]} + vid_mapping = {} # mapping from LLM-generated vertex ID to actual server vertex ID for vertex in vertices: input_label = vertex["label"] @@ -146,12 +147,24 @@ def load_into_graph(self, vertices, edges, schema): # pylint: disable=too-many- continue # TODO: we could try batch add vertices first, setback to single-mode if failed - vid = self._handle_graph_creation(self.client.graph().addVertex, input_label, input_properties).id + original_id = vertex.get("id") + if vertex_label.get("id_strategy") == "CUSTOMIZE_STRING" and original_id: + result = self._handle_graph_creation( + self.client.graph().addVertex, + input_label, + input_properties, + id=original_id, + ) + else: + result = self._handle_graph_creation(self.client.graph().addVertex, input_label, input_properties) + vid = result.id vertex["id"] = vid + if original_id: + vid_mapping[original_id] = vid for edge in edges: - start = edge["outV"] - end = edge["inV"] + start = vid_mapping.get(edge.get("outV"), edge.get("outV")) + end = vid_mapping.get(edge.get("inV"), edge.get("inV")) label = edge["label"] properties = edge["properties"] diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py index 31411d969..ec4f7f332 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py +++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py @@ -120,6 +120,94 @@ def extract_property_graph_by_llm(self, schema, chunk): prompt = self.example_prompt + 
prompt return self.llm.generate(prompt=prompt) + @staticmethod + def _primary_key_id(vertex_label, properties): + id_strategy = vertex_label.get("id_strategy") + if id_strategy and str(id_strategy).upper() != "PRIMARY_KEY": + return None + primary_keys = vertex_label.get("primary_keys", []) + if not primary_keys or "id" not in vertex_label: + return None + values = [] + for key in primary_keys: + value = properties.get(key) + if value is None or value == "": + return None + values.append(str(value)) + return f"{vertex_label['id']}:{'!'.join(values)}" + + def _normalize_vertices(self, vertices, vertex_label_map): + vertex_id_map = {} + normalized_vertices = [] + for vertex in vertices: + label = vertex["label"] + properties = vertex["properties"] + canonical_id = self._primary_key_id(vertex_label_map[label], properties) + original_id = vertex.get("id") + if canonical_id is None: + if original_id: + vertex_id_map[(label, original_id)] = original_id + normalized_vertices.append(vertex) + continue + + vertex["id"] = canonical_id + vertex_id_map[(label, canonical_id)] = canonical_id + if original_id: + vertex_id_map[(label, original_id)] = canonical_id + normalized_vertices.append(vertex) + return normalized_vertices, vertex_id_map + + def _resolve_endpoint(self, edge, endpoint_key, label_key, legacy_key, vertex_label_map, vertex_id_map): + endpoint = edge.get(endpoint_key) + label = edge.get(label_key) + if endpoint and label: + return vertex_id_map.get((label, endpoint)), label + + legacy_endpoint = edge.get(legacy_key) + if not isinstance(legacy_endpoint, dict): + return None, label + + label = legacy_endpoint.get("label") + properties = legacy_endpoint.get("properties", {}) + if label not in vertex_label_map: + return None, label + canonical_id = self._primary_key_id(vertex_label_map[label], properties) + return vertex_id_map.get((label, canonical_id)), label + + def _normalize_edges(self, edges, edge_label_map, vertex_label_map, vertex_id_map): + normalized_edges = 
[] + for edge in edges: + edge_label = edge_label_map[edge["label"]] + out_v, out_v_label = self._resolve_endpoint( + edge, + "outV", + "outVLabel", + "source", + vertex_label_map, + vertex_id_map, + ) + in_v, in_v_label = self._resolve_endpoint( + edge, + "inV", + "inVLabel", + "target", + vertex_label_map, + vertex_id_map, + ) + if not out_v or not in_v: + log.warning("Invalid edge endpoints '%s' have been ignored.", edge) + continue + if out_v_label != edge_label.get("source_label") or in_v_label != edge_label.get("target_label"): + log.warning("Invalid edge endpoint labels '%s' have been ignored.", edge) + continue + + edge["outV"] = out_v + edge["outVLabel"] = out_v_label + edge["inV"] = in_v + edge["inVLabel"] = in_v_label + normalized_edges.append(edge) + return normalized_edges + def _extract_and_filter_label(self, schema, text) -> List[Dict[str, Any]]: # Strip markdown code blocks (e.g. ```json ... ```) text = re.sub(r"```\w*\n?", "", text) @@ -147,19 +235,25 @@ def _extract_and_filter_label(self, schema, text) -> List[Dict[str, Any]]: return items # Create sets for valid vertex and edge labels based on the schema - vertex_label_set = {vertex["name"] for vertex in schema["vertexlabels"]} - edge_label_set = {edge["name"] for edge in schema["edgelabels"]} + vertex_label_map = {vertex["name"]: vertex for vertex in schema["vertexlabels"]} + edge_label_map = {edge["name"]: edge for edge in schema["edgelabels"]} + vertex_label_set = set(vertex_label_map) + edge_label_set = set(edge_label_map) def process_items(item_list, valid_labels, item_type): + parsed_items = [] for item in item_list: if not isinstance(item, dict): log.warning("Invalid property graph item type '%s'.", type(item)) continue + item = dict(item) + item_type_value = item.get("type", item_type) + item["type"] = item_type_value if not self.NECESSARY_ITEM_KEYS.issubset(item.keys()): log.warning("Invalid item keys '%s'.", item.keys()) continue - if item["type"] != item_type: - log.warning("Invalid %s 
type '%s' has been ignored.", item_type, item["type"]) + if item_type_value != item_type: + log.warning("Invalid %s type '%s' has been ignored.", item_type, item_type_value) continue if item["label"] not in valid_labels: log.warning( @@ -168,10 +262,14 @@ def process_items(item_list, valid_labels, item_type): item["label"], ) continue - items.append(item) + parsed_items.append(item) + return parsed_items - process_items(property_graph["vertices"], vertex_label_set, "vertex") - process_items(property_graph["edges"], edge_label_set, "edge") + vertex_items = process_items(property_graph["vertices"], vertex_label_set, "vertex") + vertices, vertex_id_map = self._normalize_vertices(vertex_items, vertex_label_map) + edge_items = process_items(property_graph["edges"], edge_label_set, "edge") + edges = self._normalize_edges(edge_items, edge_label_map, vertex_label_map, vertex_id_map) + items = vertices + edges except json.JSONDecodeError: log.critical("Invalid property graph JSON! Please check the extracted JSON data carefully") return items diff --git a/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json b/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json index f3bd33c37..3e7b17f44 100644 --- a/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json +++ b/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json @@ -3,24 +3,24 @@ "name": "Official Person-Relationship Extraction", "description": "A standard template for extracting Person and Webpage entities, along with their relationships (Roommate, Owns), from descriptive text.", "text": "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, in his professional life, works as a journalist. 
Additionally, Sarah is the proud owner of the website www.sarahsplace.com.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of text, your task is to analyze the text and extract information that fits into the schema's structure, formatting the information into vertices and edges as specified.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"type\":\"vertex\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"type\":\"edge\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. If there are multiple primary keys, the strategy for generating VID is: vertexlabelID:pk1!pk2!pk3 (pk means primary key, and '!' is the separator). This id must be generated ONLY if there are multiple primary keys. 
If there is only one primary key, the strategy for generating VID is: int (sequencially increasing).\n4. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n5. Translate the schema fields into Chinese if the given text input is Chinese (Optional)\n\n## Example:\n### Input example:\n#### text:\nMeet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, in his professional life, works as a journalist.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"person\",\"properties\":[\"name\",\"age\",\"occupation\"]}], \"edges\":[{\"edge_label\":\"roommate\", \"source_vertex_label\":\"person\",\"target_vertex_label\":\"person\",\"properties\":[\"date\"]}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Sarah\",\"label\":\"person\",\"type\":\"vertex\",\"properties\":{\"name\":\"Sarah\",\"age\":30,\"occupation\":\"attorney\"}},{\"id\":\"1:James\",\"label\":\"person\",\"type\":\"vertex\",\"properties\":{\"name\":\"James\",\"occupation\":\"journalist\"}}], \"edges\":[{\"label\":\"roommate\",\"type\":\"edge\",\"outV\":\"1:Sarah\",\"outVLabel\":\"person\",\"inV\":\"1:James\",\"inVLabel\":\"person\",\"properties\":{\"date\":\"2010\"}}]}" + "prompt": "## Main Task\nExtract only the vertices and edges supported by the given graph schema and input text. Return valid JSON only.\n\n## Schema Contract\nThe graph schema uses vertexlabels[], edgelabels[], and propertykeys[]. 
Use vertexlabels[].id and primary_keys to build deterministic vertex ids.\n\n## Output Contract\nReturn exactly one JSON object: {\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex id\",\"label\":\"vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\nEdge object: {\"label\":\"edge label\",\"outV\":\"source vertex id\",\"outVLabel\":\"source vertex label\",\"inV\":\"target vertex id\",\"inVLabel\":\"target vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n## Vertex ID Rules\n- If primary_keys has one key: id = \"{vertexLabelID}:{properties.<primary_key>}\".\n- If primary_keys has multiple keys: id = \"{vertexLabelID}:{properties.<pk1>}!{properties.<pk2>}\" in schema primary-key order.\n- Never use label names such as \"person:Sarah\" when schema gives a numeric vertex label id.\n\n## Extraction Rules\n- Do not extract labels or properties absent from the schema.\n- Do not translate schema field names, labels, or property keys.\n- Preserve property data types according to propertykeys[].\n- Only output an edge if outV and inV reference vertices in the same output.\n- Output JSON only; no Markdown fences, prose, comments, or trailing text.\n\n## Example\nInput text:\nMeet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. 
James works as a journalist.\n\nGraph schema example:\n{\"vertexlabels\":[{\"id\":1,\"name\":\"person\",\"primary_keys\":[\"name\"],\"properties\":[\"name\",\"age\",\"occupation\"],\"nullable_keys\":[\"age\",\"occupation\"]}],\"edgelabels\":[{\"name\":\"roommate\",\"source_label\":\"person\",\"target_label\":\"person\",\"properties\":[\"date\"]}],\"propertykeys\":[{\"name\":\"name\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"age\",\"data_type\":\"INT\",\"cardinality\":\"SINGLE\"},{\"name\":\"occupation\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"date\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"}]}\n\nOutput:\n{\"vertices\":[{\"id\":\"1:Sarah\",\"label\":\"person\",\"properties\":{\"name\":\"Sarah\",\"age\":30,\"occupation\":\"attorney\"}},{\"id\":\"1:James\",\"label\":\"person\",\"properties\":{\"name\":\"James\",\"occupation\":\"journalist\"}}],\"edges\":[{\"label\":\"roommate\",\"outV\":\"1:Sarah\",\"outVLabel\":\"person\",\"inV\":\"1:James\",\"inVLabel\":\"person\",\"properties\":{\"date\":\"2010\"}}]}" }, { "name": "Traffic Accident Element Extraction", "description": "Extracts key elements from a traffic accident report, including persons involved, vehicles, and responsibilities.", "text": "On March 15, 2024, John Smith, driving a red Porsche with license plate NY-88888, collided with a scooter ridden by Mike Lee at the intersection of People's Road and Liberation Road. The collision resulted in a fracture in Mike Lee's right leg. 
The traffic police determined that John Smith was fully responsible for running a red light.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of text about a traffic accident, your task is to extract information that fits into the schema's structure, formatting the information into vertices and edges as specified.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"type\":\"vertex\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"type\":\"edge\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. 
Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nOn March 15, 2024, John Smith, driving a red Porsche with license plate NY-88888, collided with a scooter ridden by Mike Lee at the intersection of People's Road and Liberation Road. The collision resulted in a fracture in Mike Lee's right leg. The traffic police determined that John Smith was fully responsible for running a red light.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Person\",\"properties\":[\"name\",\"injury_level\"]},{\"vertex_label\":\"Vehicle\",\"properties\":[\"license_plate\",\"type\",\"color\"]},{\"vertex_label\":\"Accident\",\"properties\":[\"date\",\"location\",\"responsible_party\"]}], \"edges\":[{\"edge_label\":\"drives\",\"source_label\":\"Person\",\"target_label\":\"Vehicle\"},{\"edge_label\":\"participated_in\",\"source_label\":\"Person\",\"target_label\":\"Accident\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:John Smith\",\"label\":\"Person\",\"type\":\"vertex\",\"properties\":{\"name\":\"John Smith\"}},{\"id\":\"1:Mike Lee\",\"label\":\"Person\",\"type\":\"vertex\",\"properties\":{\"name\":\"Mike Lee\",\"injury_level\":\"right leg fracture\"}},{\"id\":\"2:NY-88888\",\"label\":\"Vehicle\",\"type\":\"vertex\",\"properties\":{\"license_plate\":\"NY-88888\",\"type\":\"Porsche\",\"color\":\"red\"}}],\"edges\":[{\"label\":\"drives\",\"type\":\"edge\",\"outV\":\"1:John Smith\",\"outVLabel\":\"Person\",\"inV\":\"2:NY-88888\",\"inVLabel\":\"Vehicle\",\"properties\":{}}]}" + "prompt": "## Main Task\nExtract only the vertices and edges supported by the given graph schema and input text. Return valid JSON only.\n\n## Schema Contract\nThe graph schema uses vertexlabels[], edgelabels[], and propertykeys[]. 
Use vertexlabels[].id and primary_keys to build deterministic vertex ids.\n\n## Output Contract\nReturn exactly one JSON object: {\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex id\",\"label\":\"vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\nEdge object: {\"label\":\"edge label\",\"outV\":\"source vertex id\",\"outVLabel\":\"source vertex label\",\"inV\":\"target vertex id\",\"inVLabel\":\"target vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n## Vertex ID Rules\n- If primary_keys has one key: id = \"{vertexLabelID}:{properties.<primary_key>}\".\n- If primary_keys has multiple keys: id = \"{vertexLabelID}:{properties.<pk1>}!{properties.<pk2>}\" in schema primary-key order.\n- Never use label names such as \"person:Sarah\" when schema gives a numeric vertex label id.\n\n## Extraction Rules\n- Do not extract labels or properties absent from the schema.\n- Do not translate schema field names, labels, or property keys.\n- Preserve property data types according to propertykeys[].\n- Only output an edge if outV and inV reference vertices in the same output.\n- Output JSON only; no Markdown fences, prose, comments, or trailing text.\n\n## Example\nInput text:\nOn March 15, 2024, John Smith drove a red Porsche with license plate NY-88888 and collided with a scooter ridden by Mike Lee. 
Mike Lee suffered a right leg fracture.\n\nGraph schema example:\n{\"vertexlabels\":[{\"id\":1,\"name\":\"Person\",\"primary_keys\":[\"name\"],\"properties\":[\"name\",\"injury_level\"],\"nullable_keys\":[\"injury_level\"]},{\"id\":2,\"name\":\"Vehicle\",\"primary_keys\":[\"license_plate\"],\"properties\":[\"license_plate\",\"type\",\"color\"],\"nullable_keys\":[\"type\",\"color\"]}],\"edgelabels\":[{\"name\":\"drives\",\"source_label\":\"Person\",\"target_label\":\"Vehicle\",\"properties\":[]}],\"propertykeys\":[{\"name\":\"name\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"injury_level\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"license_plate\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"type\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"color\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"}]}\n\nOutput:\n{\"vertices\":[{\"id\":\"1:John Smith\",\"label\":\"Person\",\"properties\":{\"name\":\"John Smith\"}},{\"id\":\"1:Mike Lee\",\"label\":\"Person\",\"properties\":{\"name\":\"Mike Lee\",\"injury_level\":\"right leg fracture\"}},{\"id\":\"2:NY-88888\",\"label\":\"Vehicle\",\"properties\":{\"license_plate\":\"NY-88888\",\"type\":\"Porsche\",\"color\":\"red\"}}],\"edges\":[{\"label\":\"drives\",\"outV\":\"1:John Smith\",\"outVLabel\":\"Person\",\"inV\":\"2:NY-88888\",\"inVLabel\":\"Vehicle\",\"properties\":{}}]}" }, { "name": "Financial Event Extraction", "description": "Extracts key financial information such as companies, acquisition events, and amounts from financial news.", "text": "Tech giant Company A announced yesterday that it will fully acquire startup Company B, which operates in the artificial intelligence sector, for a price of $2 billion. 
The acquisition is expected to be completed by the end of the year.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of financial news, your task is to extract information about corporate mergers and acquisitions.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"type\":\"vertex\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"type\":\"edge\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nTech giant Company A announced yesterday that it will fully acquire startup Company B, which operates in the artificial intelligence sector, for a price of $2 billion. 
The acquisition is expected to be completed by the end of the year.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Company\",\"properties\":[\"name\",\"field\"]},{\"vertex_label\":\"Acquisition\",\"properties\":[\"price\",\"currency\",\"status\"]}], \"edges\":[{\"edge_label\":\"acquirer\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\"},{\"edge_label\":\"acquired\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Company A\",\"label\":\"Company\",\"type\":\"vertex\",\"properties\":{\"name\":\"Company A\",\"field\":\"Tech\"}},{\"id\":\"1:Company B\",\"label\":\"Company\",\"type\":\"vertex\",\"properties\":{\"name\":\"Company B\",\"field\":\"artificial intelligence\"}},{\"id\":\"2:A acquires B\",\"label\":\"Acquisition\",\"type\":\"vertex\",\"properties\":{\"price\":2000000000,\"currency\":\"USD\",\"status\":\"expected to be completed by year-end\"}}],\"edges\":[{\"label\":\"acquirer\",\"type\":\"edge\",\"outV\":\"2:A acquires B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company A\",\"inVLabel\":\"Company\",\"properties\":{}},{\"label\":\"acquired\",\"type\":\"edge\",\"outV\":\"2:A acquires B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company B\",\"inVLabel\":\"Company\",\"properties\":{}}]}" + "prompt": "## Main Task\nExtract only the vertices and edges supported by the given graph schema and input text. Return valid JSON only.\n\n## Schema Contract\nThe graph schema uses vertexlabels[], edgelabels[], and propertykeys[]. 
Use vertexlabels[].id and primary_keys to build deterministic vertex ids.\n\n## Output Contract\nReturn exactly one JSON object: {\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex id\",\"label\":\"vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\nEdge object: {\"label\":\"edge label\",\"outV\":\"source vertex id\",\"outVLabel\":\"source vertex label\",\"inV\":\"target vertex id\",\"inVLabel\":\"target vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n## Vertex ID Rules\n- If primary_keys has one key: id = \"{vertexLabelID}:{properties.primaryKeyName}\".\n- If primary_keys has multiple keys: id = \"{vertexLabelID}:{properties.primaryKeyName1}!{properties.primaryKeyName2}\" in schema primary-key order.\n- Never use label names such as \"person:Sarah\" when schema gives a numeric vertex label id.\n\n## Extraction Rules\n- Do not extract labels or properties absent from the schema.\n- Do not translate schema field names, labels, or property keys.\n- Preserve property data types according to propertykeys[].\n- Only output an edge if outV and inV reference vertices in the same output.\n- Output JSON only; no Markdown fences, prose, comments, or trailing text.\n\n## Example\nInput text:\nTech giant Company A will fully acquire startup Company B, which operates in artificial intelligence, for $2 billion.\n\nGraph schema 
example:\n{\"vertexlabels\":[{\"id\":1,\"name\":\"Company\",\"primary_keys\":[\"name\"],\"properties\":[\"name\",\"field\"],\"nullable_keys\":[\"field\"]},{\"id\":2,\"name\":\"Acquisition\",\"primary_keys\":[\"deal\"],\"properties\":[\"deal\",\"price\",\"currency\"],\"nullable_keys\":[\"price\",\"currency\"]}],\"edgelabels\":[{\"name\":\"acquirer\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\",\"properties\":[]},{\"name\":\"acquired\",\"source_label\":\"Acquisition\",\"target_label\":\"Company\",\"properties\":[]}],\"propertykeys\":[{\"name\":\"name\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"field\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"deal\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"},{\"name\":\"price\",\"data_type\":\"LONG\",\"cardinality\":\"SINGLE\"},{\"name\":\"currency\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"}]}\n\nOutput:\n{\"vertices\":[{\"id\":\"1:Company A\",\"label\":\"Company\",\"properties\":{\"name\":\"Company A\",\"field\":\"technology\"}},{\"id\":\"1:Company B\",\"label\":\"Company\",\"properties\":{\"name\":\"Company B\",\"field\":\"artificial intelligence\"}},{\"id\":\"2:Company A acquires Company B\",\"label\":\"Acquisition\",\"properties\":{\"deal\":\"Company A acquires Company B\",\"price\":2000000000,\"currency\":\"USD\"}}],\"edges\":[{\"label\":\"acquirer\",\"outV\":\"2:Company A acquires Company B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company A\",\"inVLabel\":\"Company\",\"properties\":{}},{\"label\":\"acquired\",\"outV\":\"2:Company A acquires Company B\",\"outVLabel\":\"Acquisition\",\"inV\":\"1:Company B\",\"inVLabel\":\"Company\",\"properties\":{}}]}" }, { "name": "Medical Diagnosis Extraction", "description": "Extracts patients, symptoms, diagnosis results, and recommended drugs from medical record text.", "text": "Patient Li Hua, presents with a headache and fever for three days. After examination, the diagnosis is a viral cold. 
It is recommended to take the drug 'Gankang' for treatment.", - "prompt": "## Main Task\nGiven the following graph schema and a piece of medical record, your task is to extract entities and relationships related to diagnosis and treatment.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n- \"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List of edge labels, their source and target vertex labels, and properties]\n\n### Content Rule:\nPlease read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema.\nYou are not allowed to modify the schema contraints. Your task is to format the provided information into the required schema, without missing any keyword.\nFor each piece of information that matches a vertex or edge, format it strictly according to the following JSON structures:\n\n#### Vertex Format:\n{\"id\":\"vertexLabelID:entityName\",\"label\":\"vertexLabel\",\"type\":\"vertex\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n#### Edge Format:\n{\"id\":\"vertexlabelID:pk1!pk2!pk3\", \"label\":\"edgeLabel\",\"type\":\"edge\",\"outV\":\"sourceVertexId\",\"outVLabel\":\"sourceVertexLabel\",\"inV\":\"targetVertexId\",\"inVLabel\":\"targetVertexLabel\",\"properties\":{\"propertyName\":\"propertyValue\",...}}\n\nStrictly follow these rules:\n1. Don't extract property fields or labels that doesn't exist in the given schema. Do not generate new information.\n2. Ensure the extracted property set in the same type as the given schema (like 'age' should be a number, 'select' should be a boolean).\n3. Output in JSON format, only include vertexes and edges & remove empty properties, extracted and formatted based on the text/rules and schema.\n\n## Example:\n### Input example:\n#### text:\nPatient Li Hua, presents with a headache and fever for three days. After examination, the diagnosis is a viral cold. 
It is recommended to take the drug 'Gankang' for treatment.\n\n#### graph schema example:\n{\"vertices\":[{\"vertex_label\":\"Patient\",\"properties\":[\"name\"]},{\"vertex_label\":\"Symptom\",\"properties\":[\"name\"]},{\"vertex_label\":\"Diagnosis\",\"properties\":[\"name\"]},{\"vertex_label\":\"Drug\",\"properties\":[\"name\"]}], \"edges\":[{\"edge_label\":\"has_symptom\",\"source_label\":\"Patient\",\"target_label\":\"Symptom\"},{\"edge_label\":\"diagnosed_with\",\"source_label\":\"Patient\",\"target_label\":\"Diagnosis\"},{\"edge_label\":\"recommends_drug\",\"source_label\":\"Diagnosis\",\"target_label\":\"Drug\"}]}\n\n### Output example:\n{\"vertices\":[{\"id\":\"1:Li Hua\",\"label\":\"Patient\",\"type\":\"vertex\",\"properties\":{\"name\":\"Li Hua\"}},{\"id\":\"2:headache\",\"label\":\"Symptom\",\"type\":\"vertex\",\"properties\":{\"name\":\"headache\"}},{\"id\":\"2:fever\",\"label\":\"Symptom\",\"type\":\"vertex\",\"properties\":{\"name\":\"fever\"}},{\"id\":\"3:viral cold\",\"label\":\"Diagnosis\",\"type\":\"vertex\",\"properties\":{\"name\":\"viral cold\"}},{\"id\":\"4:Gankang\",\"label\":\"Drug\",\"type\":\"vertex\",\"properties\":{\"name\":\"Gankang\"}}],\"edges\":[{\"label\":\"has_symptom\",\"type\":\"edge\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"2:headache\",\"inVLabel\":\"Symptom\",\"properties\":{}},{\"label\":\"diagnosed_with\",\"type\":\"edge\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"3:viral cold\",\"inVLabel\":\"Diagnosis\",\"properties\":{}},{\"label\":\"recommends_drug\",\"type\":\"edge\",\"outV\":\"3:viral cold\",\"outVLabel\":\"Diagnosis\",\"inV\":\"4:Gankang\",\"inVLabel\":\"Drug\",\"properties\":{}}]}" + "prompt": "## Main Task\nExtract only the vertices and edges supported by the given graph schema and input text. Return valid JSON only.\n\n## Schema Contract\nThe graph schema uses vertexlabels[], edgelabels[], and propertykeys[]. 
Use vertexlabels[].id and primary_keys to build deterministic vertex ids.\n\n## Output Contract\nReturn exactly one JSON object: {\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex id\",\"label\":\"vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\nEdge object: {\"label\":\"edge label\",\"outV\":\"source vertex id\",\"outVLabel\":\"source vertex label\",\"inV\":\"target vertex id\",\"inVLabel\":\"target vertex label\",\"properties\":{\"propertyName\":\"propertyValue\", ...}}\n\n## Vertex ID Rules\n- If primary_keys has one key: id = \"{vertexLabelID}:{properties.primaryKeyName}\".\n- If primary_keys has multiple keys: id = \"{vertexLabelID}:{properties.primaryKeyName1}!{properties.primaryKeyName2}\" in schema primary-key order.\n- Never use label names such as \"person:Sarah\" when schema gives a numeric vertex label id.\n\n## Extraction Rules\n- Do not extract labels or properties absent from the schema.\n- Do not translate schema field names, labels, or property keys.\n- Preserve property data types according to propertykeys[].\n- Only output an edge if outV and inV reference vertices in the same output.\n- Output JSON only; no Markdown fences, prose, comments, or trailing text.\n\n## Example\nInput text:\nPatient Li Hua has a headache and fever for three days. 
The diagnosis is viral cold, and the recommended drug is Gankang.\n\nGraph schema example:\n{\"vertexlabels\":[{\"id\":1,\"name\":\"Patient\",\"primary_keys\":[\"name\"],\"properties\":[\"name\"],\"nullable_keys\":[]},{\"id\":2,\"name\":\"Symptom\",\"primary_keys\":[\"name\"],\"properties\":[\"name\"],\"nullable_keys\":[]},{\"id\":3,\"name\":\"Diagnosis\",\"primary_keys\":[\"name\"],\"properties\":[\"name\"],\"nullable_keys\":[]},{\"id\":4,\"name\":\"Drug\",\"primary_keys\":[\"name\"],\"properties\":[\"name\"],\"nullable_keys\":[]}],\"edgelabels\":[{\"name\":\"has_symptom\",\"source_label\":\"Patient\",\"target_label\":\"Symptom\",\"properties\":[]},{\"name\":\"diagnosed_with\",\"source_label\":\"Patient\",\"target_label\":\"Diagnosis\",\"properties\":[]},{\"name\":\"recommends_drug\",\"source_label\":\"Diagnosis\",\"target_label\":\"Drug\",\"properties\":[]}],\"propertykeys\":[{\"name\":\"name\",\"data_type\":\"TEXT\",\"cardinality\":\"SINGLE\"}]}\n\nOutput:\n{\"vertices\":[{\"id\":\"1:Li Hua\",\"label\":\"Patient\",\"properties\":{\"name\":\"Li Hua\"}},{\"id\":\"2:headache\",\"label\":\"Symptom\",\"properties\":{\"name\":\"headache\"}},{\"id\":\"2:fever\",\"label\":\"Symptom\",\"properties\":{\"name\":\"fever\"}},{\"id\":\"3:viral cold\",\"label\":\"Diagnosis\",\"properties\":{\"name\":\"viral cold\"}},{\"id\":\"4:Gankang\",\"label\":\"Drug\",\"properties\":{\"name\":\"Gankang\"}}],\"edges\":[{\"label\":\"has_symptom\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"2:headache\",\"inVLabel\":\"Symptom\",\"properties\":{}},{\"label\":\"has_symptom\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"2:fever\",\"inVLabel\":\"Symptom\",\"properties\":{}},{\"label\":\"diagnosed_with\",\"outV\":\"1:Li Hua\",\"outVLabel\":\"Patient\",\"inV\":\"3:viral cold\",\"inVLabel\":\"Diagnosis\",\"properties\":{}},{\"label\":\"recommends_drug\",\"outV\":\"3:viral cold\",\"outVLabel\":\"Diagnosis\",\"inV\":\"4:Gankang\",\"inVLabel\":\"Drug\",\"properties\":{}}]}" 
} ] diff --git a/hugegraph-llm/src/tests/config/test_prompt_config.py b/hugegraph-llm/src/tests/config/test_prompt_config.py new file mode 100644 index 000000000..0a46b4c62 --- /dev/null +++ b/hugegraph-llm/src/tests/config/test_prompt_config.py @@ -0,0 +1,153 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import json +from pathlib import Path +from unittest.mock import MagicMock + +from hugegraph_llm.config.prompt_config import PromptConfig +from hugegraph_llm.models.llms.base import BaseLLM +from hugegraph_llm.operators.llm_op.property_graph_extract import PropertyGraphExtract + + +def _json_objects_after_marker(prompt, marker): + start = prompt.index(marker) + len(marker) + decoder = json.JSONDecoder() + objects = [] + index = start + while True: + index = prompt.find("{", index) + if index == -1: + return objects + try: + value, end = decoder.raw_decode(prompt[index:]) + except json.JSONDecodeError: + index += 1 + continue + objects.append(value) + index += end + + +def _example_schema_and_output(prompt, example_marker): + objects = _json_objects_after_marker(prompt, example_marker) + schema = next(obj for obj in objects if "vertexlabels" in obj and "edgelabels" in obj) + output = next(obj for obj in objects if "vertices" in obj and "edges" in obj) + return schema, output + + +def _assert_prompt_example_contract(prompt, example_marker): + schema, output = _example_schema_and_output(prompt, example_marker) + _assert_output_matches_schema_contract(schema, output) + + +def _assert_output_matches_schema_contract(schema, output): + assert set(output) == {"vertices", "edges"} + assert output["vertices"] + assert output["edges"] + + vertex_ids = {vertex["id"] for vertex in output["vertices"]} + vertex_labels = {vertex["label"] for vertex in output["vertices"]} + schema_vertices = {vertex["name"]: vertex for vertex in schema["vertexlabels"]} + schema_edges = {edge["name"]: edge for edge in schema["edgelabels"]} + + for vertex in output["vertices"]: + assert set(vertex) == {"id", "label", "properties"} + schema_vertex = schema_vertices[vertex["label"]] + primary_values = [str(vertex["properties"][key]) for key in schema_vertex["primary_keys"]] + expected_id = f"{schema_vertex['id']}:{'!'.join(primary_values)}" + assert vertex["id"] == expected_id + assert not 
vertex["id"].startswith(f"{vertex['label']}:") + assert isinstance(vertex["properties"], dict) + + for edge in output["edges"]: + assert set(edge) == {"label", "outV", "outVLabel", "inV", "inVLabel", "properties"} + assert edge["label"] in schema_edges + assert edge["outV"] in vertex_ids + assert edge["inV"] in vertex_ids + assert edge["outVLabel"] in vertex_labels + assert edge["inVLabel"] in vertex_labels + assert edge["outVLabel"] == schema_edges[edge["label"]]["source_label"] + assert edge["inVLabel"] == schema_edges[edge["label"]]["target_label"] + assert isinstance(edge["properties"], dict) + + extractor = PropertyGraphExtract(llm=MagicMock(spec=BaseLLM)) + parsed_items = extractor._extract_and_filter_label(schema, json.dumps(output)) + assert {item["type"] for item in parsed_items} == {"vertex", "edge"} + assert len(parsed_items) == len(output["vertices"]) + len(output["edges"]) + + +def test_extract_graph_prompt_en_example_matches_parser_contract(): + _assert_prompt_example_contract(PromptConfig.extract_graph_prompt_EN, "## Example") + + +def test_extract_graph_prompt_cn_example_matches_parser_contract(): + _assert_prompt_example_contract(PromptConfig.extract_graph_prompt_CN, "## 示例") + + +def test_extract_graph_prompt_example_contract_rejects_label_name_vertex_id(): + schema, output = _example_schema_and_output(PromptConfig.extract_graph_prompt_EN, "## Example") + output["vertices"][0]["id"] = "person:Sarah" + + try: + _assert_output_matches_schema_contract(schema, output) + except AssertionError: + return + + raise AssertionError("Prompt example contract accepted a label-name vertex id") + + +def test_extract_graph_prompt_example_contract_rejects_dangling_edge_reference(): + schema, output = _example_schema_and_output(PromptConfig.extract_graph_prompt_EN, "## Example") + output["edges"][0]["outV"] = "1:Missing" + + try: + _assert_output_matches_schema_contract(schema, output) + except AssertionError: + return + + raise AssertionError("Prompt example 
contract accepted an edge reference outside vertices") + + +def test_prompt_examples_match_extraction_contract(): + examples_path = ( + Path(__file__).parents[2] / "hugegraph_llm" / "resources" / "prompt_examples" / "prompt_examples.json" + ) + examples = json.loads(examples_path.read_text(encoding="utf-8")) + + for example in examples: + prompt = example["prompt"] + assert '"type":"vertex"' not in prompt + assert '"type":"edge"' not in prompt + _assert_prompt_example_contract(prompt, "## Example") + + +def test_prompt_examples_use_matching_domain_examples(): + examples_path = ( + Path(__file__).parents[2] / "hugegraph_llm" / "resources" / "prompt_examples" / "prompt_examples.json" + ) + examples = json.loads(examples_path.read_text(encoding="utf-8")) + domain_markers = { + "Official Person-Relationship Extraction": ["Sarah", "James"], + "Traffic Accident Element Extraction": ["John Smith", "NY-88888"], + "Financial Event Extraction": ["Company A", "$2 billion"], + "Medical Diagnosis Extraction": ["Li Hua", "Gankang"], + } + + for example in examples: + prompt = example["prompt"] + for marker in domain_markers[example["name"]]: + assert marker in prompt diff --git a/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py b/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py index 634fdb961..b13f90422 100644 --- a/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py +++ b/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py @@ -22,6 +22,7 @@ from pyhugegraph.utils.exceptions import CreateError, NotFoundError from hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph import Commit2Graph +from hugegraph_llm.operators.llm_op.property_graph_extract import PropertyGraphExtract class TestCommit2Graph(unittest.TestCase): @@ -49,6 +50,7 @@ def setUp(self): ], "vertexlabels": [ { + "id": 1, "name": "person", "properties": ["name", "age"], "primary_keys": ["name"], @@ -56,6 +58,7 @@ def 
setUp(self): "id_strategy": "PRIMARY_KEY", }, { + "id": 2, "name": "movie", "properties": ["title", "year"], "primary_keys": ["title"], @@ -351,6 +354,173 @@ def test_load_into_graph_with_data_type_validation_success(self, mock_handle_gra # Verify that _handle_graph_creation was called for each vertex and edge self.assertEqual(mock_handle_graph_creation.call_count, 3) # 2 vertices + 1 edge + @patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation") + def test_load_into_graph_maps_llm_vertex_ids_to_created_vertex_ids(self, mock_handle_graph_creation): + """Test edges use server-created vertex ids when LLM ids differ.""" + mock_handle_graph_creation.side_effect = [ + MagicMock(id="1:Tom Hanks"), + MagicMock(id="2:Forrest Gump"), + MagicMock(id="edge_id"), + ] + + vertices = [ + { + "id": "person:Tom Hanks", + "label": "person", + "properties": {"name": "Tom Hanks", "age": 67}, + }, + { + "id": "movie:Forrest Gump", + "label": "movie", + "properties": {"title": "Forrest Gump", "year": 1994}, + }, + ] + edges = [ + { + "label": "acted_in", + "properties": {"role": "Forrest Gump"}, + "outV": "person:Tom Hanks", + "inV": "movie:Forrest Gump", + } + ] + + self.commit2graph.load_into_graph(vertices, edges, self.schema) + + self.assertEqual(vertices[0]["id"], "1:Tom Hanks") + self.assertEqual(vertices[1]["id"], "2:Forrest Gump") + mock_handle_graph_creation.assert_any_call( + self.commit2graph.client.graph().addEdge, + "acted_in", + "1:Tom Hanks", + "2:Forrest Gump", + {"role": "Forrest Gump"}, + ) + + @patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation") + def test_load_into_graph_uses_explicit_customize_string_ids(self, mock_handle_graph_creation): + """Test custom string ids are passed to HugeGraph when schema requires them.""" + mock_handle_graph_creation.side_effect = [ + MagicMock(id="Tom Hanks"), + MagicMock(id="Forrest Gump"), + MagicMock(id="edge_id"), + ] + schema = { + 
"propertykeys": [ + {"name": "name", "data_type": "TEXT", "cardinality": "SINGLE"}, + {"name": "title", "data_type": "TEXT", "cardinality": "SINGLE"}, + ], + "vertexlabels": [ + { + "id": 7, + "name": "person", + "id_strategy": "CUSTOMIZE_STRING", + "primary_keys": ["name"], + "properties": ["name"], + "nullable_keys": [], + }, + { + "id": 8, + "name": "movie", + "id_strategy": "CUSTOMIZE_STRING", + "primary_keys": ["title"], + "properties": ["title"], + "nullable_keys": [], + }, + ], + "edgelabels": [{"name": "acted_in", "properties": [], "source_label": "person", "target_label": "movie"}], + } + vertices = [ + {"id": "Tom Hanks", "label": "person", "properties": {"name": "Tom Hanks"}}, + {"id": "Forrest Gump", "label": "movie", "properties": {"title": "Forrest Gump"}}, + ] + edges = [ + { + "label": "acted_in", + "properties": {}, + "outV": "Tom Hanks", + "inV": "Forrest Gump", + } + ] + + self.commit2graph.load_into_graph(vertices, edges, schema) + + mock_handle_graph_creation.assert_any_call( + self.commit2graph.client.graph().addVertex, + "person", + {"name": "Tom Hanks"}, + id="Tom Hanks", + ) + mock_handle_graph_creation.assert_any_call( + self.commit2graph.client.graph().addVertex, + "movie", + {"title": "Forrest Gump"}, + id="Forrest Gump", + ) + mock_handle_graph_creation.assert_any_call( + self.commit2graph.client.graph().addEdge, + "acted_in", + "Tom Hanks", + "Forrest Gump", + {}, + ) + + @patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation") + def test_load_into_graph_accepts_normalized_extraction_without_item_type(self, mock_handle_graph_creation): + """Test normalized LLM output without type fields can be committed.""" + mock_handle_graph_creation.side_effect = [ + MagicMock(id="1:Tom Hanks"), + MagicMock(id="2:Forrest Gump"), + MagicMock(id="edge_id"), + ] + llm_output = """{ + "vertices": [ + { + "id": "person:Tom Hanks", + "label": "person", + "properties": { + "name": "Tom Hanks", + "age": 67 + } + 
}, + { + "id": "movie:Forrest Gump", + "label": "movie", + "properties": { + "title": "Forrest Gump", + "year": 1994 + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "person:Tom Hanks", + "outVLabel": "person", + "inV": "movie:Forrest Gump", + "inVLabel": "movie", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + items = PropertyGraphExtract(llm=MagicMock())._extract_and_filter_label(self.schema, llm_output) + vertices = [item for item in items if item["type"] == "vertex"] + edges = [item for item in items if item["type"] == "edge"] + self.assertEqual(edges[0]["outV"], "1:Tom Hanks") + self.assertEqual(edges[0]["inV"], "2:Forrest Gump") + + self.commit2graph.load_into_graph(vertices, edges, self.schema) + + mock_handle_graph_creation.assert_any_call( + self.commit2graph.client.graph().addEdge, + "acted_in", + "1:Tom Hanks", + "2:Forrest Gump", + {"role": "Forrest Gump"}, + ) + @patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation") def test_load_into_graph_with_data_type_validation_failure(self, mock_handle_graph_creation): """Test load_into_graph method with data type validation failure.""" diff --git a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py index 7c84de156..3eb490261 100644 --- a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py +++ b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py @@ -39,19 +39,23 @@ def setUp(self): self.schema = { "vertexlabels": [ { + "id": 1, "name": "person", "primary_keys": ["name"], "nullable_keys": ["age"], "properties": ["name", "age"], }, { + "id": 2, "name": "movie", "primary_keys": ["title"], "nullable_keys": ["year"], "properties": ["title", "year"], }, ], - "edgelabels": [{"name": "acted_in", "properties": ["role"]}], + "edgelabels": [ + {"name": "acted_in", "properties": ["role"], "source_label": 
"person", "target_label": "movie"} + ], } # Sample text chunks @@ -77,6 +81,13 @@ def setUp(self): }""", """{ "vertices": [ + { + "type": "vertex", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, { "type": "vertex", "label": "movie", @@ -194,11 +205,13 @@ def test_extract_and_filter_label_valid_json(self): result = extractor._extract_and_filter_label(self.schema, text) - self.assertEqual(len(result), 2) + self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") - self.assertEqual(result[0]["label"], "movie") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[0]["label"], "person") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") def test_extract_and_filter_label_markdown_json(self): """Test _extract_and_filter_label with JSON wrapped in markdown fences.""" @@ -209,11 +222,13 @@ def test_extract_and_filter_label_markdown_json(self): result = extractor._extract_and_filter_label(self.schema, text) - self.assertEqual(len(result), 2) + self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") - self.assertEqual(result[0]["label"], "movie") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[0]["label"], "person") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") def test_extract_and_filter_label_markdown_json_with_prose(self): """Test fenced JSON can be parsed when the LLM adds prose.""" @@ -226,11 +241,13 @@ def test_extract_and_filter_label_markdown_json_with_prose(self): result = extractor._extract_and_filter_label(self.schema, text) - self.assertEqual(len(result), 2) + 
self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") - self.assertEqual(result[0]["label"], "movie") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[0]["label"], "person") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") def test_extract_and_filter_label_flat_array_json(self): """Test _extract_and_filter_label converts flat arrays to vertices and edges.""" @@ -244,6 +261,13 @@ def test_extract_and_filter_label_flat_array_json(self): "name": "Tom Hanks" } }, + { + "type": "vertex", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + }, { "type": "edge", "label": "acted_in", @@ -268,11 +292,13 @@ def test_extract_and_filter_label_flat_array_json(self): result = extractor._extract_and_filter_label(self.schema, text) - self.assertEqual(len(result), 2) + self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") self.assertEqual(result[0]["label"], "person") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") def test_extract_and_filter_label_flat_array_filters_invalid_items(self): """Test flat arrays keep valid graph items and drop invalid ones.""" @@ -285,6 +311,13 @@ def test_extract_and_filter_label_flat_array_filters_invalid_items(self): "name": "Tom Hanks" } }, + { + "type": "vertex", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + }, { "type": "vertex", "label": "unknown_label", @@ -326,11 +359,13 @@ def test_extract_and_filter_label_flat_array_filters_invalid_items(self): result = extractor._extract_and_filter_label(self.schema, 
text) - self.assertEqual(len(result), 2) + self.assertEqual(len(result), 3) self.assertEqual(result[0]["type"], "vertex") self.assertEqual(result[0]["label"], "person") - self.assertEqual(result[1]["type"], "edge") - self.assertEqual(result[1]["label"], "acted_in") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") def test_extract_and_filter_label_malformed_fenced_json(self): """Test malformed fenced JSON returns no graph items.""" @@ -354,6 +389,432 @@ def test_extract_and_filter_label_malformed_fenced_json(self): self.assertEqual(result, []) + def test_extract_and_filter_label_infers_type_from_grouped_arrays(self): + """Infer item type from vertices/edges containers when LLM omits it.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "properties": { + "role": "Forrest Gump" + }, + "source": { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + "target": { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(len(result), 3) + self.assertEqual(result[0]["type"], "vertex") + self.assertEqual(result[0]["label"], "person") + self.assertEqual(result[1]["type"], "vertex") + self.assertEqual(result[1]["label"], "movie") + self.assertEqual(result[2]["type"], "edge") + self.assertEqual(result[2]["label"], "acted_in") + + def test_extract_and_filter_label_normalizes_primary_key_ids(self): + """Normalize LLM vertex ids to schema-derived primary-key ids.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "id": "person:Tom 
Hanks", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "id": "movie:Forrest Gump", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "person:Tom Hanks", + "outVLabel": "person", + "inV": "movie:Forrest Gump", + "inVLabel": "movie", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(result[0]["id"], "1:Tom Hanks") + self.assertEqual(result[1]["id"], "2:Forrest Gump") + self.assertEqual(result[2]["outV"], "1:Tom Hanks") + self.assertEqual(result[2]["inV"], "2:Forrest Gump") + + def test_extract_and_filter_label_keeps_canonical_primary_key_ids(self): + """Keep already-canonical vertex and edge ids intact.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "id": "1:Tom Hanks", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "id": "2:Forrest Gump", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "1:Tom Hanks", + "outVLabel": "person", + "inV": "2:Forrest Gump", + "inVLabel": "movie", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(result[0]["id"], "1:Tom Hanks") + self.assertEqual(result[1]["id"], "2:Forrest Gump") + self.assertEqual(result[2]["outV"], "1:Tom Hanks") + self.assertEqual(result[2]["inV"], "2:Forrest Gump") + + def test_extract_and_filter_label_normalizes_multiple_primary_key_ids(self): + """Normalize multi-primary-key vertex ids in schema primary-key order.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + schema = { + "vertexlabels": [ + { + "id": 3, + "name": "character", + "primary_keys": ["name", "universe"], + "nullable_keys": [], + "properties": ["name", "universe"], + } + ], + "edgelabels": 
[], + } + text = """{ + "vertices": [ + { + "id": "character:Tom!movie", + "label": "character", + "properties": { + "name": "Tom", + "universe": "movie" + } + } + ], + "edges": [] + }""" + + result = extractor._extract_and_filter_label(schema, text) + + self.assertEqual(result[0]["id"], "3:Tom!movie") + + def test_extract_and_filter_label_resolves_source_target_edge_refs(self): + """Resolve source/target edge endpoints to canonical outV/inV ids.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "properties": { + "role": "Forrest Gump" + }, + "source": { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + "target": { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(result[0]["id"], "1:Tom Hanks") + self.assertEqual(result[1]["id"], "2:Forrest Gump") + self.assertEqual(result[2]["outV"], "1:Tom Hanks") + self.assertEqual(result[2]["outVLabel"], "person") + self.assertEqual(result[2]["inV"], "2:Forrest Gump") + self.assertEqual(result[2]["inVLabel"], "movie") + + def test_extract_and_filter_label_drops_edges_with_unresolved_endpoints(self): + """Drop edges whose endpoints cannot be resolved before commit.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "person:Missing", + "outVLabel": "person", + "inV": "movie:Missing", + "inVLabel": "movie", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(len(result), 1) + 
self.assertEqual(result[0]["type"], "vertex") + + def test_extract_and_filter_label_drops_legacy_edges_with_missing_vertices(self): + """Drop legacy source/target edges unless both endpoints are emitted as vertices.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + } + ], + "edges": [ + { + "label": "acted_in", + "properties": { + "role": "Forrest Gump" + }, + "source": { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + "target": { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(len(result), 1) + self.assertEqual(result[0]["type"], "vertex") + + def test_extract_and_filter_label_keeps_explicit_custom_ids(self): + """Keep self-consistent explicit ids when schema cannot derive primary-key ids.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + schema = { + "vertexlabels": [ + {"name": "person", "id_strategy": "CUSTOMIZE_STRING", "properties": ["name"], "nullable_keys": []}, + {"name": "movie", "id_strategy": "CUSTOMIZE_STRING", "properties": ["title"], "nullable_keys": []}, + ], + "edgelabels": [{"name": "acted_in", "properties": [], "source_label": "person", "target_label": "movie"}], + } + text = """{ + "vertices": [ + { + "id": "Tom Hanks", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "id": "Forrest Gump", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "Tom Hanks", + "outVLabel": "person", + "inV": "Forrest Gump", + "inVLabel": "movie", + "properties": {} + } + ] + }""" + + result = extractor._extract_and_filter_label(schema, text) + + self.assertEqual(len(result), 3) + self.assertEqual(result[2]["outV"], "Tom Hanks") + self.assertEqual(result[2]["inV"], "Forrest Gump") + + def 
test_extract_and_filter_label_keeps_explicit_custom_ids_with_label_metadata(self): + """Do not rewrite custom ids even when schema includes ids and primary keys.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + schema = { + "vertexlabels": [ + { + "id": 7, + "name": "person", + "id_strategy": "CUSTOMIZE_STRING", + "primary_keys": ["name"], + "properties": ["name"], + "nullable_keys": [], + }, + { + "id": 8, + "name": "movie", + "id_strategy": "CUSTOMIZE_STRING", + "primary_keys": ["title"], + "properties": ["title"], + "nullable_keys": [], + }, + ], + "edgelabels": [{"name": "acted_in", "properties": [], "source_label": "person", "target_label": "movie"}], + } + text = """{ + "vertices": [ + { + "id": "Tom Hanks", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "id": "Forrest Gump", + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "Tom Hanks", + "outVLabel": "person", + "inV": "Forrest Gump", + "inVLabel": "movie", + "properties": {} + } + ] + }""" + + result = extractor._extract_and_filter_label(schema, text) + + self.assertEqual(result[0]["id"], "Tom Hanks") + self.assertEqual(result[1]["id"], "Forrest Gump") + self.assertEqual(result[2]["outV"], "Tom Hanks") + self.assertEqual(result[2]["inV"], "Forrest Gump") + + def test_extract_and_filter_label_drops_edges_with_mismatched_endpoint_labels(self): + """Drop edges whose endpoint labels conflict with the edge schema.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "label": "person", + "properties": { + "name": "Tom Hanks" + } + }, + { + "label": "movie", + "properties": { + "title": "Forrest Gump" + } + } + ], + "edges": [ + { + "label": "acted_in", + "outV": "1:Tom Hanks", + "outVLabel": "movie", + "inV": "2:Forrest Gump", + "inVLabel": "person", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + result = 
extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(len(result), 2) + self.assertTrue(all(item["type"] == "vertex" for item in result)) + def test_extract_and_filter_label_invalid_json(self): """Test the _extract_and_filter_label method with invalid JSON.""" extractor = PropertyGraphExtract(llm=self.mock_llm) @@ -387,6 +848,34 @@ def test_extract_and_filter_label_invalid_item_type(self): self.assertEqual(result, []) + def test_extract_and_filter_label_rejects_explicit_type_mismatch(self): + """Do not override an explicit item type that conflicts with its container.""" + extractor = PropertyGraphExtract(llm=self.mock_llm) + text = """{ + "vertices": [ + { + "type": "edge", + "label": "person", + "properties": { + "name": "Tom Hanks" + } + } + ], + "edges": [ + { + "type": "vertex", + "label": "acted_in", + "properties": { + "role": "Forrest Gump" + } + } + ] + }""" + + result = extractor._extract_and_filter_label(self.schema, text) + + self.assertEqual(result, []) + def test_extract_and_filter_label_invalid_label(self): """Test the _extract_and_filter_label method with invalid label.""" extractor = PropertyGraphExtract(llm=self.mock_llm) @@ -446,13 +935,13 @@ def test_run(self): self.assertEqual(extractor.extract_property_graph_by_llm.call_count, 2) # Verify the results - self.assertEqual(len(result["vertices"]), 2) + self.assertEqual(len(result["vertices"]), 3) self.assertEqual(len(result["edges"]), 1) self.assertEqual(result["call_count"], 2) # Check vertex properties self.assertEqual(result["vertices"][0]["properties"]["name"], "Tom Hanks") - self.assertEqual(result["vertices"][1]["properties"]["title"], "Forrest Gump") + self.assertEqual(result["vertices"][2]["properties"]["title"], "Forrest Gump") # Check edge properties self.assertEqual(result["edges"][0]["properties"]["role"], "Forrest Gump") @@ -490,7 +979,7 @@ def test_run_with_existing_vertices_and_edges(self): result = extractor.run(context) # Verify the results - 
self.assertEqual(len(result["vertices"]), 3) # 1 existing + 2 new + self.assertEqual(len(result["vertices"]), 4) # 1 existing + 3 new self.assertEqual(len(result["edges"]), 2) # 1 existing + 1 new self.assertEqual(result["call_count"], 2)