This is an automated email from the ASF dual-hosted git repository.
imbajin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new a15965ef fix(graph): resolve edge creation failure due to vertex ID
mismatch (#331)
a15965ef is described below
commit a15965ef00062f0eb4025f3120d66020b1499306
Author: mengmeng.lin <[email protected]>
AuthorDate: Tue May 19 21:50:26 2026 +0800
fix(graph): resolve edge creation failure due to vertex ID mismatch (#331)
## Summary
- Fix edge creation failure when loading extracted graph data into
HugeGraph
- The HugeGraph server assigns vertex IDs (e.g., `1:Sarah`) using
numeric label IDs, which differ from LLM-predicted IDs (e.g.,
`person:Sarah`) that edges reference
- Add `vid_mapping` to track the ID mapping and update edge `outV`/`inV`
after vertex creation
- Normalize extracted graph IDs and edge endpoints after LLM extraction,
including missing item `type` fields and custom string IDs
---------
Co-authored-by: imbajin <[email protected]>
---
.../src/hugegraph_llm/config/prompt_config.py | 172 +++----
.../operators/hugegraph_op/commit_to_hugegraph.py | 19 +-
.../operators/llm_op/property_graph_extract.py | 112 ++++-
.../resources/prompt_examples/prompt_examples.json | 8 +-
.../src/tests/config/test_prompt_config.py | 153 ++++++
.../hugegraph_op/test_commit_to_hugegraph.py | 170 +++++++
.../llm_op/test_property_graph_extract.py | 533 ++++++++++++++++++++-
7 files changed, 1046 insertions(+), 121 deletions(-)
diff --git a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
index d56c830e..bf42d386 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
@@ -47,62 +47,52 @@ Answer:
# Note: Users should modify the prompt(examples) according to the real
schema and text (property_graph_extract.py)
extract_graph_prompt_EN: str = """## Main Task
-Given the following graph schema and a piece of text, your task is to analyze
the text and extract information that fits into the schema's structure,
formatting the information into vertices and edges as specified.
+Extract only the vertices and edges that are supported by the given graph
schema and input text. Return valid JSON only.
-## Basic Rules:
-### Schema Format:
-Graph Schema:
-- "vertices": [List of vertex labels and their properties]
-- "edges": [List of edge labels, their source and target vertex labels, and
properties]
-
-### Content Rule:
-Please read the provided text carefully and identify any information that
corresponds to the vertices and edges defined in the schema.
-You are not allowed to modify the schema contraints. Your task is to format
the provided information into the required schema, without missing any keyword.
-For each piece of information that matches a vertex or edge, format it
strictly according to the following JSON structures:
-
-#### Vertex Format:
-{"id":"vertexLabelID:entityName","label":"vertexLabel","type":"vertex","properties":{"propertyName":"propertyValue",
...}}
-
-where:
- - "vertexLabelID": int
- - "vertexLabel": str
- - "entityName": str
- - "type": "vertex"
- - "properties": dict
-
-#### Edge Format:
-{"id":"vertexlabelID:pk1!pk2!pk3",
label":"edgeLabel","type":"edge","outV":"sourceVertexId","outVLabel":"sourceVertexLabel","inV":"targetVertexId","inVLabel":"targetVertexLabel","properties":{"propertyName":"propertyValue",...}}
-
-where:
- - "id": int or str (conditional) (optional)
- - "edgeLabel": str
- - "type": "edge"
- - "outV": str
- - "outVLabel": str
- - "inV": str
- - "inVLabel": str
- - "properties": dict
- - "sourceVertexId": "vertexLabelID:entityName"
- - "targetVertexId": "vertexLabelID:entityName"
-
-Strictly follow these rules:
-1. Don't extract property fields or labels that doesn't exist in the given
schema. Do not generate new information.
-2. Ensure the extracted property set in the same type as the given schema
(like 'age' should be a number, 'select' should be a boolean).
-3. If there are multiple primary keys, the strategy for generating VID is:
vertexlabelID:pk1!pk2!pk3 (pk means primary key, and '!' is the separator).
This id must be generated ONLY if there are multiple primary keys. If there is
only one primary key, the strategy for generating VID is: int (sequencially
increasing).
-4. Output in JSON format, only include vertexes and edges & remove empty
properties, extracted and formatted based on the text/rules and schema.
-5. Translate the schema fields into Chinese if the given text input is Chinese
(Optional)
-
-Refer to the following baseline example to understand the output generation
requirements:
-## Example:
-### Input example:
-#### text:
-Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared
a home with since 2010. James, in his professional life, works as a journalist.
-
-#### graph schema example:
-{"vertices":[{"vertex_label":"person","properties":["name","age","occupation"]}],
"edges":[{"edge_label":"roommate",
"source_vertex_label":"person","target_vertex_label":"person","properties":["date"]]}
-
-### Output example:
-{"vertices":[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"attorney"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"journalist"}}],
"edges":[{"id": 1,
"label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]}"""
+## Schema Contract
+The Graph schema uses this shape:
+- vertexlabels[]: each vertex label has "id", "name", "primary_keys",
"properties", and optional "nullable_keys".
+- edgelabels[]: each edge label has "name", "source_label", "target_label",
and "properties".
+- propertykeys[]: each property key has "name", "data_type", and "cardinality".
+
+## Output Contract
+Return exactly one JSON object: {"vertices": [...], "edges": [...]}
+
+Vertex object:
+{"id":"vertex id","label":"vertex
label","properties":{"propertyName":"propertyValue", ...}}
+
+Edge object:
+{"label":"edge label","outV":"source vertex id","outVLabel":"source vertex
label","inV":"target vertex id","inVLabel":"target vertex
label","properties":{"propertyName":"propertyValue", ...}}
+
+## Deterministic Vertex ID Rules
+For every vertex, first find the schema entry where vertexlabels[].name equals
the output label.
+- vertexLabelID must be taken from that schema entry's vertexlabels[].id.
Never invent it from the label text.
+- If primary_keys has exactly one key: id =
"{vertexLabelID}:{properties.<primary_key>}".
+- If primary_keys has multiple keys: id =
"{vertexLabelID}:{properties.<pk1>}!{properties.<pk2>}" in the same order as
schema primary_keys.
+- Never use label names such as "person:Sarah" as vertex ids when schema gives
a numeric vertex label id.
+
+## Edge Reference Rules
+- outV and inV must exactly match the id of vertices in the same output.
+- outVLabel/inVLabel must match the corresponding source/target vertex label.
+- Only output an edge if both endpoint vertices are also present in vertices.
+- Do not create an edge label that is not present in edgelabels[].
+
+## Extraction Rules
+1. Do not extract labels or properties that are absent from the schema.
+2. Do not translate schema field names, labels, or property keys. Keep schema
names exactly as provided.
+3. Preserve property data types according to propertykeys[]; for example, INT
stays number and BOOLEAN stays boolean.
+4. Remove empty properties. Do not invent missing facts.
+5. Output JSON only; no Markdown fences, prose, comments, or trailing text.
+
+## Example
+Input text:
+Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared
a home with since 2010. James works as a journalist.
+
+Graph schema example:
+{"vertexlabels":[{"id":1,"name":"person","primary_keys":["name"],"properties":["name","age","occupation"],"nullable_keys":["age","occupation"]}],"edgelabels":[{"name":"roommate","source_label":"person","target_label":"person","properties":["date"]}],"propertykeys":[{"name":"name","data_type":"TEXT","cardinality":"SINGLE"},{"name":"age","data_type":"INT","cardinality":"SINGLE"},{"name":"occupation","data_type":"TEXT","cardinality":"SINGLE"},{"name":"date","data_type":"TEXT","cardinality":
[...]
+
+Output:
+{"vertices":[{"id":"1:Sarah","label":"person","properties":{"name":"Sarah","age":30,"occupation":"attorney"}},{"id":"1:James","label":"person","properties":{"name":"James","occupation":"journalist"}}],"edges":[{"label":"roommate","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]}"""
graph_schema: str = """{
"vertexlabels": [
@@ -275,40 +265,52 @@ and experiences.
"""
extract_graph_prompt_CN: str = """## 主要任务
-根据以下图谱和一段文本,你的任务是分析文本并提取符合模式结构的信息,将信息格式化为顶点和边。
-
-## 基本规则
-### 模式格式
-图谱模式:
-- 顶点:[顶点标签及其属性列表]
-- 边:[边标签、源顶点标签、目标顶点标签及其属性列表]
-
-### 内容规则
-请仔细阅读提供的文本,识别与模式中定义的顶点和边相对应的信息。对于每一条匹配顶点或边的信息,按以下 JSON 结构格式化:
-
-#### 顶点格式:
-{"id":"顶点标签 ID:实体名称","label":"顶点标签","type":"vertex","properties":{"属性名":"属性值",
...}}
-
-#### 边格式:
-{"label":"边标签","type":"edge","outV":"源顶点 ID","outVLabel":"源顶点标签","inV":"目标顶点
ID","inVLabel":"目标顶点标签","properties":{"属性名":"属性值",...}}
-
-同时遵循以下规则:
-1. 不要提取给定模式中不存在的属性字段或标签
-2. 确保提取的属性集与给定模式类型一致(如'age'应为数字,'select'应为布尔值)
-3. 如果有多个主键,生成 VID 的策略是:顶点标签 ID:pk1!pk2!pk3(pk 表示主键,'!'是分隔符)
-4. 以 JSON 格式输出,仅包含顶点和边,移除空属性,基于文本/规则和模式提取和格式化
-5. 如果给定文本为中文但模式为英文,则将模式字段翻译成中文(可选)
+只抽取输入文本和给定图谱 schema 共同支持的顶点与边。只返回合法 JSON。
+
+## Schema 契约
+图谱 schema 使用以下结构:
+- vertexlabels[]:每个顶点标签包含 "id"、"name"、"primary_keys"、"properties",以及可选的
"nullable_keys"。
+- edgelabels[]:每个边标签包含 "name"、"source_label"、"target_label"、"properties"。
+- propertykeys[]:每个属性包含 "name"、"data_type"、"cardinality"。
+
+## 输出契约
+必须返回唯一 JSON 对象:{"vertices": [...], "edges": [...]}
+
+顶点对象:
+{"id":"顶点 id","label":"顶点标签","properties":{"属性名":"属性值", ...}}
+
+边对象:
+{"label":"边标签","outV":"源顶点 id","outVLabel":"源顶点标签","inV":"目标顶点
id","inVLabel":"目标顶点标签","properties":{"属性名":"属性值", ...}}
+
+## 确定性顶点 ID 规则
+对每个顶点,先找到 schema 中 vertexlabels[].name 等于输出 label 的条目。
+- vertexLabelID 必须取自该 schema 条目的 vertexlabels[].id,不能从标签文本猜测。
+- 如果 primary_keys 只有一个字段:id = "{vertexLabelID}:{properties.<primary_key>}"。
+- 如果 primary_keys 有多个字段:id =
"{vertexLabelID}:{properties.<pk1>}!{properties.<pk2>}",顺序必须与 schema
primary_keys 一致。
+- 当 schema 提供数字顶点标签 id 时,不要使用 "person:Sarah" 这样的标签名作为顶点 id。
+
+## 边引用规则
+- outV 和 inV 必须严格等于本次输出 vertices 中的 id。
+- outVLabel/inVLabel 必须分别匹配对应源/目标顶点标签。
+- 只有当两个端点顶点都出现在 vertices 中时,才输出该边。
+- 不要输出 edgelabels[] 中不存在的边标签。
+
+## 抽取规则
+1. 不要抽取 schema 中不存在的标签或属性。
+2. 不要翻译 schema 字段名、标签名或属性 key,必须与 schema 原文完全一致。
+3. 根据 propertykeys[] 保持属性类型,例如 INT 保持数字,BOOLEAN 保持布尔值。
+4. 移除空属性。不要编造缺失事实。
+5. 只输出 JSON;不要输出 Markdown 代码块、解释文本、注释或尾随文本。
## 示例
-### 输入示例:
-#### 文本
-认识 Sarah,一位 30 岁的律师,和她的室友 James,他们从 2010 年开始合住。James 在职业生活中是一名记者。
+输入文本:
+认识 Sarah,一位 30 岁的律师,和她的室友 James,他们从 2010 年开始合住。James 是一名记者。
-#### 图谱模式
-{"vertices":[{"vertex_label":"person","properties":["name","age","occupation"]}],
"edges":[{"edge_label":"roommate",
"source_vertex_label":"person","target_vertex_label":"person","properties":["date"]]}
+图谱 schema 示例:
+{"vertexlabels":[{"id":1,"name":"person","primary_keys":["name"],"properties":["name","age","occupation"],"nullable_keys":["age","occupation"]}],"edgelabels":[{"name":"roommate","source_label":"person","target_label":"person","properties":["date"]}],"propertykeys":[{"name":"name","data_type":"TEXT","cardinality":"SINGLE"},{"name":"age","data_type":"INT","cardinality":"SINGLE"},{"name":"occupation","data_type":"TEXT","cardinality":"SINGLE"},{"name":"date","data_type":"TEXT","cardinality":
[...]
-### 输出示例:
-[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"律师"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"记者"}},{"label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]
+输出:
+{"vertices":[{"id":"1:Sarah","label":"person","properties":{"name":"Sarah","age":30,"occupation":"律师"}},{"id":"1:James","label":"person","properties":{"name":"James","occupation":"记者"}}],"edges":[{"label":"roommate","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}]}
"""
gremlin_generate_prompt_CN: str = """
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
index d464b80e..daade304 100644
---
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
+++
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
@@ -81,6 +81,7 @@ class Commit2Graph:
vertex_label_map = {v_label["name"]: v_label for v_label in
schema["vertexlabels"]}
edge_label_map = {e_label["name"]: e_label for e_label in
schema["edgelabels"]}
property_label_map = {p_label["name"]: p_label for p_label in
schema["propertykeys"]}
+ vid_mapping = {} # mapping from LLM-generated vertex ID to actual
server vertex ID
for vertex in vertices:
input_label = vertex["label"]
@@ -146,12 +147,24 @@ class Commit2Graph:
continue
# TODO: we could try batch add vertices first, setback to
single-mode if failed
- vid = self._handle_graph_creation(self.client.graph().addVertex,
input_label, input_properties).id
+ original_id = vertex.get("id")
+ if vertex_label.get("id_strategy") == "CUSTOMIZE_STRING" and
original_id:
+ result = self._handle_graph_creation(
+ self.client.graph().addVertex,
+ input_label,
+ input_properties,
+ id=original_id,
+ )
+ else:
+ result =
self._handle_graph_creation(self.client.graph().addVertex, input_label,
input_properties)
+ vid = result.id
vertex["id"] = vid
+ if original_id:
+ vid_mapping[original_id] = vid
for edge in edges:
- start = edge["outV"]
- end = edge["inV"]
+ start = vid_mapping.get(edge.get("outV"), edge.get("outV"))
+ end = vid_mapping.get(edge.get("inV"), edge.get("inV"))
label = edge["label"]
properties = edge["properties"]
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
index 31411d96..ec4f7f33 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py
@@ -120,6 +120,94 @@ class PropertyGraphExtract:
prompt = self.example_prompt + prompt
return self.llm.generate(prompt=prompt)
+ @staticmethod
+ def _primary_key_id(vertex_label, properties):
+ id_strategy = vertex_label.get("id_strategy")
+ if id_strategy and str(id_strategy).upper() != "PRIMARY_KEY":
+ return None
+ primary_keys = vertex_label.get("primary_keys", [])
+ if not primary_keys or "id" not in vertex_label:
+ return None
+ values = []
+ for key in primary_keys:
+ value = properties.get(key)
+ if value is None or value == "":
+ return None
+ values.append(str(value))
+ return f"{vertex_label['id']}:{'!'.join(values)}"
+
+ def _normalize_vertices(self, vertices, vertex_label_map):
+ vertex_id_map = {}
+ normalized_vertices = []
+ for vertex in vertices:
+ label = vertex["label"]
+ properties = vertex["properties"]
+ canonical_id = self._primary_key_id(vertex_label_map[label],
properties)
+ original_id = vertex.get("id")
+ if canonical_id is None:
+ if original_id:
+ vertex_id_map[(label, original_id)] = original_id
+ normalized_vertices.append(vertex)
+ continue
+
+ vertex["id"] = canonical_id
+ vertex_id_map[(label, canonical_id)] = canonical_id
+ if original_id:
+ vertex_id_map[(label, original_id)] = canonical_id
+ normalized_vertices.append(vertex)
+ return normalized_vertices, vertex_id_map
+
+ def _resolve_endpoint(self, edge, endpoint_key, label_key, legacy_key,
vertex_label_map, vertex_id_map):
+ endpoint = edge.get(endpoint_key)
+ label = edge.get(label_key)
+ if endpoint and label:
+ return vertex_id_map.get((label, endpoint)), label
+
+ legacy_endpoint = edge.get(legacy_key)
+ if not isinstance(legacy_endpoint, dict):
+ return None, label
+
+ label = legacy_endpoint.get("label")
+ properties = legacy_endpoint.get("properties", {})
+ if label not in vertex_label_map:
+ return None, label
+ canonical_id = self._primary_key_id(vertex_label_map[label],
properties)
+ return vertex_id_map.get((label, canonical_id)), label
+
+ def _normalize_edges(self, edges, edge_label_map, vertex_label_map,
vertex_id_map):
+ normalized_edges = []
+ for edge in edges:
+ edge_label = edge_label_map[edge["label"]]
+ out_v, out_v_label = self._resolve_endpoint(
+ edge,
+ "outV",
+ "outVLabel",
+ "source",
+ vertex_label_map,
+ vertex_id_map,
+ )
+ in_v, in_v_label = self._resolve_endpoint(
+ edge,
+ "inV",
+ "inVLabel",
+ "target",
+ vertex_label_map,
+ vertex_id_map,
+ )
+ if not out_v or not in_v:
+ log.warning("Invalid edge endpoints '%s' have been ignored.",
edge)
+ continue
+ if out_v_label != edge_label.get("source_label") or in_v_label !=
edge_label.get("target_label"):
+ log.warning("Invalid edge endpoint labels '%s' have been
ignored.", edge)
+ continue
+
+ edge["outV"] = out_v
+ edge["outVLabel"] = out_v_label
+ edge["inV"] = in_v
+ edge["inVLabel"] = in_v_label
+ normalized_edges.append(edge)
+ return normalized_edges
+
def _extract_and_filter_label(self, schema, text) -> List[Dict[str, Any]]:
# Strip markdown code blocks (e.g. ```json ... ```)
text = re.sub(r"```\w*\n?", "", text)
@@ -147,19 +235,25 @@ class PropertyGraphExtract:
return items
# Create sets for valid vertex and edge labels based on the schema
- vertex_label_set = {vertex["name"] for vertex in
schema["vertexlabels"]}
- edge_label_set = {edge["name"] for edge in schema["edgelabels"]}
+ vertex_label_map = {vertex["name"]: vertex for vertex in
schema["vertexlabels"]}
+ edge_label_map = {edge["name"]: edge for edge in
schema["edgelabels"]}
+ vertex_label_set = set(vertex_label_map)
+ edge_label_set = set(edge_label_map)
def process_items(item_list, valid_labels, item_type):
+ parsed_items = []
for item in item_list:
if not isinstance(item, dict):
log.warning("Invalid property graph item type '%s'.",
type(item))
continue
+ item = dict(item)
+ item_type_value = item.get("type", item_type)
+ item["type"] = item_type_value
if not self.NECESSARY_ITEM_KEYS.issubset(item.keys()):
log.warning("Invalid item keys '%s'.", item.keys())
continue
- if item["type"] != item_type:
- log.warning("Invalid %s type '%s' has been ignored.",
item_type, item["type"])
+ if item_type_value != item_type:
+ log.warning("Invalid %s type '%s' has been ignored.",
item_type, item_type_value)
continue
if item["label"] not in valid_labels:
log.warning(
@@ -168,10 +262,14 @@ class PropertyGraphExtract:
item["label"],
)
continue
- items.append(item)
+ parsed_items.append(item)
+ return parsed_items
- process_items(property_graph["vertices"], vertex_label_set,
"vertex")
- process_items(property_graph["edges"], edge_label_set, "edge")
+ vertex_items = process_items(property_graph["vertices"],
vertex_label_set, "vertex")
+ vertices, vertex_id_map = self._normalize_vertices(vertex_items,
vertex_label_map)
+ edge_items = process_items(property_graph["edges"],
edge_label_set, "edge")
+ edges = self._normalize_edges(edge_items, edge_label_map,
vertex_label_map, vertex_id_map)
+ items = vertices + edges
except json.JSONDecodeError:
log.critical("Invalid property graph JSON! Please check the
extracted JSON data carefully")
return items
diff --git
a/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json
b/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json
index f3bd33c3..3e7b17f4 100644
---
a/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json
+++
b/hugegraph-llm/src/hugegraph_llm/resources/prompt_examples/prompt_examples.json
@@ -3,24 +3,24 @@
"name": "Official Person-Relationship Extraction",
"description": "A standard template for extracting Person and Webpage
entities, along with their relationships (Roommate, Owns), from descriptive
text.",
"text": "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom
she's shared a home with since 2010. James, in his professional life, works as
a journalist. Additionally, Sarah is the proud owner of the website
www.sarahsplace.com.",
- "prompt": "## Main Task\nGiven the following graph schema and a piece of
text, your task is to analyze the text and extract information that fits into
the schema's structure, formatting the information into vertices and edges as
specified.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n-
\"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List
of edge labels, their source and target vertex labels, and properties]\n\n###
Content Rule:\nPlease read the pro [...]
+ "prompt": "## Main Task\nExtract only the vertices and edges supported by
the given graph schema and input text. Return valid JSON only.\n\n## Schema
Contract\nThe graph schema uses vertexlabels[], edgelabels[], and
propertykeys[]. Use vertexlabels[].id and primary_keys to build deterministic
vertex ids.\n\n## Output Contract\nReturn exactly one JSON object:
{\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex
id\",\"label\":\"vertex label\",\"properties\":{\"prop [...]
},
{
"name": "Traffic Accident Element Extraction",
"description": "Extracts key elements from a traffic accident report,
including persons involved, vehicles, and responsibilities.",
"text": "On March 15, 2024, John Smith, driving a red Porsche with license
plate NY-88888, collided with a scooter ridden by Mike Lee at the intersection
of People's Road and Liberation Road. The collision resulted in a fracture in
Mike Lee's right leg. The traffic police determined that John Smith was fully
responsible for running a red light.",
- "prompt": "## Main Task\nGiven the following graph schema and a piece of
text about a traffic accident, your task is to extract information that fits
into the schema's structure, formatting the information into vertices and edges
as specified.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n-
\"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List
of edge labels, their source and target vertex labels, and properties]\n\n###
Content Rule:\nPlease read the [...]
+ "prompt": "## Main Task\nExtract only the vertices and edges supported by
the given graph schema and input text. Return valid JSON only.\n\n## Schema
Contract\nThe graph schema uses vertexlabels[], edgelabels[], and
propertykeys[]. Use vertexlabels[].id and primary_keys to build deterministic
vertex ids.\n\n## Output Contract\nReturn exactly one JSON object:
{\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex
id\",\"label\":\"vertex label\",\"properties\":{\"prop [...]
},
{
"name": "Financial Event Extraction",
"description": "Extracts key financial information such as companies,
acquisition events, and amounts from financial news.",
"text": "Tech giant Company A announced yesterday that it will fully
acquire startup Company B, which operates in the artificial intelligence
sector, for a price of $2 billion. The acquisition is expected to be completed
by the end of the year.",
- "prompt": "## Main Task\nGiven the following graph schema and a piece of
financial news, your task is to extract information about corporate mergers and
acquisitions.\n\n## Basic Rules:\n### Schema Format:\nGraph Schema:\n-
\"vertices\": [List of vertex labels and their properties]\n- \"edges\": [List
of edge labels, their source and target vertex labels, and properties]\n\n###
Content Rule:\nPlease read the provided text carefully and identify any
information that corresponds to the [...]
+ "prompt": "## Main Task\nExtract only the vertices and edges supported by
the given graph schema and input text. Return valid JSON only.\n\n## Schema
Contract\nThe graph schema uses vertexlabels[], edgelabels[], and
propertykeys[]. Use vertexlabels[].id and primary_keys to build deterministic
vertex ids.\n\n## Output Contract\nReturn exactly one JSON object:
{\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex
id\",\"label\":\"vertex label\",\"properties\":{\"prop [...]
},
{
"name": "Medical Diagnosis Extraction",
"description": "Extracts patients, symptoms, diagnosis results, and
recommended drugs from medical record text.",
"text": "Patient Li Hua, presents with a headache and fever for three
days. After examination, the diagnosis is a viral cold. It is recommended to
take the drug 'Gankang' for treatment.",
- "prompt": "## Main Task\nGiven the following graph schema and a piece of
medical record, your task is to extract entities and relationships related to
diagnosis and treatment.\n\n## Basic Rules:\n### Schema Format:\nGraph
Schema:\n- \"vertices\": [List of vertex labels and their properties]\n-
\"edges\": [List of edge labels, their source and target vertex labels, and
properties]\n\n### Content Rule:\nPlease read the provided text carefully and
identify any information that correspon [...]
+ "prompt": "## Main Task\nExtract only the vertices and edges supported by
the given graph schema and input text. Return valid JSON only.\n\n## Schema
Contract\nThe graph schema uses vertexlabels[], edgelabels[], and
propertykeys[]. Use vertexlabels[].id and primary_keys to build deterministic
vertex ids.\n\n## Output Contract\nReturn exactly one JSON object:
{\"vertices\": [...], \"edges\": [...]}\nVertex object: {\"id\":\"vertex
id\",\"label\":\"vertex label\",\"properties\":{\"prop [...]
}
]
diff --git a/hugegraph-llm/src/tests/config/test_prompt_config.py
b/hugegraph-llm/src/tests/config/test_prompt_config.py
new file mode 100644
index 00000000..0a46b4c6
--- /dev/null
+++ b/hugegraph-llm/src/tests/config/test_prompt_config.py
@@ -0,0 +1,153 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import json
+from pathlib import Path
+from unittest.mock import MagicMock
+
+from hugegraph_llm.config.prompt_config import PromptConfig
+from hugegraph_llm.models.llms.base import BaseLLM
+from hugegraph_llm.operators.llm_op.property_graph_extract import
PropertyGraphExtract
+
+
+def _json_objects_after_marker(prompt, marker):
+ start = prompt.index(marker) + len(marker)
+ decoder = json.JSONDecoder()
+ objects = []
+ index = start
+ while True:
+ index = prompt.find("{", index)
+ if index == -1:
+ return objects
+ try:
+ value, end = decoder.raw_decode(prompt[index:])
+ except json.JSONDecodeError:
+ index += 1
+ continue
+ objects.append(value)
+ index += end
+
+
+def _example_schema_and_output(prompt, example_marker):
+ objects = _json_objects_after_marker(prompt, example_marker)
+ schema = next(obj for obj in objects if "vertexlabels" in obj and
"edgelabels" in obj)
+ output = next(obj for obj in objects if "vertices" in obj and "edges" in
obj)
+ return schema, output
+
+
+def _assert_prompt_example_contract(prompt, example_marker):
+ schema, output = _example_schema_and_output(prompt, example_marker)
+ _assert_output_matches_schema_contract(schema, output)
+
+
+def _assert_output_matches_schema_contract(schema, output):
+ assert set(output) == {"vertices", "edges"}
+ assert output["vertices"]
+ assert output["edges"]
+
+ vertex_ids = {vertex["id"] for vertex in output["vertices"]}
+ vertex_labels = {vertex["label"] for vertex in output["vertices"]}
+ schema_vertices = {vertex["name"]: vertex for vertex in
schema["vertexlabels"]}
+ schema_edges = {edge["name"]: edge for edge in schema["edgelabels"]}
+
+ for vertex in output["vertices"]:
+ assert set(vertex) == {"id", "label", "properties"}
+ schema_vertex = schema_vertices[vertex["label"]]
+ primary_values = [str(vertex["properties"][key]) for key in
schema_vertex["primary_keys"]]
+ expected_id = f"{schema_vertex['id']}:{'!'.join(primary_values)}"
+ assert vertex["id"] == expected_id
+ assert not vertex["id"].startswith(f"{vertex['label']}:")
+ assert isinstance(vertex["properties"], dict)
+
+ for edge in output["edges"]:
+ assert set(edge) == {"label", "outV", "outVLabel", "inV", "inVLabel",
"properties"}
+ assert edge["label"] in schema_edges
+ assert edge["outV"] in vertex_ids
+ assert edge["inV"] in vertex_ids
+ assert edge["outVLabel"] in vertex_labels
+ assert edge["inVLabel"] in vertex_labels
+ assert edge["outVLabel"] == schema_edges[edge["label"]]["source_label"]
+ assert edge["inVLabel"] == schema_edges[edge["label"]]["target_label"]
+ assert isinstance(edge["properties"], dict)
+
+ extractor = PropertyGraphExtract(llm=MagicMock(spec=BaseLLM))
+ parsed_items = extractor._extract_and_filter_label(schema,
json.dumps(output))
+ assert {item["type"] for item in parsed_items} == {"vertex", "edge"}
+ assert len(parsed_items) == len(output["vertices"]) + len(output["edges"])
+
+
+def test_extract_graph_prompt_en_example_matches_parser_contract():
+ _assert_prompt_example_contract(PromptConfig.extract_graph_prompt_EN, "##
Example")
+
+
+def test_extract_graph_prompt_cn_example_matches_parser_contract():
+ _assert_prompt_example_contract(PromptConfig.extract_graph_prompt_CN, "##
示例")
+
+
+def test_extract_graph_prompt_example_contract_rejects_label_name_vertex_id():
+ schema, output =
_example_schema_and_output(PromptConfig.extract_graph_prompt_EN, "## Example")
+ output["vertices"][0]["id"] = "person:Sarah"
+
+ try:
+ _assert_output_matches_schema_contract(schema, output)
+ except AssertionError:
+ return
+
+ raise AssertionError("Prompt example contract accepted a label-name vertex
id")
+
+
+def
test_extract_graph_prompt_example_contract_rejects_dangling_edge_reference():
+ schema, output =
_example_schema_and_output(PromptConfig.extract_graph_prompt_EN, "## Example")
+ output["edges"][0]["outV"] = "1:Missing"
+
+ try:
+ _assert_output_matches_schema_contract(schema, output)
+ except AssertionError:
+ return
+
+ raise AssertionError("Prompt example contract accepted an edge reference
outside vertices")
+
+
+def test_prompt_examples_match_extraction_contract():
+ examples_path = (
+ Path(__file__).parents[2] / "hugegraph_llm" / "resources" /
"prompt_examples" / "prompt_examples.json"
+ )
+ examples = json.loads(examples_path.read_text(encoding="utf-8"))
+
+ for example in examples:
+ prompt = example["prompt"]
+ assert '"type":"vertex"' not in prompt
+ assert '"type":"edge"' not in prompt
+ _assert_prompt_example_contract(prompt, "## Example")
+
+
+def test_prompt_examples_use_matching_domain_examples():
+ examples_path = (
+ Path(__file__).parents[2] / "hugegraph_llm" / "resources" /
"prompt_examples" / "prompt_examples.json"
+ )
+ examples = json.loads(examples_path.read_text(encoding="utf-8"))
+ domain_markers = {
+ "Official Person-Relationship Extraction": ["Sarah", "James"],
+ "Traffic Accident Element Extraction": ["John Smith", "NY-88888"],
+ "Financial Event Extraction": ["Company A", "$2 billion"],
+ "Medical Diagnosis Extraction": ["Li Hua", "Gankang"],
+ }
+
+ for example in examples:
+ prompt = example["prompt"]
+ for marker in domain_markers[example["name"]]:
+ assert marker in prompt
diff --git
a/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py
b/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py
index 634fdb96..b13f9042 100644
--- a/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py
+++ b/hugegraph-llm/src/tests/operators/hugegraph_op/test_commit_to_hugegraph.py
@@ -22,6 +22,7 @@ from unittest.mock import MagicMock, patch
from pyhugegraph.utils.exceptions import CreateError, NotFoundError
from hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph import
Commit2Graph
+from hugegraph_llm.operators.llm_op.property_graph_extract import
PropertyGraphExtract
class TestCommit2Graph(unittest.TestCase):
@@ -49,6 +50,7 @@ class TestCommit2Graph(unittest.TestCase):
],
"vertexlabels": [
{
+ "id": 1,
"name": "person",
"properties": ["name", "age"],
"primary_keys": ["name"],
@@ -56,6 +58,7 @@ class TestCommit2Graph(unittest.TestCase):
"id_strategy": "PRIMARY_KEY",
},
{
+ "id": 2,
"name": "movie",
"properties": ["title", "year"],
"primary_keys": ["title"],
@@ -351,6 +354,173 @@ class TestCommit2Graph(unittest.TestCase):
# Verify that _handle_graph_creation was called for each vertex and
edge
self.assertEqual(mock_handle_graph_creation.call_count, 3) # 2
vertices + 1 edge
+
@patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation")
+ def test_load_into_graph_maps_llm_vertex_ids_to_created_vertex_ids(self,
mock_handle_graph_creation):
+ """Test edges use server-created vertex ids when LLM ids differ."""
+ mock_handle_graph_creation.side_effect = [
+ MagicMock(id="1:Tom Hanks"),
+ MagicMock(id="2:Forrest Gump"),
+ MagicMock(id="edge_id"),
+ ]
+
+ vertices = [
+ {
+ "id": "person:Tom Hanks",
+ "label": "person",
+ "properties": {"name": "Tom Hanks", "age": 67},
+ },
+ {
+ "id": "movie:Forrest Gump",
+ "label": "movie",
+ "properties": {"title": "Forrest Gump", "year": 1994},
+ },
+ ]
+ edges = [
+ {
+ "label": "acted_in",
+ "properties": {"role": "Forrest Gump"},
+ "outV": "person:Tom Hanks",
+ "inV": "movie:Forrest Gump",
+ }
+ ]
+
+ self.commit2graph.load_into_graph(vertices, edges, self.schema)
+
+ self.assertEqual(vertices[0]["id"], "1:Tom Hanks")
+ self.assertEqual(vertices[1]["id"], "2:Forrest Gump")
+ mock_handle_graph_creation.assert_any_call(
+ self.commit2graph.client.graph().addEdge,
+ "acted_in",
+ "1:Tom Hanks",
+ "2:Forrest Gump",
+ {"role": "Forrest Gump"},
+ )
+
+
@patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation")
+ def test_load_into_graph_uses_explicit_customize_string_ids(self,
mock_handle_graph_creation):
+ """Test custom string ids are passed to HugeGraph when schema requires
them."""
+ mock_handle_graph_creation.side_effect = [
+ MagicMock(id="Tom Hanks"),
+ MagicMock(id="Forrest Gump"),
+ MagicMock(id="edge_id"),
+ ]
+ schema = {
+ "propertykeys": [
+ {"name": "name", "data_type": "TEXT", "cardinality": "SINGLE"},
+ {"name": "title", "data_type": "TEXT", "cardinality":
"SINGLE"},
+ ],
+ "vertexlabels": [
+ {
+ "id": 7,
+ "name": "person",
+ "id_strategy": "CUSTOMIZE_STRING",
+ "primary_keys": ["name"],
+ "properties": ["name"],
+ "nullable_keys": [],
+ },
+ {
+ "id": 8,
+ "name": "movie",
+ "id_strategy": "CUSTOMIZE_STRING",
+ "primary_keys": ["title"],
+ "properties": ["title"],
+ "nullable_keys": [],
+ },
+ ],
+ "edgelabels": [{"name": "acted_in", "properties": [],
"source_label": "person", "target_label": "movie"}],
+ }
+ vertices = [
+ {"id": "Tom Hanks", "label": "person", "properties": {"name": "Tom
Hanks"}},
+ {"id": "Forrest Gump", "label": "movie", "properties": {"title":
"Forrest Gump"}},
+ ]
+ edges = [
+ {
+ "label": "acted_in",
+ "properties": {},
+ "outV": "Tom Hanks",
+ "inV": "Forrest Gump",
+ }
+ ]
+
+ self.commit2graph.load_into_graph(vertices, edges, schema)
+
+ mock_handle_graph_creation.assert_any_call(
+ self.commit2graph.client.graph().addVertex,
+ "person",
+ {"name": "Tom Hanks"},
+ id="Tom Hanks",
+ )
+ mock_handle_graph_creation.assert_any_call(
+ self.commit2graph.client.graph().addVertex,
+ "movie",
+ {"title": "Forrest Gump"},
+ id="Forrest Gump",
+ )
+ mock_handle_graph_creation.assert_any_call(
+ self.commit2graph.client.graph().addEdge,
+ "acted_in",
+ "Tom Hanks",
+ "Forrest Gump",
+ {},
+ )
+
+
@patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation")
+ def
test_load_into_graph_accepts_normalized_extraction_without_item_type(self,
mock_handle_graph_creation):
+ """Test normalized LLM output without type fields can be committed."""
+ mock_handle_graph_creation.side_effect = [
+ MagicMock(id="1:Tom Hanks"),
+ MagicMock(id="2:Forrest Gump"),
+ MagicMock(id="edge_id"),
+ ]
+ llm_output = """{
+ "vertices": [
+ {
+ "id": "person:Tom Hanks",
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks",
+ "age": 67
+ }
+ },
+ {
+ "id": "movie:Forrest Gump",
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump",
+ "year": 1994
+ }
+ }
+ ],
+ "edges": [
+ {
+ "label": "acted_in",
+ "outV": "person:Tom Hanks",
+ "outVLabel": "person",
+ "inV": "movie:Forrest Gump",
+ "inVLabel": "movie",
+ "properties": {
+ "role": "Forrest Gump"
+ }
+ }
+ ]
+ }"""
+
+ items =
PropertyGraphExtract(llm=MagicMock())._extract_and_filter_label(self.schema,
llm_output)
+ vertices = [item for item in items if item["type"] == "vertex"]
+ edges = [item for item in items if item["type"] == "edge"]
+ self.assertEqual(edges[0]["outV"], "1:Tom Hanks")
+ self.assertEqual(edges[0]["inV"], "2:Forrest Gump")
+
+ self.commit2graph.load_into_graph(vertices, edges, self.schema)
+
+ mock_handle_graph_creation.assert_any_call(
+ self.commit2graph.client.graph().addEdge,
+ "acted_in",
+ "1:Tom Hanks",
+ "2:Forrest Gump",
+ {"role": "Forrest Gump"},
+ )
+
@patch("hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph.Commit2Graph._handle_graph_creation")
def test_load_into_graph_with_data_type_validation_failure(self,
mock_handle_graph_creation):
"""Test load_into_graph method with data type validation failure."""
diff --git
a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
index 7c84de15..3eb49026 100644
--- a/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
+++ b/hugegraph-llm/src/tests/operators/llm_op/test_property_graph_extract.py
@@ -39,19 +39,23 @@ class TestPropertyGraphExtract(unittest.TestCase):
self.schema = {
"vertexlabels": [
{
+ "id": 1,
"name": "person",
"primary_keys": ["name"],
"nullable_keys": ["age"],
"properties": ["name", "age"],
},
{
+ "id": 2,
"name": "movie",
"primary_keys": ["title"],
"nullable_keys": ["year"],
"properties": ["title", "year"],
},
],
- "edgelabels": [{"name": "acted_in", "properties": ["role"]}],
+ "edgelabels": [
+ {"name": "acted_in", "properties": ["role"], "source_label":
"person", "target_label": "movie"}
+ ],
}
# Sample text chunks
@@ -77,6 +81,13 @@ class TestPropertyGraphExtract(unittest.TestCase):
}""",
"""{
"vertices": [
+ {
+ "type": "vertex",
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
{
"type": "vertex",
"label": "movie",
@@ -194,11 +205,13 @@ class TestPropertyGraphExtract(unittest.TestCase):
result = extractor._extract_and_filter_label(self.schema, text)
- self.assertEqual(len(result), 2)
+ self.assertEqual(len(result), 3)
self.assertEqual(result[0]["type"], "vertex")
- self.assertEqual(result[0]["label"], "movie")
- self.assertEqual(result[1]["type"], "edge")
- self.assertEqual(result[1]["label"], "acted_in")
+ self.assertEqual(result[0]["label"], "person")
+ self.assertEqual(result[1]["type"], "vertex")
+ self.assertEqual(result[1]["label"], "movie")
+ self.assertEqual(result[2]["type"], "edge")
+ self.assertEqual(result[2]["label"], "acted_in")
def test_extract_and_filter_label_markdown_json(self):
"""Test _extract_and_filter_label with JSON wrapped in markdown
fences."""
@@ -209,11 +222,13 @@ class TestPropertyGraphExtract(unittest.TestCase):
result = extractor._extract_and_filter_label(self.schema, text)
- self.assertEqual(len(result), 2)
+ self.assertEqual(len(result), 3)
self.assertEqual(result[0]["type"], "vertex")
- self.assertEqual(result[0]["label"], "movie")
- self.assertEqual(result[1]["type"], "edge")
- self.assertEqual(result[1]["label"], "acted_in")
+ self.assertEqual(result[0]["label"], "person")
+ self.assertEqual(result[1]["type"], "vertex")
+ self.assertEqual(result[1]["label"], "movie")
+ self.assertEqual(result[2]["type"], "edge")
+ self.assertEqual(result[2]["label"], "acted_in")
def test_extract_and_filter_label_markdown_json_with_prose(self):
"""Test fenced JSON can be parsed when the LLM adds prose."""
@@ -226,11 +241,13 @@ Hope this helps."""
result = extractor._extract_and_filter_label(self.schema, text)
- self.assertEqual(len(result), 2)
+ self.assertEqual(len(result), 3)
self.assertEqual(result[0]["type"], "vertex")
- self.assertEqual(result[0]["label"], "movie")
- self.assertEqual(result[1]["type"], "edge")
- self.assertEqual(result[1]["label"], "acted_in")
+ self.assertEqual(result[0]["label"], "person")
+ self.assertEqual(result[1]["type"], "vertex")
+ self.assertEqual(result[1]["label"], "movie")
+ self.assertEqual(result[2]["type"], "edge")
+ self.assertEqual(result[2]["label"], "acted_in")
def test_extract_and_filter_label_flat_array_json(self):
"""Test _extract_and_filter_label converts flat arrays to vertices and
edges."""
@@ -244,6 +261,13 @@ Hope this helps."""
"name": "Tom Hanks"
}
},
+ {
+ "type": "vertex",
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ },
{
"type": "edge",
"label": "acted_in",
@@ -268,11 +292,13 @@ Hope this helps."""
result = extractor._extract_and_filter_label(self.schema, text)
- self.assertEqual(len(result), 2)
+ self.assertEqual(len(result), 3)
self.assertEqual(result[0]["type"], "vertex")
self.assertEqual(result[0]["label"], "person")
- self.assertEqual(result[1]["type"], "edge")
- self.assertEqual(result[1]["label"], "acted_in")
+ self.assertEqual(result[1]["type"], "vertex")
+ self.assertEqual(result[1]["label"], "movie")
+ self.assertEqual(result[2]["type"], "edge")
+ self.assertEqual(result[2]["label"], "acted_in")
def test_extract_and_filter_label_flat_array_filters_invalid_items(self):
"""Test flat arrays keep valid graph items and drop invalid ones."""
@@ -285,6 +311,13 @@ Hope this helps."""
"name": "Tom Hanks"
}
},
+ {
+ "type": "vertex",
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ },
{
"type": "vertex",
"label": "unknown_label",
@@ -326,11 +359,13 @@ Hope this helps."""
result = extractor._extract_and_filter_label(self.schema, text)
- self.assertEqual(len(result), 2)
+ self.assertEqual(len(result), 3)
self.assertEqual(result[0]["type"], "vertex")
self.assertEqual(result[0]["label"], "person")
- self.assertEqual(result[1]["type"], "edge")
- self.assertEqual(result[1]["label"], "acted_in")
+ self.assertEqual(result[1]["type"], "vertex")
+ self.assertEqual(result[1]["label"], "movie")
+ self.assertEqual(result[2]["type"], "edge")
+ self.assertEqual(result[2]["label"], "acted_in")
def test_extract_and_filter_label_malformed_fenced_json(self):
"""Test malformed fenced JSON returns no graph items."""
@@ -354,6 +389,432 @@ Hope this helps."""
self.assertEqual(result, [])
+ def test_extract_and_filter_label_infers_type_from_grouped_arrays(self):
+ """Infer item type from vertices/edges containers when LLM omits it."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """{
+ "vertices": [
+ {
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ {
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ ],
+ "edges": [
+ {
+ "label": "acted_in",
+ "properties": {
+ "role": "Forrest Gump"
+ },
+ "source": {
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ "target": {
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ }
+ ]
+ }"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(len(result), 3)
+ self.assertEqual(result[0]["type"], "vertex")
+ self.assertEqual(result[0]["label"], "person")
+ self.assertEqual(result[1]["type"], "vertex")
+ self.assertEqual(result[1]["label"], "movie")
+ self.assertEqual(result[2]["type"], "edge")
+ self.assertEqual(result[2]["label"], "acted_in")
+
+ def test_extract_and_filter_label_normalizes_primary_key_ids(self):
+ """Normalize LLM vertex ids to schema-derived primary-key ids."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """{
+ "vertices": [
+ {
+ "id": "person:Tom Hanks",
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ {
+ "id": "movie:Forrest Gump",
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ ],
+ "edges": [
+ {
+ "label": "acted_in",
+ "outV": "person:Tom Hanks",
+ "outVLabel": "person",
+ "inV": "movie:Forrest Gump",
+ "inVLabel": "movie",
+ "properties": {
+ "role": "Forrest Gump"
+ }
+ }
+ ]
+ }"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(result[0]["id"], "1:Tom Hanks")
+ self.assertEqual(result[1]["id"], "2:Forrest Gump")
+ self.assertEqual(result[2]["outV"], "1:Tom Hanks")
+ self.assertEqual(result[2]["inV"], "2:Forrest Gump")
+
+ def test_extract_and_filter_label_keeps_canonical_primary_key_ids(self):
+ """Keep already-canonical vertex and edge ids intact."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """{
+ "vertices": [
+ {
+ "id": "1:Tom Hanks",
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ {
+ "id": "2:Forrest Gump",
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ ],
+ "edges": [
+ {
+ "label": "acted_in",
+ "outV": "1:Tom Hanks",
+ "outVLabel": "person",
+ "inV": "2:Forrest Gump",
+ "inVLabel": "movie",
+ "properties": {
+ "role": "Forrest Gump"
+ }
+ }
+ ]
+ }"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(result[0]["id"], "1:Tom Hanks")
+ self.assertEqual(result[1]["id"], "2:Forrest Gump")
+ self.assertEqual(result[2]["outV"], "1:Tom Hanks")
+ self.assertEqual(result[2]["inV"], "2:Forrest Gump")
+
+ def
test_extract_and_filter_label_normalizes_multiple_primary_key_ids(self):
+ """Normalize multi-primary-key vertex ids in schema primary-key
order."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ schema = {
+ "vertexlabels": [
+ {
+ "id": 3,
+ "name": "character",
+ "primary_keys": ["name", "universe"],
+ "nullable_keys": [],
+ "properties": ["name", "universe"],
+ }
+ ],
+ "edgelabels": [],
+ }
+ text = """{
+ "vertices": [
+ {
+ "id": "character:Tom!movie",
+ "label": "character",
+ "properties": {
+ "name": "Tom",
+ "universe": "movie"
+ }
+ }
+ ],
+ "edges": []
+ }"""
+
+ result = extractor._extract_and_filter_label(schema, text)
+
+ self.assertEqual(result[0]["id"], "3:Tom!movie")
+
+ def test_extract_and_filter_label_resolves_source_target_edge_refs(self):
+ """Resolve source/target edge endpoints to canonical outV/inV ids."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """{
+ "vertices": [
+ {
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ {
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ ],
+ "edges": [
+ {
+ "label": "acted_in",
+ "properties": {
+ "role": "Forrest Gump"
+ },
+ "source": {
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ "target": {
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ }
+ ]
+ }"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(result[0]["id"], "1:Tom Hanks")
+ self.assertEqual(result[1]["id"], "2:Forrest Gump")
+ self.assertEqual(result[2]["outV"], "1:Tom Hanks")
+ self.assertEqual(result[2]["outVLabel"], "person")
+ self.assertEqual(result[2]["inV"], "2:Forrest Gump")
+ self.assertEqual(result[2]["inVLabel"], "movie")
+
+ def
test_extract_and_filter_label_drops_edges_with_unresolved_endpoints(self):
+ """Drop edges whose endpoints cannot be resolved before commit."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """{
+ "vertices": [
+ {
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ }
+ ],
+ "edges": [
+ {
+ "label": "acted_in",
+ "outV": "person:Missing",
+ "outVLabel": "person",
+ "inV": "movie:Missing",
+ "inVLabel": "movie",
+ "properties": {
+ "role": "Forrest Gump"
+ }
+ }
+ ]
+ }"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(len(result), 1)
+ self.assertEqual(result[0]["type"], "vertex")
+
+ def
test_extract_and_filter_label_drops_legacy_edges_with_missing_vertices(self):
+ """Drop legacy source/target edges unless both endpoints are emitted
as vertices."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """{
+ "vertices": [
+ {
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ }
+ ],
+ "edges": [
+ {
+ "label": "acted_in",
+ "properties": {
+ "role": "Forrest Gump"
+ },
+ "source": {
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ "target": {
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ }
+ ]
+ }"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(len(result), 1)
+ self.assertEqual(result[0]["type"], "vertex")
+
+ def test_extract_and_filter_label_keeps_explicit_custom_ids(self):
+ """Keep self-consistent explicit ids when schema cannot derive
primary-key ids."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ schema = {
+ "vertexlabels": [
+ {"name": "person", "id_strategy": "CUSTOMIZE_STRING",
"properties": ["name"], "nullable_keys": []},
+ {"name": "movie", "id_strategy": "CUSTOMIZE_STRING",
"properties": ["title"], "nullable_keys": []},
+ ],
+ "edgelabels": [{"name": "acted_in", "properties": [],
"source_label": "person", "target_label": "movie"}],
+ }
+ text = """{
+ "vertices": [
+ {
+ "id": "Tom Hanks",
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ {
+ "id": "Forrest Gump",
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ ],
+ "edges": [
+ {
+ "label": "acted_in",
+ "outV": "Tom Hanks",
+ "outVLabel": "person",
+ "inV": "Forrest Gump",
+ "inVLabel": "movie",
+ "properties": {}
+ }
+ ]
+ }"""
+
+ result = extractor._extract_and_filter_label(schema, text)
+
+ self.assertEqual(len(result), 3)
+ self.assertEqual(result[2]["outV"], "Tom Hanks")
+ self.assertEqual(result[2]["inV"], "Forrest Gump")
+
+ def
test_extract_and_filter_label_keeps_explicit_custom_ids_with_label_metadata(self):
+ """Do not rewrite custom ids even when schema includes ids and primary
keys."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ schema = {
+ "vertexlabels": [
+ {
+ "id": 7,
+ "name": "person",
+ "id_strategy": "CUSTOMIZE_STRING",
+ "primary_keys": ["name"],
+ "properties": ["name"],
+ "nullable_keys": [],
+ },
+ {
+ "id": 8,
+ "name": "movie",
+ "id_strategy": "CUSTOMIZE_STRING",
+ "primary_keys": ["title"],
+ "properties": ["title"],
+ "nullable_keys": [],
+ },
+ ],
+ "edgelabels": [{"name": "acted_in", "properties": [],
"source_label": "person", "target_label": "movie"}],
+ }
+ text = """{
+ "vertices": [
+ {
+ "id": "Tom Hanks",
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ {
+ "id": "Forrest Gump",
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ ],
+ "edges": [
+ {
+ "label": "acted_in",
+ "outV": "Tom Hanks",
+ "outVLabel": "person",
+ "inV": "Forrest Gump",
+ "inVLabel": "movie",
+ "properties": {}
+ }
+ ]
+ }"""
+
+ result = extractor._extract_and_filter_label(schema, text)
+
+ self.assertEqual(result[0]["id"], "Tom Hanks")
+ self.assertEqual(result[1]["id"], "Forrest Gump")
+ self.assertEqual(result[2]["outV"], "Tom Hanks")
+ self.assertEqual(result[2]["inV"], "Forrest Gump")
+
+ def
test_extract_and_filter_label_drops_edges_with_mismatched_endpoint_labels(self):
+ """Drop edges whose endpoint labels conflict with the edge schema."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """{
+ "vertices": [
+ {
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ },
+ {
+ "label": "movie",
+ "properties": {
+ "title": "Forrest Gump"
+ }
+ }
+ ],
+ "edges": [
+ {
+ "label": "acted_in",
+ "outV": "1:Tom Hanks",
+ "outVLabel": "movie",
+ "inV": "2:Forrest Gump",
+ "inVLabel": "person",
+ "properties": {
+ "role": "Forrest Gump"
+ }
+ }
+ ]
+ }"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(len(result), 2)
+ self.assertTrue(all(item["type"] == "vertex" for item in result))
+
def test_extract_and_filter_label_invalid_json(self):
"""Test the _extract_and_filter_label method with invalid JSON."""
extractor = PropertyGraphExtract(llm=self.mock_llm)
@@ -387,6 +848,34 @@ Hope this helps."""
self.assertEqual(result, [])
+ def test_extract_and_filter_label_rejects_explicit_type_mismatch(self):
+ """Do not override an explicit item type that conflicts with its
container."""
+ extractor = PropertyGraphExtract(llm=self.mock_llm)
+ text = """{
+ "vertices": [
+ {
+ "type": "edge",
+ "label": "person",
+ "properties": {
+ "name": "Tom Hanks"
+ }
+ }
+ ],
+ "edges": [
+ {
+ "type": "vertex",
+ "label": "acted_in",
+ "properties": {
+ "role": "Forrest Gump"
+ }
+ }
+ ]
+ }"""
+
+ result = extractor._extract_and_filter_label(self.schema, text)
+
+ self.assertEqual(result, [])
+
def test_extract_and_filter_label_invalid_label(self):
"""Test the _extract_and_filter_label method with invalid label."""
extractor = PropertyGraphExtract(llm=self.mock_llm)
@@ -446,13 +935,13 @@ Hope this helps."""
self.assertEqual(extractor.extract_property_graph_by_llm.call_count, 2)
# Verify the results
- self.assertEqual(len(result["vertices"]), 2)
+ self.assertEqual(len(result["vertices"]), 3)
self.assertEqual(len(result["edges"]), 1)
self.assertEqual(result["call_count"], 2)
# Check vertex properties
self.assertEqual(result["vertices"][0]["properties"]["name"], "Tom
Hanks")
- self.assertEqual(result["vertices"][1]["properties"]["title"],
"Forrest Gump")
+ self.assertEqual(result["vertices"][2]["properties"]["title"],
"Forrest Gump")
# Check edge properties
self.assertEqual(result["edges"][0]["properties"]["role"], "Forrest
Gump")
@@ -490,7 +979,7 @@ Hope this helps."""
result = extractor.run(context)
# Verify the results
- self.assertEqual(len(result["vertices"]), 3) # 1 existing + 2 new
+ self.assertEqual(len(result["vertices"]), 4) # 1 existing + 3 new
self.assertEqual(len(result["edges"]), 2) # 1 existing + 1 new
self.assertEqual(result["call_count"], 2)