This is an automated email from the ASF dual-hosted git repository.
jin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new f8b4582 fix(llm): correct e_cache type (#117)
f8b4582 is described below
commit f8b4582ee39f13e5f8734caa2e608082a685d91c
Author: chenzihong <[email protected]>
AuthorDate: Thu Nov 21 14:46:49 2024 +0800
fix(llm): correct e_cache type (#117)
- [X] fix edge cache by using (inV, label, outV) as the key instead of the label alone
- [X] limit node and edge property length
- [X] handle isolated node problem
---------
Co-authored-by: imbajin <[email protected]>
---
.../src/hugegraph_llm/config/config_data.py | 1 +
.../src/hugegraph_llm/operators/graph_rag_task.py | 7 ++-
.../operators/hugegraph_op/graph_rag_query.py | 69 +++++++++++++---------
3 files changed, 49 insertions(+), 28 deletions(-)
diff --git a/hugegraph-llm/src/hugegraph_llm/config/config_data.py
b/hugegraph-llm/src/hugegraph_llm/config/config_data.py
index 004a29e..52b41dd 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/config_data.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/config_data.py
@@ -70,6 +70,7 @@ class ConfigData:
graph_user: Optional[str] = "admin"
graph_pwd: Optional[str] = "xxx"
graph_space: Optional[str] = None
+ limit_property: Optional[str] = "False"
"""Admin settings"""
enable_login: Optional[str] = "False"
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
index e6da8e0..789ec20 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
@@ -117,6 +117,8 @@ class RAGPipeline:
self,
max_deep: int = 2,
max_items: int = 30,
+ max_v_prop_len: int = 2048,
+ max_e_prop_len: int = 256,
prop_to_match: Optional[str] = None,
):
"""
@@ -124,11 +126,14 @@ class RAGPipeline:
:param max_deep: Maximum depth for the graph query.
:param max_items: Maximum number of items to retrieve.
+ :param max_v_prop_len: Maximum length of vertex properties.
+ :param max_e_prop_len: Maximum length of edge properties.
:param prop_to_match: Property to match in the graph.
:return: Self-instance for chaining.
"""
self._operators.append(
- GraphRAGQuery(max_deep=max_deep, max_items=max_items,
prop_to_match=prop_to_match)
+ GraphRAGQuery(max_deep=max_deep, max_items=max_items,
max_v_prop_len=max_v_prop_len,
+ max_e_prop_len=max_e_prop_len,
prop_to_match=prop_to_match)
)
return self
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
index 3f11c91..a3dc1ad 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
@@ -71,7 +71,8 @@ g.V().has('{prop}', within({keywords}))
class GraphRAGQuery:
- def __init__(self, max_deep: int = 2, max_items: int = 20, prop_to_match:
Optional[str] = None):
+ def __init__(self, max_deep: int = 2, max_items: int = 20, max_v_prop_len:
int = 2048,
+ max_e_prop_len: int = 256, prop_to_match: Optional[str] =
None):
self._client = PyHugeClient(
settings.graph_ip,
settings.graph_port,
@@ -84,6 +85,9 @@ class GraphRAGQuery:
self._max_items = max_items
self._prop_to_match = prop_to_match
self._schema = ""
+ self._limit_property = settings.limit_property.lower() == "true"
+ self._max_v_prop_len = max_v_prop_len
+ self._max_e_prop_len = max_e_prop_len
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
# pylint: disable=R0915 (too-many-statements)
@@ -138,6 +142,10 @@ class GraphRAGQuery:
graph_chain_knowledge, vertex_degree_list, knowledge_with_degree =
self._format_graph_query_result(
query_paths=paths
)
+
+ # TODO: we may need to optimize the logic here with global
deduplication (may lack some single vertex)
+ if not graph_chain_knowledge:
+ graph_chain_knowledge.update(vertex_knowledge)
if vertex_degree_list:
vertex_degree_list[0].update(vertex_knowledge)
else:
@@ -171,8 +179,7 @@ class GraphRAGQuery:
"extracted based on key entities as subject:\n"
)
# TODO: set color for ↓ "\033[93mKnowledge from Graph:\033[0m"
- log.debug("Knowledge from Graph:")
- log.debug("\n".join(context["graph_result"]))
+ log.debug("Knowledge from Graph:\n%s",
"\n".join(context["graph_result"]))
return context
def _format_graph_from_vertex(self, query_result: List[Any]) -> Set[str]:
@@ -189,20 +196,20 @@ class GraphRAGQuery:
subgraph_with_degree = {}
vertex_degree_list: List[Set[str]] = []
v_cache: Set[str] = set()
- e_cache: Set[str] = set()
+ e_cache: Set[Tuple[str, str, str]] = set()
for path in query_paths:
# 1. Process each path
- flat_rel, nodes_with_degree = self._process_path(path,
use_id_to_match, v_cache, e_cache)
- subgraph.add(flat_rel)
- subgraph_with_degree[flat_rel] = nodes_with_degree
+ path_str, vertex_with_degree = self._process_path(path,
use_id_to_match, v_cache, e_cache)
+ subgraph.add(path_str)
+ subgraph_with_degree[path_str] = vertex_with_degree
# 2. Update vertex degree list
- self._update_vertex_degree_list(vertex_degree_list,
nodes_with_degree)
+ self._update_vertex_degree_list(vertex_degree_list,
vertex_with_degree)
return subgraph, vertex_degree_list, subgraph_with_degree
def _process_path(self, path: Any, use_id_to_match: bool, v_cache:
Set[str],
- e_cache: Set[str]) -> Tuple[str, List[str]]:
+ e_cache: Set[Tuple[str, str, str]]) -> Tuple[str,
List[str]]:
flat_rel = ""
raw_flat_rel = path["objects"]
assert len(raw_flat_rel) % 2 == 1, "The length of raw_flat_rel should
be odd."
@@ -222,7 +229,7 @@ class GraphRAGQuery:
else:
# Process each edge
flat_rel, prior_edge_str_len = self._process_edge(
- item, flat_rel, prior_edge_str_len, raw_flat_rel,
i,use_id_to_match, e_cache
+ item, flat_rel, raw_flat_rel, i, use_id_to_match, e_cache
)
return flat_rel, nodes_with_degree
@@ -236,40 +243,40 @@ class GraphRAGQuery:
return flat_rel, prior_edge_str_len, depth
node_cache.add(matched_str)
- props_str = ", ".join(f"{k}: {v}" for k, v in item["props"].items() if
v)
+ props_str = ", ".join(f"{k}: {self._limit_property_query(v, 'v')}"
+ for k, v in item["props"].items() if v)
+
# TODO: we may remove label id or replace with label name
if matched_str in v_cache:
node_str = matched_str
else:
v_cache.add(matched_str)
node_str = f"{item['id']}{{{props_str}}}"
+
flat_rel += node_str
nodes_with_degree.append(node_str)
depth += 1
-
return flat_rel, prior_edge_str_len, depth
- def _process_edge(self, item: Any, flat_rel: str, prior_edge_str_len: int,
- raw_flat_rel: List[Any], i: int, use_id_to_match: bool,
e_cache: Set[str]) -> Tuple[str, int]:
- props_str = ", ".join(f"{k}: {v}" for k, v in item["props"].items() if
v)
- props_str = f"{{{props_str}}}" if len(props_str) > 0 else ""
+ def _process_edge(self, item: Any, path_str: str, raw_flat_rel: List[Any],
i: int, use_id_to_match: bool,
+ e_cache: Set[Tuple[str, str, str]]) -> Tuple[str, int]:
+ props_str = ", ".join(f"{k}: {self._limit_property_query(v, 'e')}"
+ for k, v in item["props"].items() if v)
+ props_str = f"{{{props_str}}}" if props_str else ""
prev_matched_str = raw_flat_rel[i - 1]["id"] if use_id_to_match else (
raw_flat_rel)[i - 1]["props"][self._prop_to_match]
- if item["label"] in e_cache:
- edge_str = f"{item['label']}"
+ edge_key = (item['inV'], item['label'], item['outV'])
+ if edge_key not in e_cache:
+ e_cache.add(edge_key)
+ edge_label = f"{item['label']}{props_str}"
else:
- e_cache.add(item["label"])
- edge_str = f"{item['label']}{props_str}"
+ edge_label = item['label']
- if item["outV"] == prev_matched_str:
- edge_str = f" --[{edge_str}]--> "
- else:
- edge_str = f" <--[{edge_str}]-- "
-
- flat_rel += edge_str
+ edge_str = f"--[{edge_label}]-->" if item["outV"] == prev_matched_str
else f"<--[{edge_label}]--"
+ path_str += edge_str
prior_edge_str_len = len(edge_str)
- return flat_rel, prior_edge_str_len
+ return path_str, prior_edge_str_len
def _update_vertex_degree_list(self, vertex_degree_list: List[Set[str]],
nodes_with_degree: List[str]) -> None:
for depth, node_str in enumerate(nodes_with_degree):
@@ -313,3 +320,11 @@ class GraphRAGQuery:
)
log.debug("Link(Relation): %s", relationships)
return self._schema
+
+ def _limit_property_query(self, value: Optional[str], item_type: str) ->
Optional[str]:
+ # NOTE: we skip the filter for list/set type (e.g., list of string,
add it if needed)
+ if not self._limit_property or not isinstance(value, str):
+ return value
+
+ max_len = self._max_v_prop_len if item_type == "v" else
self._max_e_prop_len
+ return value[:max_len] if value else value