This is an automated email from the ASF dual-hosted git repository.
ming pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new 7b02fd4 refactor(llm): enhance a string of graph query method (#89)
7b02fd4 is described below
commit 7b02fd43767b7f333cae04ad87f09981cb0778fc
Author: imbajin <[email protected]>
AuthorDate: Mon Oct 14 19:14:23 2024 +0800
refactor(llm): enhance a string of graph query method (#89)
* feat: Modify the method of importing graph by extracting from the entire
document
* fix: remove 'llm' object in json response
* fix: remove 'llm' object in context transferring
* TODO: only fetch vid rather than all vertex data
* refactor(llm): enhance a string of graph query method
* refactor(llm): enhance property query template
* refactor: enhance graphrag format logic
* refactor: enhance vector extract/fuzz logic & add dis_threshold for it
* improve & log
* filter single char & remove it from prompt
---------
Co-authored-by: Vichayturen <[email protected]>
---
.../demo/gremlin_generate_web_demo.py | 4 +-
.../hugegraph_llm/demo/rag_demo/configs_block.py | 8 +-
.../src/hugegraph_llm/indices/vector_index.py | 17 +-
.../operators/hugegraph_op/graph_rag_query.py | 221 +++++++++++----------
.../operators/index_op/semantic_id_query.py | 3 +
.../operators/llm_op/keyword_extract.py | 18 +-
.../src/pyhugegraph/api/schema.py | 8 +
7 files changed, 158 insertions(+), 121 deletions(-)
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/gremlin_generate_web_demo.py
b/hugegraph-llm/src/hugegraph_llm/demo/gremlin_generate_web_demo.py
index 1ba7aba..6166321 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/gremlin_generate_web_demo.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/gremlin_generate_web_demo.py
@@ -91,7 +91,7 @@ if __name__ == '__main__':
]
else:
llm_config_input = []
- llm_config_button = gr.Button("apply configuration")
+ llm_config_button = gr.Button("Apply Configuration")
def apply_configuration(arg1, arg2, arg3, arg4):
llm_option = settings.llm_type
@@ -139,7 +139,7 @@ if __name__ == '__main__':
]
else:
embedding_config_input = []
- embedding_config_button = gr.Button("apply configuration")
+ embedding_config_button = gr.Button("Apply Configuration")
def apply_configuration(arg1, arg2, arg3):
embedding_option = settings.embedding_type
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
index 60ad0fe..be161a8 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
@@ -192,7 +192,7 @@ def create_configs_block():
gr.Textbox(value=settings.graph_pwd, label="pwd",
type="password"),
gr.Textbox(value=settings.graph_space,
label="graphspace(Optional)"),
]
- graph_config_button = gr.Button("Apply config")
+ graph_config_button = gr.Button("Apply Configuration")
graph_config_button.click(apply_graph_config, inputs=graph_config_input)
# pylint: disable=no-member
with gr.Accordion("2. Set up the LLM.", open=False):
@@ -227,7 +227,7 @@ def create_configs_block():
]
else:
llm_config_input = []
- llm_config_button = gr.Button("apply configuration")
+ llm_config_button = gr.Button("Apply Configuration")
llm_config_button.click(apply_llm_config, inputs=llm_config_input)
# pylint: disable=no-member
with gr.Accordion("3. Set up the Embedding.", open=False):
@@ -262,7 +262,7 @@ def create_configs_block():
else:
embedding_config_input = []
- embedding_config_button = gr.Button("apply configuration")
+ embedding_config_button = gr.Button("Apply Configuration")
# Call the separate apply_embedding_configuration function here
embedding_config_button.click( # pylint: disable=no-member
@@ -299,7 +299,7 @@ def create_configs_block():
]
else:
reranker_config_input = []
- reranker_config_button = gr.Button("apply configuration")
+ reranker_config_button = gr.Button("Apply Configuration")
# TODO: use "gr.update()" or other way to update the config in
time (refactor the click event)
# Call the separate apply_reranker_configuration function here
diff --git a/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
b/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
index 95bf133..1c39528 100644
--- a/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
+++ b/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
@@ -42,6 +42,7 @@ class VectorIndex:
if not os.path.exists(index_file) or not
os.path.exists(properties_file):
log.warning("No index file found, create a new one.")
return VectorIndex()
+
faiss_index = faiss.read_index(index_file)
embed_dim = faiss_index.d
with open(properties_file, "rb") as f:
@@ -54,6 +55,7 @@ class VectorIndex:
def to_index_file(self, dir_path: str):
if not os.path.exists(dir_path):
os.makedirs(dir_path)
+
index_file = os.path.join(dir_path, INDEX_FILE_NAME)
properties_file = os.path.join(dir_path, PROPERTIES_FILE_NAME)
faiss.write_index(self.index, index_file)
@@ -63,6 +65,7 @@ class VectorIndex:
def add(self, vectors: List[List[float]], props: List[Any]):
if len(vectors) == 0:
return
+
if self.index.ntotal == 0 and len(vectors[0]) != self.index.d:
self.index = faiss.IndexFlatL2(len(vectors[0]))
self.index.add(np.array(vectors))
@@ -73,6 +76,7 @@ class VectorIndex:
props = set(props)
indices = []
remove_num = 0
+
for i, p in enumerate(self.properties):
if p in props:
indices.append(i)
@@ -81,15 +85,20 @@ class VectorIndex:
self.properties = [p for i, p in enumerate(self.properties) if i not
in indices]
return remove_num
- def search(self, query_vector: List[float], top_k: int) -> List[Dict[str,
Any]]:
+ def search(self, query_vector: List[float], top_k: int, dis_threshold:
float = 0.9) -> List[Dict[str, Any]]:
if self.index.ntotal == 0:
return []
+
if len(query_vector) != self.index.d:
raise ValueError("Query vector dimension does not match index
dimension!")
- _, indices = self.index.search(np.array([query_vector]), top_k)
+
+ distances, indices = self.index.search(np.array([query_vector]), top_k)
results = []
- for i in indices[0]:
- results.append(deepcopy(self.properties[i]))
+ for dist, i in zip(distances[0], indices[0]):
+ if dist < dis_threshold: # Smaller distances indicate higher
similarity
+ results.append(deepcopy(self.properties[i]))
+ else:
+ log.debug("Distance %s is larger than threshold %s, ignore
this result.", dist, dis_threshold)
return results
@staticmethod
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
index eb90684..fcc1553 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
@@ -14,9 +14,6 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-
-
-import re
from typing import Any, Dict, Optional, List, Set, Tuple
from hugegraph_llm.config import settings
@@ -26,12 +23,13 @@ from pyhugegraph.client import PyHugeClient
VERTEX_QUERY_TPL = "g.V({keywords}).as('subj').toList()"
# TODO: we could use a simpler query (like kneighbor-api to get the edges)
-# TODO: use dedup() to filter duplicate paths
-ID_QUERY_NEIGHBOR_TPL = """
-g.V({keywords}).as('subj')
+# TODO: test with profile()/explain() to speed up the query
+VID_QUERY_NEIGHBOR_TPL = """
+g.V({keywords})
.repeat(
- bothE({edge_labels}).as('rel').otherV().as('obj')
-).times({max_deep})
+ bothE({edge_labels}).limit({edge_limit}).otherV().dedup()
+).times({max_deep}).emit()
+.simplePath()
.path()
.by(project('label', 'id', 'props')
.by(label())
@@ -49,10 +47,11 @@ g.V({keywords}).as('subj')
"""
PROPERTY_QUERY_NEIGHBOR_TPL = """
-g.V().has('{prop}', within({keywords})).as('subj')
+g.V().has('{prop}', within({keywords}))
.repeat(
- bothE({edge_labels}).as('rel').otherV().as('obj')
-).times({max_deep})
+ bothE({edge_labels}).limit({edge_limit}).otherV().dedup()
+).times({max_deep}).emit()
+.simplePath()
.path()
.by(project('label', 'props')
.by(label())
@@ -71,7 +70,7 @@ g.V().has('{prop}', within({keywords})).as('subj')
class GraphRAGQuery:
- def __init__(self, max_deep: int = 2, max_items: int = 30, prop_to_match:
Optional[str] = None):
+ def __init__(self, max_deep: int = 2, max_items: int = 20, prop_to_match:
Optional[str] = None):
self._client = PyHugeClient(
settings.graph_ip,
settings.graph_port,
@@ -100,9 +99,8 @@ class GraphRAGQuery:
self._client = PyHugeClient(ip, port, graph, user, pwd, gs)
assert self._client is not None, "No valid graph to search."
- keywords = context.get("keywords")
- match_vids = context.get("match_vids")
-
+ # 2. Extract params from context
+ matched_vids = context.get("match_vids")
if isinstance(context.get("max_deep"), int):
self._max_deep = context["max_deep"]
if isinstance(context.get("max_items"), int):
@@ -110,30 +108,34 @@ class GraphRAGQuery:
if isinstance(context.get("prop_to_match"), str):
self._prop_to_match = context["prop_to_match"]
+ # 3. Extract edge_labels from graph schema
_, edge_labels = self._extract_labels_from_schema()
edge_labels_str = ",".join("'" + label + "'" for label in edge_labels)
+ # TODO: enhance the limit logic later
+ edge_limit_amount = len(edge_labels) * 10
use_id_to_match = self._prop_to_match is None
if use_id_to_match:
- if not match_vids:
+ if not matched_vids:
return context
- gremlin_query = VERTEX_QUERY_TPL.format(keywords=match_vids)
- result: List[Any] =
self._client.gremlin().exec(gremlin=gremlin_query)["data"]
- log.debug("Vids query: %s", gremlin_query)
+ gremlin_query = VERTEX_QUERY_TPL.format(keywords=matched_vids)
+ vertexes =
self._client.gremlin().exec(gremlin=gremlin_query)["data"]
+ log.debug("Vids gremlin query: %s", gremlin_query)
- vertex_knowledge =
self._format_graph_from_vertex(query_result=result)
- gremlin_query = ID_QUERY_NEIGHBOR_TPL.format(
- keywords=match_vids,
+ vertex_knowledge =
self._format_graph_from_vertex(query_result=vertexes)
+ gremlin_query = VID_QUERY_NEIGHBOR_TPL.format(
+ keywords=matched_vids,
max_deep=self._max_deep,
- max_items=self._max_items,
edge_labels=edge_labels_str,
+ edge_limit=edge_limit_amount,
+ max_items=self._max_items,
)
- log.debug("Kneighbor query: %s", gremlin_query)
+ log.debug("Kneighbor gremlin query: %s", gremlin_query)
+ paths = self._client.gremlin().exec(gremlin=gremlin_query)["data"]
- result: List[Any] =
self._client.gremlin().exec(gremlin=gremlin_query)["data"]
- graph_chain_knowledge, vertex_degree_list, knowledge_with_degree =
self._format_graph_from_query_result(
- query_result=result
+ graph_chain_knowledge, vertex_degree_list, knowledge_with_degree =
self._format_graph_query_result(
+ query_paths=paths
)
graph_chain_knowledge.update(vertex_knowledge)
if vertex_degree_list:
@@ -142,29 +144,30 @@ class GraphRAGQuery:
vertex_degree_list.append(vertex_knowledge)
else:
# WARN: When will the query enter here?
+ keywords = context.get("keywords")
assert keywords, "No related property(keywords) for graph query."
keywords_str = ",".join("'" + kw + "'" for kw in keywords)
gremlin_query = PROPERTY_QUERY_NEIGHBOR_TPL.format(
prop=self._prop_to_match,
keywords=keywords_str,
+ edge_labels=edge_labels_str,
+ edge_limit=edge_limit_amount,
max_deep=self._max_deep,
max_items=self._max_items,
- edge_labels=edge_labels_str,
)
log.warning("Unable to find vid, downgraded to property query,
please confirm if it meets expectation.")
- result: List[Any] =
self._client.gremlin().exec(gremlin=gremlin_query)["data"]
- graph_chain_knowledge, vertex_degree_list, knowledge_with_degree =
self._format_graph_from_query_result(
- query_result=result
+ paths: List[Any] =
self._client.gremlin().exec(gremlin=gremlin_query)["data"]
+ graph_chain_knowledge, vertex_degree_list, knowledge_with_degree =
self._format_graph_query_result(
+ query_paths=paths
)
context["graph_result"] = list(graph_chain_knowledge)
context["vertex_degree_list"] = [list(vertex_degree) for vertex_degree
in vertex_degree_list]
context["knowledge_with_degree"] = knowledge_with_degree
context["graph_context_head"] = (
- f"The following are knowledge sequence in max depth
{self._max_deep} "
- f"in the form of directed graph like:\n"
- "`subject -[predicate]-> object <-[predicate_next_hop]-
object_next_hop ...`"
+ f"The following are graph knowledge in {self._max_deep} depth,
e.g:\n"
+ "`vertexA --[links]--> vertexB <--[links]-- vertexC ...`"
"extracted based on key entities as subject:\n"
)
@@ -172,7 +175,7 @@ class GraphRAGQuery:
verbose = context.get("verbose") or False
if verbose:
print("\033[93mKnowledge from Graph:")
- print("\n".join(chain for chain in context["graph_result"]) +
"\033[0m")
+ print("\n".join(context["graph_result"]) + "\033[0m")
return context
@@ -184,65 +187,93 @@ class GraphRAGQuery:
knowledge.add(node_str)
return knowledge
- def _format_graph_from_query_result(
- self, query_result: List[Any]
- ) -> Tuple[Set[str], List[Set[str]], Dict[str, List[str]]]:
+ def _format_graph_query_result(self, query_paths) -> Tuple[Set[str],
List[Set[str]], Dict[str, List[str]]]:
use_id_to_match = self._prop_to_match is None
- knowledge = set()
- knowledge_with_degree = {}
+ subgraph = set()
+ subgraph_with_degree = {}
vertex_degree_list: List[Set[str]] = []
- for line in query_result:
- flat_rel = ""
- raw_flat_rel = line["objects"]
- assert len(raw_flat_rel) % 2 == 1
- node_cache = set()
- prior_edge_str_len = 0
- depth = 0
- nodes_with_degree = []
- for i, item in enumerate(raw_flat_rel):
- if i % 2 == 0:
- matched_str = item["id"] if use_id_to_match else
item["props"][self._prop_to_match]
- if matched_str in node_cache:
- flat_rel = flat_rel[:-prior_edge_str_len]
- break
- node_cache.add(matched_str)
- props_str = ", ".join(f"{k}: {v}" for k, v in
item["props"].items())
- node_str = f"{item['id']}{{{props_str}}}"
- flat_rel += node_str
- nodes_with_degree.append(node_str)
- if flat_rel in knowledge:
- knowledge.remove(flat_rel)
- knowledge_with_degree.pop(flat_rel)
- if depth >= len(vertex_degree_list):
- vertex_degree_list.append(set())
- vertex_degree_list[depth].add(node_str)
- depth += 1
- else:
- props_str = ", ".join(f"{k}: {v}" for k, v in
item["props"].items())
- props_str = f"{{{props_str}}}" if len(props_str) > 0 else
""
- prev_matched_str = (
- raw_flat_rel[i - 1]["id"]
- if use_id_to_match
- else raw_flat_rel[i - 1]["props"][self._prop_to_match]
- )
- if item["outV"] == prev_matched_str:
- edge_str = f" -[{item['label']}{props_str}]-> "
- else:
- edge_str = f" <-[{item['label']}{props_str}]- "
- flat_rel += edge_str
- prior_edge_str_len = len(edge_str)
- knowledge.add(flat_rel)
- knowledge_with_degree[flat_rel] = nodes_with_degree
- return knowledge, vertex_degree_list, knowledge_with_degree
+
+ for path in query_paths:
+ # 1. Process each path
+ flat_rel, nodes_with_degree = self._process_path(path,
use_id_to_match)
+ subgraph.add(flat_rel)
+ subgraph_with_degree[flat_rel] = nodes_with_degree
+ # 2. Update vertex degree list
+ self._update_vertex_degree_list(vertex_degree_list,
nodes_with_degree)
+
+ return subgraph, vertex_degree_list, subgraph_with_degree
+
+ def _process_path(self, path: Any, use_id_to_match: bool) -> Tuple[str,
List[str]]:
+ flat_rel = ""
+ raw_flat_rel = path["objects"]
+ assert len(raw_flat_rel) % 2 == 1, "The length of raw_flat_rel should
be odd."
+
+ node_cache = set()
+ prior_edge_str_len = 0
+ depth = 0
+ nodes_with_degree = []
+
+ for i, item in enumerate(raw_flat_rel):
+ if i % 2 == 0:
+ # Process each vertex
+ flat_rel, prior_edge_str_len, depth = self._process_vertex(
+ item, flat_rel, node_cache, prior_edge_str_len, depth,
nodes_with_degree, use_id_to_match
+ )
+ else:
+ # Process each edge
+ flat_rel, prior_edge_str_len = self._process_edge(
+ item, flat_rel, prior_edge_str_len, raw_flat_rel,
i,use_id_to_match
+ )
+
+ return flat_rel, nodes_with_degree
+
+ def _process_vertex(self, item: Any, flat_rel: str, node_cache: Set[str],
+ prior_edge_str_len: int, depth: int,
nodes_with_degree: List[str],
+ use_id_to_match: bool) -> Tuple[str, int, int]:
+ matched_str = item["id"] if use_id_to_match else
item["props"][self._prop_to_match]
+ if matched_str in node_cache:
+ flat_rel = flat_rel[:-prior_edge_str_len]
+ return flat_rel, prior_edge_str_len, depth
+
+ node_cache.add(matched_str)
+ props_str = ", ".join(f"{k}: {v}" for k, v in item["props"].items())
+ node_str = f"{item['id']}{{{props_str}}}"
+ flat_rel += node_str
+ nodes_with_degree.append(node_str)
+ depth += 1
+
+ return flat_rel, prior_edge_str_len, depth
+
+ def _process_edge(self, item: Any, flat_rel: str, prior_edge_str_len: int,
+ raw_flat_rel: List[Any], i: int, use_id_to_match: bool)
-> Tuple[str, int]:
+ props_str = ", ".join(f"{k}: {v}" for k, v in item["props"].items())
+ props_str = f"{{{props_str}}}" if len(props_str) > 0 else ""
+ prev_matched_str = raw_flat_rel[i - 1]["id"] if use_id_to_match else
raw_flat_rel[i - 1]["props"][self._prop_to_match]
+
+ if item["outV"] == prev_matched_str:
+ edge_str = f" --[{item['label']}{props_str}]--> "
+ else:
+ edge_str = f" <--[{item['label']}{props_str}]-- "
+
+ flat_rel += edge_str
+ prior_edge_str_len = len(edge_str)
+ return flat_rel, prior_edge_str_len
+
+ def _update_vertex_degree_list(self, vertex_degree_list: List[Set[str]],
nodes_with_degree: List[str]) -> None:
+ for depth, node_str in enumerate(nodes_with_degree):
+ if depth >= len(vertex_degree_list):
+ vertex_degree_list.append(set())
+ vertex_degree_list[depth].add(node_str)
def _extract_labels_from_schema(self) -> Tuple[List[str], List[str]]:
schema = self._get_graph_schema()
- node_props_str, edge_props_str = schema.split("\n")[:2]
- node_props_str = node_props_str[len("Node properties:
"):].strip("[").strip("]")
+ vertex_props_str, edge_props_str = schema.split("\n")[:2]
+ # TODO: rename to vertex (also need update in the schema)
+ vertex_props_str = vertex_props_str[len("Vertex properties:
"):].strip("[").strip("]")
edge_props_str = edge_props_str[len("Edge properties:
"):].strip("[").strip("]")
- node_labels = self._extract_label_names(node_props_str)
+ vertex_labels = self._extract_label_names(vertex_props_str)
edge_labels = self._extract_label_names(edge_props_str)
- return node_labels, edge_labels
+ return vertex_labels, edge_labels
@staticmethod
def _extract_label_names(source: str, head: str = "name: ", tail: str = ",
") -> List[str]:
@@ -254,19 +285,6 @@ class GraphRAGQuery:
result.append(label)
return result
- def _get_graph_id_format(self) -> str:
- sample = self._client.gremlin().exec("g.V().limit(1)")["data"]
- if len(sample) == 0:
- return "EMPTY"
- sample_id = sample[0]["id"]
- if isinstance(sample_id, int):
- return "INT"
- if isinstance(sample_id, str):
- if re.match(r"^\d+:.*", sample_id):
- return "INT:STRING"
- return "STRING"
- return "UNKNOWN"
-
def _get_graph_schema(self, refresh: bool = False) -> str:
if self._schema and not refresh:
return self._schema
@@ -277,8 +295,9 @@ class GraphRAGQuery:
relationships = schema.getRelations()
self._schema = (
- f"Node properties: {vertex_schema}\n"
+ f"Vertex properties: {vertex_schema}\n"
f"Edge properties: {edge_schema}\n"
f"Relationships: {relationships}\n"
)
+ log.debug("Link(Relation): %s", relationships)
return self._schema
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
index d22d00a..2df3a04 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
@@ -20,6 +20,7 @@ import os
from copy import deepcopy
from typing import Dict, Any, Literal, List, Tuple
+from hugegraph_llm.utils.log import log
from pyhugegraph.client import PyHugeClient
from hugegraph_llm.config import resource_path, settings
from hugegraph_llm.indices.vector_index import VectorIndex
@@ -62,6 +63,7 @@ class SemanticIdQuery:
exact_match_vids, unmatched_vids =
self._exact_match_vids(context["keywords"])
graph_query_list.extend(exact_match_vids)
fuzzy_match_vids = self._fuzzy_match_vids(unmatched_vids)
+ log.debug("Fuzzy match vids: %s", fuzzy_match_vids)
graph_query_list.extend(fuzzy_match_vids)
context["match_vids"] = list(set(graph_query_list))
return context
@@ -71,6 +73,7 @@ class SemanticIdQuery:
possible_vids = deepcopy(keywords)
for i in range(vertex_label_num):
possible_vids.extend([f"{i+1}:{keyword}" for keyword in keywords])
+
vids_str = ",".join([f"'{vid}'" for vid in possible_vids])
resp =
self._client.gremlin().exec(SemanticIdQuery.ID_QUERY_TEMPL.format(vids_str=vids_str))
searched_vids = [v['id'] for v in resp['data']]
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
index 0df4051..2cad98d 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
@@ -23,8 +23,11 @@ from hugegraph_llm.models.llms.base import BaseLLM
from hugegraph_llm.models.llms.init_llm import LLMs
from hugegraph_llm.operators.common_op.nltk_helper import NLTKHelper
-KEYWORDS_EXTRACT_TPL = """extract {max_keywords} keywords from the text:
+KEYWORDS_EXTRACT_TPL = """Extract {max_keywords} keywords from the text:
{question}
+
+1. Keywords can't contain meaningless/broad words(e.g action/relation/thing),
must represent certain entities,
+2. Better to extract subject/verb/object and don't extract particles, don't
extend to synonyms/general categories.
Provide keywords in the following comma-separated format: 'KEYWORDS:
<keywords>'
"""
@@ -73,10 +76,7 @@ class KeywordExtract:
if isinstance(context.get("max_keywords"), int):
self._max_keywords = context["max_keywords"]
- prompt = self._extract_template.format(
- question=self._query,
- max_keywords=self._max_keywords,
- )
+ prompt = self._extract_template.format(question=self._query,
max_keywords=self._max_keywords)
response = self._llm.generate(prompt=prompt)
keywords = self._extract_keywords_from_response(
@@ -95,10 +95,7 @@ class KeywordExtract:
return context
def _expand_synonyms(self, keywords: Set[str]) -> Set[str]:
- prompt = self._expand_template.format(
- question=str(keywords),
- max_keywords=self._max_keywords,
- )
+ prompt = self._expand_template.format(question=str(keywords),
max_keywords=self._max_keywords)
response = self._llm.generate(prompt=prompt)
keywords = self._extract_keywords_from_response(
response=response, lowercase=False, start_token="SYNONYMS:"
@@ -113,11 +110,12 @@ class KeywordExtract:
) -> Set[str]:
keywords = []
matches = re.findall(rf'{start_token}[^\n]+\n?', response)
+
for match in matches:
match = match[len(start_token):]
for k in re.split(r"[,,]+", match):
k = k.strip()
- if len(k) > 0:
+ if len(k) > 1:
if lowercase:
keywords.append(k.lower())
else:
diff --git a/hugegraph-python-client/src/pyhugegraph/api/schema.py
b/hugegraph-python-client/src/pyhugegraph/api/schema.py
index fdebc9d..8b4f54c 100644
--- a/hugegraph-python-client/src/pyhugegraph/api/schema.py
+++ b/hugegraph-python-client/src/pyhugegraph/api/schema.py
@@ -115,6 +115,14 @@ class SchemaManager(HugeParamsBase):
@router.http("GET", "schema/edgelabels")
def getRelations(self) -> Optional[List[str]]:
+ """
+ Retrieve all edge_label links/paths from the graph-server.
+
+ Returns a list of link representations for each edge_label, e.g:
+ The format is like
"source_vertexlabel--edge_label-->target_vertexlabel". (e.g.
"Person--likes-->Animal")
+
+ :return: A list of relationship links/paths for all edge_labels, or
None if not found.
+ """
if response := self._invoke_request():
return [EdgeLabelData(item).relations() for item in
response["edgelabels"]]
return None