This is an automated email from the ASF dual-hosted git repository.

ming pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git


The following commit(s) were added to refs/heads/main by this push:
     new 7b02fd4  refactor(llm): enhance a string of graph query method (#89)
7b02fd4 is described below

commit 7b02fd43767b7f333cae04ad87f09981cb0778fc
Author: imbajin <[email protected]>
AuthorDate: Mon Oct 14 19:14:23 2024 +0800

    refactor(llm): enhance a string of graph query method (#89)
    
    * feat: Modify the method of importing graph by extracting from the entire 
document
    
    * fix: remove 'llm' object in json response
    
    * fix: remove 'llm' object in context transferring
    
    * TODO: only fetch vid rather than all vertex data
    
    * refactor(llm): enhance a string of graph query method
    
    * refactor(llm): enhance property query template
    
    * refactor: enhance graphrag format logic
    
    * refactor: enhance vector extract/fuzz logic & add dis_threshold for it
    
    * improve & log
    
    * filter single char & remove it from prompt
    
    ---------
    
    Co-authored-by: Vichayturen <[email protected]>
---
 .../demo/gremlin_generate_web_demo.py              |   4 +-
 .../hugegraph_llm/demo/rag_demo/configs_block.py   |   8 +-
 .../src/hugegraph_llm/indices/vector_index.py      |  17 +-
 .../operators/hugegraph_op/graph_rag_query.py      | 221 +++++++++++----------
 .../operators/index_op/semantic_id_query.py        |   3 +
 .../operators/llm_op/keyword_extract.py            |  18 +-
 .../src/pyhugegraph/api/schema.py                  |   8 +
 7 files changed, 158 insertions(+), 121 deletions(-)

diff --git a/hugegraph-llm/src/hugegraph_llm/demo/gremlin_generate_web_demo.py 
b/hugegraph-llm/src/hugegraph_llm/demo/gremlin_generate_web_demo.py
index 1ba7aba..6166321 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/gremlin_generate_web_demo.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/gremlin_generate_web_demo.py
@@ -91,7 +91,7 @@ if __name__ == '__main__':
                     ]
             else:
                 llm_config_input = []
-            llm_config_button = gr.Button("apply configuration")
+            llm_config_button = gr.Button("Apply Configuration")
 
             def apply_configuration(arg1, arg2, arg3, arg4):
                 llm_option = settings.llm_type
@@ -139,7 +139,7 @@ if __name__ == '__main__':
                     ]
             else:
                 embedding_config_input = []
-            embedding_config_button = gr.Button("apply configuration")
+            embedding_config_button = gr.Button("Apply Configuration")
 
             def apply_configuration(arg1, arg2, arg3):
                 embedding_option = settings.embedding_type
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py 
b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
index 60ad0fe..be161a8 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
@@ -192,7 +192,7 @@ def create_configs_block():
                 gr.Textbox(value=settings.graph_pwd, label="pwd", 
type="password"),
                 gr.Textbox(value=settings.graph_space, 
label="graphspace(Optional)"),
             ]
-        graph_config_button = gr.Button("Apply config")
+        graph_config_button = gr.Button("Apply Configuration")
     graph_config_button.click(apply_graph_config, inputs=graph_config_input)  
# pylint: disable=no-member
 
     with gr.Accordion("2. Set up the LLM.", open=False):
@@ -227,7 +227,7 @@ def create_configs_block():
                     ]
             else:
                 llm_config_input = []
-            llm_config_button = gr.Button("apply configuration")
+            llm_config_button = gr.Button("Apply Configuration")
             llm_config_button.click(apply_llm_config, inputs=llm_config_input) 
 # pylint: disable=no-member
 
     with gr.Accordion("3. Set up the Embedding.", open=False):
@@ -262,7 +262,7 @@ def create_configs_block():
             else:
                 embedding_config_input = []
 
-            embedding_config_button = gr.Button("apply configuration")
+            embedding_config_button = gr.Button("Apply Configuration")
 
             # Call the separate apply_embedding_configuration function here
             embedding_config_button.click(  # pylint: disable=no-member
@@ -299,7 +299,7 @@ def create_configs_block():
                     ]
             else:
                 reranker_config_input = []
-            reranker_config_button = gr.Button("apply configuration")
+            reranker_config_button = gr.Button("Apply Configuration")
 
             # TODO: use "gr.update()" or other way to update the config in 
time (refactor the click event)
             # Call the separate apply_reranker_configuration function here
diff --git a/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py 
b/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
index 95bf133..1c39528 100644
--- a/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
+++ b/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py
@@ -42,6 +42,7 @@ class VectorIndex:
         if not os.path.exists(index_file) or not 
os.path.exists(properties_file):
             log.warning("No index file found, create a new one.")
             return VectorIndex()
+
         faiss_index = faiss.read_index(index_file)
         embed_dim = faiss_index.d
         with open(properties_file, "rb") as f:
@@ -54,6 +55,7 @@ class VectorIndex:
     def to_index_file(self, dir_path: str):
         if not os.path.exists(dir_path):
             os.makedirs(dir_path)
+
         index_file = os.path.join(dir_path, INDEX_FILE_NAME)
         properties_file = os.path.join(dir_path, PROPERTIES_FILE_NAME)
         faiss.write_index(self.index, index_file)
@@ -63,6 +65,7 @@ class VectorIndex:
     def add(self, vectors: List[List[float]], props: List[Any]):
         if len(vectors) == 0:
             return
+
         if self.index.ntotal == 0 and len(vectors[0]) != self.index.d:
             self.index = faiss.IndexFlatL2(len(vectors[0]))
         self.index.add(np.array(vectors))
@@ -73,6 +76,7 @@ class VectorIndex:
             props = set(props)
         indices = []
         remove_num = 0
+
         for i, p in enumerate(self.properties):
             if p in props:
                 indices.append(i)
@@ -81,15 +85,20 @@ class VectorIndex:
         self.properties = [p for i, p in enumerate(self.properties) if i not 
in indices]
         return remove_num
 
-    def search(self, query_vector: List[float], top_k: int) -> List[Dict[str, 
Any]]:
+    def search(self, query_vector: List[float], top_k: int, dis_threshold: 
float = 0.9) -> List[Dict[str, Any]]:
         if self.index.ntotal == 0:
             return []
+
         if len(query_vector) != self.index.d:
             raise ValueError("Query vector dimension does not match index 
dimension!")
-        _, indices = self.index.search(np.array([query_vector]), top_k)
+
+        distances, indices = self.index.search(np.array([query_vector]), top_k)
         results = []
-        for i in indices[0]:
-            results.append(deepcopy(self.properties[i]))
+        for dist, i in zip(distances[0], indices[0]):
+            if dist < dis_threshold: # Smaller distances indicate higher 
similarity
+                results.append(deepcopy(self.properties[i]))
+            else:
+                log.debug("Distance %s is larger than threshold %s, ignore 
this result.", dist, dis_threshold)
         return results
 
     @staticmethod
diff --git 
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py 
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
index eb90684..fcc1553 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
@@ -14,9 +14,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-
-import re
 from typing import Any, Dict, Optional, List, Set, Tuple
 
 from hugegraph_llm.config import settings
@@ -26,12 +23,13 @@ from pyhugegraph.client import PyHugeClient
 VERTEX_QUERY_TPL = "g.V({keywords}).as('subj').toList()"
 
 # TODO: we could use a simpler query (like kneighbor-api to get the edges)
-# TODO: use dedup() to filter duplicate paths
-ID_QUERY_NEIGHBOR_TPL = """
-g.V({keywords}).as('subj')
+# TODO: test with profile()/explain() to speed up the query
+VID_QUERY_NEIGHBOR_TPL = """
+g.V({keywords})
 .repeat(
-   bothE({edge_labels}).as('rel').otherV().as('obj')
-).times({max_deep})
+   bothE({edge_labels}).limit({edge_limit}).otherV().dedup()
+).times({max_deep}).emit()
+.simplePath()
 .path()
 .by(project('label', 'id', 'props')
    .by(label())
@@ -49,10 +47,11 @@ g.V({keywords}).as('subj')
 """
 
 PROPERTY_QUERY_NEIGHBOR_TPL = """
-g.V().has('{prop}', within({keywords})).as('subj')
+g.V().has('{prop}', within({keywords}))
 .repeat(
-   bothE({edge_labels}).as('rel').otherV().as('obj')
-).times({max_deep})
+   bothE({edge_labels}).limit({edge_limit}).otherV().dedup()
+).times({max_deep}).emit()
+.simplePath()
 .path()
 .by(project('label', 'props')
    .by(label())
@@ -71,7 +70,7 @@ g.V().has('{prop}', within({keywords})).as('subj')
 
 class GraphRAGQuery:
 
-    def __init__(self, max_deep: int = 2, max_items: int = 30, prop_to_match: 
Optional[str] = None):
+    def __init__(self, max_deep: int = 2, max_items: int = 20, prop_to_match: 
Optional[str] = None):
         self._client = PyHugeClient(
             settings.graph_ip,
             settings.graph_port,
@@ -100,9 +99,8 @@ class GraphRAGQuery:
                 self._client = PyHugeClient(ip, port, graph, user, pwd, gs)
         assert self._client is not None, "No valid graph to search."
 
-        keywords = context.get("keywords")
-        match_vids = context.get("match_vids")
-
+        # 2. Extract params from context
+        matched_vids = context.get("match_vids")
         if isinstance(context.get("max_deep"), int):
             self._max_deep = context["max_deep"]
         if isinstance(context.get("max_items"), int):
@@ -110,30 +108,34 @@ class GraphRAGQuery:
         if isinstance(context.get("prop_to_match"), str):
             self._prop_to_match = context["prop_to_match"]
 
+        # 3. Extract edge_labels from graph schema
         _, edge_labels = self._extract_labels_from_schema()
         edge_labels_str = ",".join("'" + label + "'" for label in edge_labels)
+        # TODO: enhance the limit logic later
+        edge_limit_amount = len(edge_labels) * 10
 
         use_id_to_match = self._prop_to_match is None
         if use_id_to_match:
-            if not match_vids:
+            if not matched_vids:
                 return context
 
-            gremlin_query = VERTEX_QUERY_TPL.format(keywords=match_vids)
-            result: List[Any] = 
self._client.gremlin().exec(gremlin=gremlin_query)["data"]
-            log.debug("Vids query: %s", gremlin_query)
+            gremlin_query = VERTEX_QUERY_TPL.format(keywords=matched_vids)
+            vertexes = 
self._client.gremlin().exec(gremlin=gremlin_query)["data"]
+            log.debug("Vids gremlin query: %s", gremlin_query)
 
-            vertex_knowledge = 
self._format_graph_from_vertex(query_result=result)
-            gremlin_query = ID_QUERY_NEIGHBOR_TPL.format(
-                keywords=match_vids,
+            vertex_knowledge = 
self._format_graph_from_vertex(query_result=vertexes)
+            gremlin_query = VID_QUERY_NEIGHBOR_TPL.format(
+                keywords=matched_vids,
                 max_deep=self._max_deep,
-                max_items=self._max_items,
                 edge_labels=edge_labels_str,
+                edge_limit=edge_limit_amount,
+                max_items=self._max_items,
             )
-            log.debug("Kneighbor query: %s", gremlin_query)
+            log.debug("Kneighbor gremlin query: %s", gremlin_query)
+            paths = self._client.gremlin().exec(gremlin=gremlin_query)["data"]
 
-            result: List[Any] = 
self._client.gremlin().exec(gremlin=gremlin_query)["data"]
-            graph_chain_knowledge, vertex_degree_list, knowledge_with_degree = 
self._format_graph_from_query_result(
-                query_result=result
+            graph_chain_knowledge, vertex_degree_list, knowledge_with_degree = 
self._format_graph_query_result(
+                query_paths=paths
             )
             graph_chain_knowledge.update(vertex_knowledge)
             if vertex_degree_list:
@@ -142,29 +144,30 @@ class GraphRAGQuery:
                 vertex_degree_list.append(vertex_knowledge)
         else:
             # WARN: When will the query enter here?
+            keywords = context.get("keywords")
             assert keywords, "No related property(keywords) for graph query."
             keywords_str = ",".join("'" + kw + "'" for kw in keywords)
             gremlin_query = PROPERTY_QUERY_NEIGHBOR_TPL.format(
                 prop=self._prop_to_match,
                 keywords=keywords_str,
+                edge_labels=edge_labels_str,
+                edge_limit=edge_limit_amount,
                 max_deep=self._max_deep,
                 max_items=self._max_items,
-                edge_labels=edge_labels_str,
             )
             log.warning("Unable to find vid, downgraded to property query, 
please confirm if it meets expectation.")
 
-            result: List[Any] = 
self._client.gremlin().exec(gremlin=gremlin_query)["data"]
-            graph_chain_knowledge, vertex_degree_list, knowledge_with_degree = 
self._format_graph_from_query_result(
-                query_result=result
+            paths: List[Any] = 
self._client.gremlin().exec(gremlin=gremlin_query)["data"]
+            graph_chain_knowledge, vertex_degree_list, knowledge_with_degree = 
self._format_graph_query_result(
+                query_paths=paths
             )
 
         context["graph_result"] = list(graph_chain_knowledge)
         context["vertex_degree_list"] = [list(vertex_degree) for vertex_degree 
in vertex_degree_list]
         context["knowledge_with_degree"] = knowledge_with_degree
         context["graph_context_head"] = (
-            f"The following are knowledge sequence in max depth 
{self._max_deep} "
-            f"in the form of directed graph like:\n"
-            "`subject -[predicate]-> object <-[predicate_next_hop]- 
object_next_hop ...`"
+            f"The following are graph knowledge in {self._max_deep} depth, 
e.g:\n"
+            "`vertexA --[links]--> vertexB <--[links]-- vertexC ...`"
             "extracted based on key entities as subject:\n"
         )
 
@@ -172,7 +175,7 @@ class GraphRAGQuery:
         verbose = context.get("verbose") or False
         if verbose:
             print("\033[93mKnowledge from Graph:")
-            print("\n".join(chain for chain in context["graph_result"]) + 
"\033[0m")
+            print("\n".join(context["graph_result"]) + "\033[0m")
 
         return context
 
@@ -184,65 +187,93 @@ class GraphRAGQuery:
             knowledge.add(node_str)
         return knowledge
 
-    def _format_graph_from_query_result(
-        self, query_result: List[Any]
-    ) -> Tuple[Set[str], List[Set[str]], Dict[str, List[str]]]:
+    def _format_graph_query_result(self, query_paths) -> Tuple[Set[str], 
List[Set[str]], Dict[str, List[str]]]:
         use_id_to_match = self._prop_to_match is None
-        knowledge = set()
-        knowledge_with_degree = {}
+        subgraph = set()
+        subgraph_with_degree = {}
         vertex_degree_list: List[Set[str]] = []
-        for line in query_result:
-            flat_rel = ""
-            raw_flat_rel = line["objects"]
-            assert len(raw_flat_rel) % 2 == 1
-            node_cache = set()
-            prior_edge_str_len = 0
-            depth = 0
-            nodes_with_degree = []
-            for i, item in enumerate(raw_flat_rel):
-                if i % 2 == 0:
-                    matched_str = item["id"] if use_id_to_match else 
item["props"][self._prop_to_match]
-                    if matched_str in node_cache:
-                        flat_rel = flat_rel[:-prior_edge_str_len]
-                        break
-                    node_cache.add(matched_str)
-                    props_str = ", ".join(f"{k}: {v}" for k, v in 
item["props"].items())
-                    node_str = f"{item['id']}{{{props_str}}}"
-                    flat_rel += node_str
-                    nodes_with_degree.append(node_str)
-                    if flat_rel in knowledge:
-                        knowledge.remove(flat_rel)
-                        knowledge_with_degree.pop(flat_rel)
-                    if depth >= len(vertex_degree_list):
-                        vertex_degree_list.append(set())
-                    vertex_degree_list[depth].add(node_str)
-                    depth += 1
-                else:
-                    props_str = ", ".join(f"{k}: {v}" for k, v in 
item["props"].items())
-                    props_str = f"{{{props_str}}}" if len(props_str) > 0 else 
""
-                    prev_matched_str = (
-                        raw_flat_rel[i - 1]["id"]
-                        if use_id_to_match
-                        else raw_flat_rel[i - 1]["props"][self._prop_to_match]
-                    )
-                    if item["outV"] == prev_matched_str:
-                        edge_str = f" -[{item['label']}{props_str}]-> "
-                    else:
-                        edge_str = f" <-[{item['label']}{props_str}]- "
-                    flat_rel += edge_str
-                    prior_edge_str_len = len(edge_str)
-            knowledge.add(flat_rel)
-            knowledge_with_degree[flat_rel] = nodes_with_degree
-        return knowledge, vertex_degree_list, knowledge_with_degree
+
+        for path in query_paths:
+            # 1. Process each path
+            flat_rel, nodes_with_degree = self._process_path(path, 
use_id_to_match)
+            subgraph.add(flat_rel)
+            subgraph_with_degree[flat_rel] = nodes_with_degree
+            # 2. Update vertex degree list
+            self._update_vertex_degree_list(vertex_degree_list, 
nodes_with_degree)
+
+        return subgraph, vertex_degree_list, subgraph_with_degree
+
+    def _process_path(self, path: Any, use_id_to_match: bool) -> Tuple[str, 
List[str]]:
+        flat_rel = ""
+        raw_flat_rel = path["objects"]
+        assert len(raw_flat_rel) % 2 == 1, "The length of raw_flat_rel should 
be odd."
+
+        node_cache = set()
+        prior_edge_str_len = 0
+        depth = 0
+        nodes_with_degree = []
+
+        for i, item in enumerate(raw_flat_rel):
+            if i % 2 == 0:
+                # Process each vertex
+                flat_rel, prior_edge_str_len, depth = self._process_vertex(
+                    item, flat_rel, node_cache, prior_edge_str_len, depth, 
nodes_with_degree, use_id_to_match
+                )
+            else:
+                # Process each edge
+                flat_rel, prior_edge_str_len = self._process_edge(
+                    item, flat_rel, prior_edge_str_len, raw_flat_rel, 
i,use_id_to_match
+                )
+
+        return flat_rel, nodes_with_degree
+
+    def _process_vertex(self, item: Any, flat_rel: str, node_cache: Set[str],
+                        prior_edge_str_len: int, depth: int, 
nodes_with_degree: List[str],
+                        use_id_to_match: bool) -> Tuple[str, int, int]:
+        matched_str = item["id"] if use_id_to_match else 
item["props"][self._prop_to_match]
+        if matched_str in node_cache:
+            flat_rel = flat_rel[:-prior_edge_str_len]
+            return flat_rel, prior_edge_str_len, depth
+
+        node_cache.add(matched_str)
+        props_str = ", ".join(f"{k}: {v}" for k, v in item["props"].items())
+        node_str = f"{item['id']}{{{props_str}}}"
+        flat_rel += node_str
+        nodes_with_degree.append(node_str)
+        depth += 1
+
+        return flat_rel, prior_edge_str_len, depth
+
+    def _process_edge(self, item: Any, flat_rel: str, prior_edge_str_len: int,
+                      raw_flat_rel: List[Any], i: int, use_id_to_match: bool) 
-> Tuple[str, int]:
+        props_str = ", ".join(f"{k}: {v}" for k, v in item["props"].items())
+        props_str = f"{{{props_str}}}" if len(props_str) > 0 else ""
+        prev_matched_str = raw_flat_rel[i - 1]["id"] if use_id_to_match else 
raw_flat_rel[i - 1]["props"][self._prop_to_match]
+
+        if item["outV"] == prev_matched_str:
+            edge_str = f" --[{item['label']}{props_str}]--> "
+        else:
+            edge_str = f" <--[{item['label']}{props_str}]-- "
+
+        flat_rel += edge_str
+        prior_edge_str_len = len(edge_str)
+        return flat_rel, prior_edge_str_len
+
+    def _update_vertex_degree_list(self, vertex_degree_list: List[Set[str]], 
nodes_with_degree: List[str]) -> None:
+        for depth, node_str in enumerate(nodes_with_degree):
+            if depth >= len(vertex_degree_list):
+                vertex_degree_list.append(set())
+            vertex_degree_list[depth].add(node_str)
 
     def _extract_labels_from_schema(self) -> Tuple[List[str], List[str]]:
         schema = self._get_graph_schema()
-        node_props_str, edge_props_str = schema.split("\n")[:2]
-        node_props_str = node_props_str[len("Node properties: 
"):].strip("[").strip("]")
+        vertex_props_str, edge_props_str = schema.split("\n")[:2]
+        # TODO: rename to vertex (also need update in the schema)
+        vertex_props_str = vertex_props_str[len("Vertex properties: 
"):].strip("[").strip("]")
         edge_props_str = edge_props_str[len("Edge properties: 
"):].strip("[").strip("]")
-        node_labels = self._extract_label_names(node_props_str)
+        vertex_labels = self._extract_label_names(vertex_props_str)
         edge_labels = self._extract_label_names(edge_props_str)
-        return node_labels, edge_labels
+        return vertex_labels, edge_labels
 
     @staticmethod
     def _extract_label_names(source: str, head: str = "name: ", tail: str = ", 
") -> List[str]:
@@ -254,19 +285,6 @@ class GraphRAGQuery:
                 result.append(label)
         return result
 
-    def _get_graph_id_format(self) -> str:
-        sample = self._client.gremlin().exec("g.V().limit(1)")["data"]
-        if len(sample) == 0:
-            return "EMPTY"
-        sample_id = sample[0]["id"]
-        if isinstance(sample_id, int):
-            return "INT"
-        if isinstance(sample_id, str):
-            if re.match(r"^\d+:.*", sample_id):
-                return "INT:STRING"
-            return "STRING"
-        return "UNKNOWN"
-
     def _get_graph_schema(self, refresh: bool = False) -> str:
         if self._schema and not refresh:
             return self._schema
@@ -277,8 +295,9 @@ class GraphRAGQuery:
         relationships = schema.getRelations()
 
         self._schema = (
-            f"Node properties: {vertex_schema}\n"
+            f"Vertex properties: {vertex_schema}\n"
             f"Edge properties: {edge_schema}\n"
             f"Relationships: {relationships}\n"
         )
+        log.debug("Link(Relation): %s", relationships)
         return self._schema
diff --git 
a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py 
b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
index d22d00a..2df3a04 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
@@ -20,6 +20,7 @@ import os
 from copy import deepcopy
 from typing import Dict, Any, Literal, List, Tuple
 
+from hugegraph_llm.utils.log import log
 from pyhugegraph.client import PyHugeClient
 from hugegraph_llm.config import resource_path, settings
 from hugegraph_llm.indices.vector_index import VectorIndex
@@ -62,6 +63,7 @@ class SemanticIdQuery:
             exact_match_vids, unmatched_vids = 
self._exact_match_vids(context["keywords"])
             graph_query_list.extend(exact_match_vids)
             fuzzy_match_vids = self._fuzzy_match_vids(unmatched_vids)
+            log.debug("Fuzzy match vids: %s", fuzzy_match_vids)
             graph_query_list.extend(fuzzy_match_vids)
         context["match_vids"] = list(set(graph_query_list))
         return context
@@ -71,6 +73,7 @@ class SemanticIdQuery:
         possible_vids = deepcopy(keywords)
         for i in range(vertex_label_num):
             possible_vids.extend([f"{i+1}:{keyword}" for keyword in keywords])
+
         vids_str = ",".join([f"'{vid}'" for vid in possible_vids])
         resp = 
self._client.gremlin().exec(SemanticIdQuery.ID_QUERY_TEMPL.format(vids_str=vids_str))
         searched_vids = [v['id'] for v in resp['data']]
diff --git 
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py 
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
index 0df4051..2cad98d 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
@@ -23,8 +23,11 @@ from hugegraph_llm.models.llms.base import BaseLLM
 from hugegraph_llm.models.llms.init_llm import LLMs
 from hugegraph_llm.operators.common_op.nltk_helper import NLTKHelper
 
-KEYWORDS_EXTRACT_TPL = """extract {max_keywords} keywords from the text:
+KEYWORDS_EXTRACT_TPL = """Extract {max_keywords} keywords from the text:
 {question}
+
+1. Keywords can't contain meaningless/broad words(e.g action/relation/thing), 
must represent certain entities,  
+2. Better to extract subject/verb/object and don't extract particles, don't 
extend to synonyms/general categories.
 Provide keywords in the following comma-separated format: 'KEYWORDS: 
<keywords>'
 """
 
@@ -73,10 +76,7 @@ class KeywordExtract:
         if isinstance(context.get("max_keywords"), int):
             self._max_keywords = context["max_keywords"]
 
-        prompt = self._extract_template.format(
-            question=self._query,
-            max_keywords=self._max_keywords,
-        )
+        prompt = self._extract_template.format(question=self._query, 
max_keywords=self._max_keywords)
         response = self._llm.generate(prompt=prompt)
 
         keywords = self._extract_keywords_from_response(
@@ -95,10 +95,7 @@ class KeywordExtract:
         return context
 
     def _expand_synonyms(self, keywords: Set[str]) -> Set[str]:
-        prompt = self._expand_template.format(
-            question=str(keywords),
-            max_keywords=self._max_keywords,
-        )
+        prompt = self._expand_template.format(question=str(keywords), 
max_keywords=self._max_keywords)
         response = self._llm.generate(prompt=prompt)
         keywords = self._extract_keywords_from_response(
             response=response, lowercase=False, start_token="SYNONYMS:"
@@ -113,11 +110,12 @@ class KeywordExtract:
     ) -> Set[str]:
         keywords = []
         matches = re.findall(rf'{start_token}[^\n]+\n?', response)
+
         for match in matches:
             match = match[len(start_token):]
             for k in re.split(r"[,,]+", match):
                 k = k.strip()
-                if len(k) > 0:
+                if len(k) > 1:
                     if lowercase:
                         keywords.append(k.lower())
                     else:
diff --git a/hugegraph-python-client/src/pyhugegraph/api/schema.py 
b/hugegraph-python-client/src/pyhugegraph/api/schema.py
index fdebc9d..8b4f54c 100644
--- a/hugegraph-python-client/src/pyhugegraph/api/schema.py
+++ b/hugegraph-python-client/src/pyhugegraph/api/schema.py
@@ -115,6 +115,14 @@ class SchemaManager(HugeParamsBase):
 
     @router.http("GET", "schema/edgelabels")
     def getRelations(self) -> Optional[List[str]]:
+        """
        Retrieve all edge_label links/paths from the graph-server.
+
+        Returns a list of links representations for each edge_label, e.g:
+        The format is like 
"source_vertexlabel--edge_label-->target_vertexlabel".(e.g. 
"Person--likes-->Animal")
+
+        :return: A list of relationship links/paths for all edge_labels, or 
None if not found.
+        """
         if response := self._invoke_request():
             return [EdgeLabelData(item).relations() for item in 
response["edgelabels"]]
         return None

Reply via email to