This is an automated email from the ASF dual-hosted git repository.
jin pushed a commit to branch graphdata
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
The following commit(s) were added to refs/heads/graphdata by this push:
new ccbdf33 refact: enhance a string of points for graphrag
ccbdf33 is described below
commit ccbdf3395a14f016df0f4abef371da37d5c8d6fe
Author: imbajin <[email protected]>
AuthorDate: Wed Sep 18 17:13:12 2024 +0800
refact: enhance a string of points for graphrag
---
hugegraph-llm/README.md | 6 +-
.../src/hugegraph_llm/demo/rag_web_demo.py | 53 ++++----
.../src/hugegraph_llm/models/llms/init_llm.py | 4 +-
.../src/hugegraph_llm/models/llms/openai.py | 4 +-
.../operators/document_op/word_extract.py | 3 +-
.../src/hugegraph_llm/operators/graph_rag_task.py | 37 ++----
.../operators/hugegraph_op/graph_rag_query.py | 142 ++++++++++-----------
.../operators/hugegraph_op/schema_manager.py | 2 -
.../operators/index_op/semantic_id_query.py | 12 +-
.../operators/llm_op/answer_synthesize.py | 8 +-
.../operators/llm_op/disambiguate_data.py | 2 +-
.../operators/llm_op/keyword_extract.py | 41 +++---
.../src/hugegraph_llm/utils/decorators.py | 2 +-
.../src/hugegraph_llm/utils/graph_index_utils.py | 3 +-
14 files changed, 145 insertions(+), 174 deletions(-)
diff --git a/hugegraph-llm/README.md b/hugegraph-llm/README.md
index 25de21b..51e8e6d 100644
--- a/hugegraph-llm/README.md
+++ b/hugegraph-llm/README.md
@@ -134,15 +134,15 @@ The `GraphRAG` class is used to integrate HugeGraph with large language models t
Here is a brief usage guide:
1. **Extract Keyword:**: Extract keywords and expand synonyms.
-
+
```python
- graph_rag.extract_keyword(text="Tell me about Al Pacino.").print_result()
+ graph_rag.extract_keywords(text="Tell me about Al Pacino.").print_result()
```
2. **Query Graph for Rag**: Retrieve the corresponding keywords and their multi-degree associated relationships from HugeGraph.
```python
- graph_rag.query_graph_for_rag(
+ graph_rag.query_graphdb(
max_deep=2,
max_items=30
).print_result()
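For reference, the renamed README chain reads end to end roughly like this (a minimal sketch; it assumes a `GraphRAG` instance is already constructed with default connection settings, which the README excerpt does not show):

    # Hypothetical setup: construction arguments are omitted in the README excerpt.
    graph_rag = GraphRAG()
    # 1. Extract keywords (and synonyms) from the question.
    graph_rag.extract_keywords(text="Tell me about Al Pacino.").print_result()
    # 2. Fetch the keywords' multi-hop neighborhood from HugeGraph.
    graph_rag.query_graphdb(max_deep=2, max_items=30).print_result()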
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py
index 6dc60b1..b8c772a 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py
@@ -68,8 +68,16 @@ def rag_answer(
custom_related_information: str,
answer_prompt: str,
) -> Tuple:
-
- if prompt.default_question != text or prompt.custom_rerank_info != custom_related_information or prompt.answer_prompt != answer_prompt:
+ """
+ Generate an answer using the RAG (Retrieval-Augmented Generation) pipeline.
+ 1. Initialize the RAGPipeline.
+ 2. Select vector search or graph search based on parameters.
+ 3. Merge, deduplicate, and rerank the results.
+ 4. Synthesize the final answer.
+ 5. Run the pipeline and return the results.
+ """
+ should_update_prompt = prompt.default_question != text or prompt.answer_prompt != answer_prompt
+ if should_update_prompt or prompt.custom_rerank_info != custom_related_information:
prompt.custom_rerank_info = custom_related_information
prompt.default_question = text
prompt.answer_prompt = answer_prompt
@@ -77,30 +85,23 @@ def rag_answer(
vector_search = vector_only_answer or graph_vector_answer
graph_search = graph_only_answer or graph_vector_answer
-
if raw_answer is False and not vector_search and not graph_search:
gr.Warning("Please select at least one generate mode.")
return "", "", "", ""
- searcher = RAGPipeline()
+
+ rag = RAGPipeline()
if vector_search:
- searcher.query_vector_index_for_rag()
+ rag.query_vector_index()
if graph_search:
- searcher.extract_keyword().match_keyword_to_id().query_graph_for_rag()
+ rag.extract_keywords().keywords_to_vid().query_graphdb()
# TODO: add more user-defined search strategies
- searcher.merge_dedup_rerank(
- graph_ratio, rerank_method, near_neighbor_first, custom_related_information
- ).synthesize_answer(
- raw_answer=raw_answer,
- vector_only_answer=vector_only_answer,
- graph_only_answer=graph_only_answer,
- graph_vector_answer=graph_vector_answer,
- answer_prompt=answer_prompt,
- )
+ rag.merge_dedup_rerank(graph_ratio, rerank_method, near_neighbor_first, custom_related_information)
+ rag.synthesize_answer(raw_answer, vector_only_answer, graph_only_answer, graph_vector_answer, answer_prompt)
try:
- context = searcher.run(verbose=True, query=text, vector_search=vector_search, graph_search=graph_search)
+ context = rag.run(verbose=True, query=text, vector_search=vector_search, graph_search=graph_search)
if context.get("switch_to_bleu"):
- gr.Warning("Online reranker fails, automatically switches to local
bleu method.")
+ gr.Warning("Online reranker fails, automatically switches to local
bleu rerank.")
return (
context.get("raw_answer", ""),
context.get("vector_only_answer", ""),
@@ -108,10 +109,10 @@ def rag_answer(
context.get("graph_vector_answer", ""),
)
except ValueError as e:
- log.error(e)
+ log.critical(e)
raise gr.Error(str(e))
except Exception as e:
- log.error(e)
+ log.critical(e)
raise gr.Error(f"An unexpected error occurred: {str(e)}")
@@ -665,19 +666,13 @@ if __name__ == "__main__":
parser.add_argument("--port", type=int, default=8001, help="port")
args = parser.parse_args()
app = FastAPI()
- app_auth = APIRouter(dependencies=[Depends(authenticate)])
+ api_auth = APIRouter(dependencies=[Depends(authenticate)])
hugegraph_llm = init_rag_ui()
- rag_http_api(
- app_auth,
- rag_answer,
- apply_graph_config,
- apply_llm_config,
- apply_embedding_config,
- apply_reranker_config,
- )
+ rag_http_api(api_auth, rag_answer, apply_graph_config, apply_llm_config, apply_embedding_config,
+ apply_reranker_config)
- app.include_router(app_auth)
+ app.include_router(api_auth)
auth_enabled = os.getenv("ENABLE_LOGIN", "False").lower() == "true"
log.info("(Status) Authentication is %s now.", "enabled" if auth_enabled
else "disabled")
# TODO: support multi-user login when need
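The APIRouter rename above leans on a standard FastAPI pattern: every route registered on a router that declares a dependency runs that dependency first. A self-contained sketch (the `authenticate` body and the `/ping` route are placeholders for illustration, not the project's real ones):

    from fastapi import APIRouter, Depends, FastAPI, HTTPException, status

    app = FastAPI()

    def authenticate() -> None:
        # Placeholder check; the real one is gated by the ENABLE_LOGIN env var.
        authorized = True
        if not authorized:
            raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED)

    api_auth = APIRouter(dependencies=[Depends(authenticate)])

    @api_auth.get("/ping")  # hypothetical route for illustration
    def ping() -> dict:
        return {"ok": True}

    app.include_router(api_auth)  # every api_auth route now requires authenticate()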
diff --git a/hugegraph-llm/src/hugegraph_llm/models/llms/init_llm.py b/hugegraph-llm/src/hugegraph_llm/models/llms/init_llm.py
index 22a82cb..2c90748 100644
--- a/hugegraph-llm/src/hugegraph_llm/models/llms/init_llm.py
+++ b/hugegraph-llm/src/hugegraph_llm/models/llms/init_llm.py
@@ -17,7 +17,7 @@
from hugegraph_llm.models.llms.ollama import OllamaClient
-from hugegraph_llm.models.llms.openai import OpenAIChat
+from hugegraph_llm.models.llms.openai import OpenAIClient
from hugegraph_llm.models.llms.qianfan import QianfanClient
from hugegraph_llm.config import settings
@@ -34,7 +34,7 @@ class LLMs:
secret_key=settings.qianfan_secret_key
)
if self.llm_type == "openai":
- return OpenAIChat(
+ return OpenAIClient(
api_key=settings.openai_api_key,
api_base=settings.openai_api_base,
model_name=settings.openai_language_model,
diff --git a/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
b/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
index 30ac805..bfdb83b 100644
--- a/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
+++ b/hugegraph-llm/src/hugegraph_llm/models/llms/openai.py
@@ -27,8 +27,8 @@ from hugegraph_llm.models.llms.base import BaseLLM
from hugegraph_llm.utils.log import log
-class OpenAIChat(BaseLLM):
- """Wrapper around OpenAI Chat large language models."""
+class OpenAIClient(BaseLLM):
+ """Wrapper for OpenAI Client."""
def __init__(
self,
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
index 43d0a29..b8f40df 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/document_op/word_extract.py
@@ -62,7 +62,8 @@ class WordExtract:
verbose = context.get("verbose") or False
if verbose:
- print(f"\033[92mKEYWORDS: {context['keywords']}\033[0m")
+ from hugegraph_llm.utils.log import log
+ log.info(f"KEYWORDS: {context['keywords']}")
return context
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
index 91bc7b3..dd75b18 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
@@ -50,11 +50,7 @@ class RAGPipeline:
self._embedding = embedding or Embeddings().get_embedding()
self._operators: List[Any] = []
- def extract_word(
- self,
- text: Optional[str] = None,
- language: str = "english",
- ):
+ def extract_word(self, text: Optional[str] = None, language: str = "english"):
"""
Add a word extraction operator to the pipeline.
@@ -62,15 +58,10 @@ class RAGPipeline:
:param language: Language of the text.
:return: Self-instance for chaining.
"""
- self._operators.append(
- WordExtract(
- text=text,
- language=language,
- )
- )
+ self._operators.append(WordExtract(text=text, language=language))
return self
- def extract_keyword(
+ def extract_keywords(
self,
text: Optional[str] = None,
max_keywords: int = 5,
@@ -99,7 +90,7 @@ class RAGPipeline:
)
return self
- def match_keyword_to_id(
+ def keywords_to_vid(
self,
by: Literal["query", "keywords"] = "keywords",
topk_per_keyword: int = 1,
@@ -107,8 +98,9 @@ class RAGPipeline:
):
"""
Add a semantic ID query operator to the pipeline.
-
+ :param by: Match by query or keywords.
:param topk_per_keyword: Top K results per keyword.
+ :param topk_per_query: Top K results per query.
:return: Self-instance for chaining.
"""
self._operators.append(
@@ -121,7 +113,7 @@ class RAGPipeline:
)
return self
- def query_graph_for_rag(
+ def query_graphdb(
self,
max_deep: int = 2,
max_items: int = 30,
@@ -136,15 +128,11 @@ class RAGPipeline:
:return: Self-instance for chaining.
"""
self._operators.append(
- GraphRAGQuery(
- max_deep=max_deep,
- max_items=max_items,
- prop_to_match=prop_to_match,
- )
+ GraphRAGQuery(max_deep=max_deep, max_items=max_items, prop_to_match=prop_to_match)
)
return self
- def query_vector_index_for_rag(self, max_items: int = 3):
+ def query_vector_index(self, max_items: int = 3):
"""
Add a vector index query operator to the pipeline.
@@ -152,10 +140,7 @@ class RAGPipeline:
:return: Self-instance for chaining.
"""
self._operators.append(
- VectorIndexQuery(
- embedding=self._embedding,
- topk=max_items,
- )
+ VectorIndexQuery(embedding=self._embedding, topk=max_items, )
)
return self
@@ -230,7 +215,7 @@ class RAGPipeline:
:return: Final context after all operators have been executed.
"""
if len(self._operators) == 0:
- self.extract_keyword().query_graph_for_rag().synthesize_answer()
+ self.extract_keywords().query_graphdb().synthesize_answer()
context = kwargs
context["llm"] = self._llm
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
index 2f08f11..cecb89c 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
@@ -20,64 +20,58 @@ import re
from typing import Any, Dict, Optional, List, Set, Tuple
from hugegraph_llm.config import settings
+from hugegraph_llm.utils.log import log
from pyhugegraph.client import PyHugeClient
+VERTEX_QUERY_TPL = "g.V({keywords}).as('subj').toList()"
-class GraphRAGQuery:
- VERTEX_GREMLIN_QUERY_TEMPL = "g.V().hasId({keywords}).as('subj').toList()"
- # ID_RAG_GREMLIN_QUERY_TEMPL = "g.V().hasId({keywords}).as('subj').repeat(bothE({edge_labels}).as('rel').otherV(
- # ).as('obj')).times({max_deep}).path().by(project('label', 'id', 'props').by(label()).by(id()).by(valueMap().by(
- # unfold()))).by(project('label', 'inV', 'outV', 'props').by(label()).by(inV().id()).by(outV().id()).by(valueMap(
- # ).by(unfold()))).limit({max_items}).toList()"
+# TODO: we could use a simpler query (like kneighbor-api to get the edges)
+# TODO: use dedup() to filter duplicate paths
+ID_QUERY_NEIGHBOR_TPL = """
+g.V({keywords}).as('subj')
+.repeat(
+ bothE({edge_labels}).as('rel').otherV().as('obj')
+).times({max_deep})
+.path()
+.by(project('label', 'id', 'props')
+ .by(label())
+ .by(id())
+ .by(valueMap().by(unfold()))
+)
+.by(project('label', 'inV', 'outV', 'props')
+ .by(label())
+ .by(inV().id())
+ .by(outV().id())
+ .by(valueMap().by(unfold()))
+)
+.limit({max_items})
+.toList()
+"""
+
+PROPERTY_QUERY_NEIGHBOR_TPL = """
+g.V().has('{prop}', within({keywords})).as('subj')
+.repeat(
+ bothE({edge_labels}).as('rel').otherV().as('obj')
+).times({max_deep})
+.path()
+.by(project('label', 'props')
+ .by(label())
+ .by(valueMap().by(unfold()))
+)
+.by(project('label', 'inV', 'outV', 'props')
+ .by(label())
+ .by(inV().values('{prop}'))
+ .by(outV().values('{prop}'))
+ .by(valueMap().by(unfold()))
+)
+.limit({max_items})
+.toList()
+"""
- # TODO: we could use a simpler query (like kneighbor-api to get the edges)
- ID_RAG_GREMLIN_QUERY_TEMPL = """
- g.V().hasId({keywords}).as('subj')
- .repeat(
- bothE({edge_labels}).as('rel').otherV().as('obj')
- ).times({max_deep})
- .path()
- .by(project('label', 'id', 'props')
- .by(label())
- .by(id())
- .by(valueMap().by(unfold()))
- )
- .by(project('label', 'inV', 'outV', 'props')
- .by(label())
- .by(inV().id())
- .by(outV().id())
- .by(valueMap().by(unfold()))
- )
- .limit({max_items})
- .toList()
- """
- PROP_RAG_GREMLIN_QUERY_TEMPL = """
- g.V().has('{prop}', within({keywords})).as('subj')
- .repeat(
- bothE({edge_labels}).as('rel').otherV().as('obj')
- ).times({max_deep})
- .path()
- .by(project('label', 'props')
- .by(label())
- .by(valueMap().by(unfold()))
- )
- .by(project('label', 'inV', 'outV', 'props')
- .by(label())
- .by(inV().values('{prop}'))
- .by(outV().values('{prop}'))
- .by(valueMap().by(unfold()))
- )
- .limit({max_items})
- .toList()
- """
+class GraphRAGQuery:
- def __init__(
- self,
- max_deep: int = 2,
- max_items: int = 30,
- prop_to_match: Optional[str] = None,
- ):
+ def __init__(self, max_deep: int = 2, max_items: int = 30, prop_to_match: Optional[str] = None):
self._client = PyHugeClient(
settings.graph_ip,
settings.graph_port,
@@ -106,7 +100,7 @@ class GraphRAGQuery:
assert self._client is not None, "No valid graph to search."
keywords = context.get("keywords")
- entrance_vids = context.get("entrance_vids")
+ match_vids = context.get("match_vids")
if isinstance(context.get("max_deep"), int):
self._max_deep = context["max_deep"]
@@ -120,21 +114,24 @@ class GraphRAGQuery:
use_id_to_match = self._prop_to_match is None
if use_id_to_match:
- if not entrance_vids:
+ if not match_vids:
return context
- rag_gremlin_query = self.VERTEX_GREMLIN_QUERY_TEMPL.format(keywords=entrance_vids)
- result: List[Any] = self._client.gremlin().exec(gremlin=rag_gremlin_query)["data"]
+ gremlin_query = VERTEX_QUERY_TPL.format(keywords=match_vids)
+ result: List[Any] = self._client.gremlin().exec(gremlin=gremlin_query)["data"]
+ log.debug(f"Vids query: {gremlin_query}")
- vertex_knowledge = self._format_knowledge_from_vertex(query_result=result)
- rag_gremlin_query = self.ID_RAG_GREMLIN_QUERY_TEMPL.format(
- keywords=entrance_vids,
+ vertex_knowledge = self._format_graph_from_vertex(query_result=result)
+ gremlin_query = ID_QUERY_NEIGHBOR_TPL.format(
+ keywords=match_vids,
max_deep=self._max_deep,
max_items=self._max_items,
edge_labels=edge_labels_str,
)
- result: List[Any] = self._client.gremlin().exec(gremlin=rag_gremlin_query)["data"]
- graph_chain_knowledge, vertex_degree_list, knowledge_with_degree = self._format_knowledge_from_query_result(
+ log.debug(f"Kneighbor query: {gremlin_query}")
+
+ result: List[Any] = self._client.gremlin().exec(gremlin=gremlin_query)["data"]
+ graph_chain_knowledge, vertex_degree_list, knowledge_with_degree = self._format_graph_from_query_result(
query_result=result
)
graph_chain_knowledge.update(vertex_knowledge)
@@ -143,17 +140,20 @@ class GraphRAGQuery:
else:
vertex_degree_list.append(vertex_knowledge)
else:
+ # WARN: When will the query enter here?
assert keywords, "No related property(keywords) for graph query."
keywords_str = ",".join("'" + kw + "'" for kw in keywords)
- rag_gremlin_query = self.PROP_RAG_GREMLIN_QUERY_TEMPL.format(
+ gremlin_query = PROPERTY_QUERY_NEIGHBOR_TPL.format(
prop=self._prop_to_match,
keywords=keywords_str,
max_deep=self._max_deep,
max_items=self._max_items,
edge_labels=edge_labels_str,
)
- result: List[Any] = self._client.gremlin().exec(gremlin=rag_gremlin_query)["data"]
- graph_chain_knowledge, vertex_degree_list, knowledge_with_degree = self._format_knowledge_from_query_result(
+ log.warning("Unable to find vid, downgraded to property query, please confirm if it meets expectation.")
+
+ result: List[Any] = self._client.gremlin().exec(gremlin=gremlin_query)["data"]
+ graph_chain_knowledge, vertex_degree_list, knowledge_with_degree = self._format_graph_from_query_result(
query_result=result
)
@@ -175,7 +175,7 @@ class GraphRAGQuery:
return context
- def _format_knowledge_from_vertex(self, query_result: List[Any]) -> Set[str]:
+ def _format_graph_from_vertex(self, query_result: List[Any]) -> Set[str]:
knowledge = set()
for item in query_result:
props_str = ", ".join(f"{k}: {v}" for k, v in
item["properties"].items())
@@ -183,7 +183,7 @@ class GraphRAGQuery:
knowledge.add(node_str)
return knowledge
- def _format_knowledge_from_query_result(
+ def _format_graph_from_query_result(
self, query_result: List[Any]
) -> Tuple[Set[str], List[Set[str]], Dict[str, List[str]]]:
use_id_to_match = self._prop_to_match is None
@@ -237,18 +237,14 @@ class GraphRAGQuery:
def _extract_labels_from_schema(self) -> Tuple[List[str], List[str]]:
schema = self._get_graph_schema()
node_props_str, edge_props_str = schema.split("\n")[:2]
- node_props_str = node_props_str[len("Node properties: ") :].strip("[").strip("]")
- edge_props_str = edge_props_str[len("Edge properties: ") :].strip("[").strip("]")
+ node_props_str = node_props_str[len("Node properties: "):].strip("[").strip("]")
+ edge_props_str = edge_props_str[len("Edge properties: "):].strip("[").strip("]")
node_labels = self._extract_label_names(node_props_str)
edge_labels = self._extract_label_names(edge_props_str)
return node_labels, edge_labels
@staticmethod
- def _extract_label_names(
- source: str,
- head: str = "name: ",
- tail: str = ", ",
- ) -> List[str]:
+ def _extract_label_names(source: str, head: str = "name: ", tail: str = ", ") -> List[str]:
result = []
for s in source.split(head):
end = s.find(tail)
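For clarity, the module-level templates above are plain format strings; GraphRAGQuery.run() fills them in before handing the query to the Gremlin endpoint. A sketch with hypothetical values (the vid literal and the empty edge-label list are assumptions for illustration):

    match_vids = "'1:Al Pacino'"  # assumed vid literal; real vids come from SemanticIdQuery
    gremlin_query = ID_QUERY_NEIGHBOR_TPL.format(
        keywords=match_vids,
        edge_labels="",   # empty -> traverse all edge labels
        max_deep=2,
        max_items=30,
    )
    result = client.gremlin().exec(gremlin=gremlin_query)["data"]  # client: a PyHugeClient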
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
index a61063f..57da3ef 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/schema_manager.py
@@ -33,8 +33,6 @@ class SchemaManager:
)
self.schema = self.client.schema()
- # FIXME: This method is not working as expected
- # def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
def run(self, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
if context is None:
context = {}
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
index 6cd8620..d22d00a 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py
@@ -51,19 +51,19 @@ class SemanticIdQuery:
)
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
- graph_query_entrance = []
+ graph_query_list = []
if self.by == "query":
query = context["query"]
query_vector = self.embedding.get_text_embedding(query)
results = self.vector_index.search(query_vector, top_k=self.topk_per_query)
if results:
- graph_query_entrance.extend(results[:self.topk_per_query])
+ graph_query_list.extend(results[:self.topk_per_query])
else: # by keywords
exact_match_vids, unmatched_vids = self._exact_match_vids(context["keywords"])
- graph_query_entrance.extend(exact_match_vids)
+ graph_query_list.extend(exact_match_vids)
fuzzy_match_vids = self._fuzzy_match_vids(unmatched_vids)
- graph_query_entrance.extend(fuzzy_match_vids)
- context["entrance_vids"] = list(set(graph_query_entrance))
+ graph_query_list.extend(fuzzy_match_vids)
+ context["match_vids"] = list(set(graph_query_list))
return context
def _exact_match_vids(self, keywords: List[str]) -> Tuple[List[str], List[str]]:
@@ -89,4 +89,4 @@ class SemanticIdQuery:
results = self.vector_index.search(keyword_vector, top_k=self.topk_per_keyword)
if results:
fuzzy_match_result.extend(results[:self.topk_per_keyword])
- return fuzzy_match_result
+ return fuzzy_match_result # FIXME: type mismatch, got 'list[dict[str, Any]]' instead
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
index fda6eb7..3d1e018 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
@@ -22,7 +22,7 @@ from typing import Any, Dict, Optional
from hugegraph_llm.models.llms.base import BaseLLM
from hugegraph_llm.models.llms.init_llm import LLMs
from hugegraph_llm.config import prompt
-
+from hugegraph_llm.utils.log import log
DEFAULT_ANSWER_TEMPLATE = prompt.answer_prompt
@@ -87,13 +87,13 @@ class AnswerSynthesize:
graph_result = context.get("graph_result")
if graph_result:
- graph_context_head = context.get("graph_context_head",
- "The following are knowledge from
HugeGraph related to the query:\n")
+ graph_context_head = context.get("graph_context_head", "Knowledge
from graphdb for the query:\n")
graph_result_context = graph_context_head + "\n".join(
f"{i + 1}. {res}" for i, res in enumerate(graph_result)
)
else:
- graph_result_context = "No related knowledge found in graph for the query."
+ graph_result_context = "No related graph data found for current query."
+ log.warning(graph_result_context)
context = asyncio.run(self.async_generate(context, context_head_str, context_tail_str,
vector_result_context, graph_result_context))
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py
index fffa03c..dcc0ab6 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py
@@ -46,7 +46,7 @@ class DisambiguateData:
# only disambiguate triples
if "triples" in data:
# TODO: ensure the logic here
- log.debug(data)
+ # log.debug(data)
triples = data["triples"]
prompt = generate_disambiguate_prompt(triples)
llm_output = self.llm.generate(prompt=prompt)
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
index 3249670..5e58c3c 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
@@ -23,20 +23,18 @@ from hugegraph_llm.models.llms.base import BaseLLM
from hugegraph_llm.models.llms.init_llm import LLMs
from hugegraph_llm.operators.common_op.nltk_helper import NLTKHelper
-DEFAULT_KEYWORDS_EXTRACT_TEMPLATE_TMPL = """extract {max_keywords} keywords from the text:
- {question}
- Provide keywords in the following comma-separated format: 'KEYWORDS: <keywords>'
- """
-
-DEFAULT_KEYWORDS_EXPAND_TEMPLATE_TMPL = (
- "Generate synonyms or possible form of keywords up to {max_keywords} in
total,\n"
- "considering possible cases of capitalization, pluralization, common
expressions, etc.\n"
- "Provide all synonyms of keywords in comma-separated format: 'SYNONYMS:
<keywords>'\n"
- "Note, result should be in one-line with only one 'SYNONYMS: ' prefix\n"
- "----\n"
- "KEYWORDS: {question}\n"
- "----"
-)
+KEYWORDS_EXTRACT_TPL = """extract {max_keywords} keywords from the text:
+{question}
+Provide keywords in the following comma-separated format: 'KEYWORDS: <keywords>'
+"""
+
+KEYWORDS_EXPAND_TPL = """Generate synonyms or possible form of keywords up to {max_keywords} in total,
+considering possible cases of capitalization, pluralization, common expressions, etc.
+Provide all synonyms of keywords in comma-separated format: 'SYNONYMS: <keywords>'
+Note, result should be in one-line with only one 'SYNONYMS: ' prefix
+----
+KEYWORDS: {question}
+----"""
class KeywordExtract:
@@ -53,8 +51,8 @@ class KeywordExtract:
self._query = text
self._language = language.lower()
self._max_keywords = max_keywords
- self._extract_template = extract_template or DEFAULT_KEYWORDS_EXTRACT_TEMPLATE_TMPL
- self._expand_template = expand_template or DEFAULT_KEYWORDS_EXPAND_TEMPLATE_TMPL
+ self._extract_template = extract_template or KEYWORDS_EXTRACT_TPL
+ self._expand_template = expand_template or KEYWORDS_EXPAND_TPL
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
if self._query is None:
@@ -91,7 +89,8 @@ class KeywordExtract:
verbose = context.get("verbose") or False
if verbose:
- print(f"\033[92mKEYWORDS: {context['keywords']}\033[0m")
+ from hugegraph_llm.utils.log import log
+ log.info(f"KEYWORDS: {context['keywords']}")
# extracting keywords & expanding synonyms increase the call count by 2
context["call_count"] = context.get("call_count", 0) + 2
@@ -126,15 +125,11 @@ class KeywordExtract:
else:
keywords.append(k)
- # if the keyword consists of multiple words, split into sub-words
- # (removing stopwords)
+ # if the keyword consists of multiple words, split into sub-words (removing stopwords)
results = set()
for token in keywords:
results.add(token)
sub_tokens = re.findall(r"\w+", token)
if len(sub_tokens) > 1:
- results.update(
- {w for w in sub_tokens if w not in NLTKHelper().stopwords(lang=self._language)}
- )
-
+ results.update({w for w in sub_tokens if w not in NLTKHelper().stopwords(lang=self._language)})
return results
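The sub-word expansion kept at the end of KeywordExtract is easiest to see in isolation. A self-contained sketch of the same logic (the stopword set here is a stand-in for NLTKHelper().stopwords()):

    import re

    def expand_keywords(keywords: list, stopwords: set) -> set:
        # Keep every keyword, and for multi-word keywords also add
        # their sub-words, minus stopwords.
        results = set()
        for token in keywords:
            results.add(token)
            sub_tokens = re.findall(r"\w+", token)
            if len(sub_tokens) > 1:
                results.update(w for w in sub_tokens if w not in stopwords)
        return results

    print(expand_keywords(["Al Pacino"], {"the", "of"}))
    # -> {'Al Pacino', 'Al', 'Pacino'}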
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/decorators.py
b/hugegraph-llm/src/hugegraph_llm/utils/decorators.py
index 5f37d47..394acda 100644
--- a/hugegraph-llm/src/hugegraph_llm/utils/decorators.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/decorators.py
@@ -72,7 +72,7 @@ def log_operator_time(func: Callable) -> Callable:
# Only record time ≥ 0.01s (10ms)
if op_time >= 0.01:
log.debug("Operator %s finished in %.2f seconds",
operator.__class__.__name__, op_time)
- # log.debug("Context:\n%s", result)
+ # log.debug("Current context:\n%s", result)
return result
return wrapper
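For context, log_operator_time is a thin timing decorator around an operator's run method; the diff only touches its commented-out context dump. A sketch of the surrounding shape (assumes the wrapped callable takes the operator as its first argument, as the log line implies):

    import functools
    import time
    # assumes: from hugegraph_llm.utils.log import log

    def log_operator_time(func):
        @functools.wraps(func)
        def wrapper(operator, *args, **kwargs):
            start = time.perf_counter()
            result = func(operator, *args, **kwargs)
            op_time = time.perf_counter() - start
            if op_time >= 0.01:  # only record time >= 0.01s (10ms)
                log.debug("Operator %s finished in %.2f seconds",
                          operator.__class__.__name__, op_time)
            # log.debug("Current context:\n%s", result)
            return result
        return wrapper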
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
index 73bb813..fc3ead8 100644
--- a/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/graph_index_utils.py
@@ -22,6 +22,7 @@ import traceback
from typing import Dict, Any, Union
import gradio as gr
+
from .hugegraph_utils import get_hg_client, clean_hg_data
from .log import log
from .vector_index_utils import read_documents
@@ -86,7 +87,7 @@ def extract_graph(input_file, input_text, schema, example_prompt) -> str:
def fit_vid_index():
builder = KgBuilder(LLMs().get_llm(), Embeddings().get_embedding(), get_hg_client())
builder.fetch_graph_data().build_vertex_id_semantic_index()
- log.debug(builder.operators)
+ log.debug("Operators: %s", builder.operators)
try:
context = builder.run()
removed_num = context["removed_vid_vector_num"]