imbajin commented on code in PR #282:
URL: 
https://github.com/apache/incubator-hugegraph-ai/pull/282#discussion_r2444790265


##########
hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py:
##########
@@ -55,48 +57,122 @@ def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
             self._llm = LLMs().get_extract_llm()
             assert isinstance(self._llm, BaseLLM), "Invalid LLM Object."
 
-        self._language = context.get("language", self._language).lower()
-        self._max_keywords = context.get("max_keywords", self._max_keywords)
+        # Use English by default
+        self._language = "chinese" if self._language == "cn" else "english"
+        max_keyword_num = context.get("max_keywords", self._max_keywords)
+        try:
+            max_keyword_num = int(max_keyword_num)
+        except (TypeError, ValueError):
+            max_keyword_num = self._max_keywords
+        self._max_keywords = max(1, max_keyword_num)
+
+        method = (context.get("extract_method", self._extract_method) or 
"LLM").strip().lower()
+        if method == "llm":
+            # LLM method
+            ranks = self._extract_with_llm()
+        elif method == "textrank":
+            # TextRank method
+            ranks = self._extract_with_textrank()
+        elif method == "hybrid":
+            # Hybrid method
+            ranks = self._extract_with_hybrid()
+        else:
+            log.warning("Invalid extract_method %s", method)
+            raise ValueError(f"Invalid extract_method: {method}")
+
+        keywords = [] if not ranks else sorted(ranks, key=ranks.get, 
reverse=True)
+        keywords = [k.replace("'", "") for k in keywords]
+        context["keywords"] = keywords[:self._max_keywords]
+        log.info("User Query: %s\nKeywords: %s", self._query, 
context["keywords"])
+
+        # extracting keywords & expanding synonyms increase the call count by 1
+        context["call_count"] = context.get("call_count", 0) + 1
+        return context
 
+    def _extract_with_llm(self) -> Dict[str, float]:
         prompt_run = f"{self._extract_template.format(question=self._query, 
max_keywords=self._max_keywords)}"
         start_time = time.perf_counter()
         response = self._llm.generate(prompt=prompt_run)
         end_time = time.perf_counter()
-        log.debug("Keyword extraction time: %.2f seconds", end_time - 
start_time)
-
+        log.debug("LLM Keyword extraction time: %.2f seconds", end_time - 
start_time)
         keywords = self._extract_keywords_from_response(
             response=response, lowercase=False, start_token="KEYWORDS:"
         )
-        keywords = {k.replace("'", "") for k in keywords}
-        context["keywords"] = list(keywords)
-        log.info("User Query: %s\nKeywords: %s", self._query, 
context["keywords"])
+        return keywords
 
-        # extracting keywords & expanding synonyms increase the call count by 1
-        context["call_count"] = context.get("call_count", 0) + 1
-        return context
+    def _extract_with_textrank(self) -> Dict[str, float]:
+        """ TextRank mode extraction """
+        start_time = time.perf_counter()
+        ranks = {}
+        try:
+            ranks = self._textrank_model.extract_keywords(self._query)

Review Comment:
   ‼️ **Critical Issue: Unhandled exception in score parsing**
   
   The code assumes the LLM output is perfectly formatted and does not handle 
malformed responses, so an unexpected response will cause a runtime failure.
   
   **Problem:**
   ```python
   keyword, score = item.split(":")
   llm_ranks[keyword.strip()] = float(score)
   ```
   
   If the LLM returns `keyword1::0.95` (extra colon) or `keyword1` (no colon at all), this will crash.
   
   **Recommendation:**
   Add comprehensive error handling:
   ```python
   try:
       parts = item.split(":", 1)
       if len(parts) != 2:
           log.warning("Skipping malformed item: %s", item)
           continue
       keyword, score_str = parts
       keyword = keyword.strip()
       if not keyword:
           continue
       score = float(score_str.strip())
       if not 0.0 <= score <= 1.0:
           log.warning("Score out of range for %s: %s", keyword, score)
           score = max(0.0, min(1.0, score))
       llm_ranks[keyword] = score
   except (ValueError, AttributeError) as e:
       log.warning("Failed to parse item '%s': %s", item, e)
       continue
   ```



##########
hugegraph-llm/src/hugegraph_llm/operators/document_op/textrank_word_extract.py:
##########
@@ -0,0 +1,151 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from collections import defaultdict
+from typing import Dict
+
+import igraph as ig
+import jieba.posseg as pseg
+import nltk
+import regex
+
+from hugegraph_llm.operators.common_op.nltk_helper import NLTKHelper
+from hugegraph_llm.utils.log import log
+
+
+class MultiLingualTextRank:
+    def __init__(self, keyword_num: int = 5, window_size: int = 3):
+        self.top_k = keyword_num
+        self.window = window_size if 0 < window_size <= 10 else 3
+        self.graph = None
+        self.max_len = 100
+
+        self.pos_filter = {
+            'chinese': ('n', 'nr', 'ns', 'nt', 'nrt', 'nz', 'v', 'vd', 'vn', 
"eng", "j", "l"),
+            'english': ('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBG', 'VBN', 'VBZ')
+        }
+        self.rules = [r"https?://\S+|www\.\S+",
+                      r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
+                      r"\b\w+(?:[-’\']\w+)+\b",
+                      r"\b\d+[,.]\d+\b"]
+
+    def _word_mask(self, text):
+
+        placeholder_id_counter = 0
+        placeholder_map = {}
+
+        def _create_placeholder(match_obj):
+            nonlocal placeholder_id_counter
+            original_word = match_obj.group(0)
+            _placeholder = f" __shieldword_{placeholder_id_counter}__ "
+            placeholder_map[_placeholder.strip()] = original_word
+            placeholder_id_counter += 1
+            return _placeholder
+
+        special_regex = regex.compile('|'.join(self.rules), regex.V1)

Review Comment:
   ⚠️ **Performance Issue: Repeated regex compilation**
   
   The regex pattern is recompiled on every call to `_word_mask`, which is 
inefficient for repeated extractions.
   
   **Recommendation:**
   Compile once during initialization:
   ```python
   def __init__(self, keyword_num: int = 5, window_size: int = 3):
       # ... existing code ...
       self.rules = [r"https?://\S+|www\.\S+", ...]
       self.special_regex = regex.compile('|'.join(self.rules), regex.V1)
   
   def _word_mask(self, text):
       # ... existing code ...
       text = self.special_regex.sub(_create_placeholder, text)
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to