This is an automated email from the ASF dual-hosted git repository.
jin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
The following commit(s) were added to refs/heads/main by this push:
new 143e29f feat: add llm wenxinyiyan & config util & spo_triple_extract
(#27)
143e29f is described below
commit 143e29ff22bb4b239577ad0023ff0312a47c4461
Author: Simon Cheung <[email protected]>
AuthorDate: Wed Jan 24 13:26:57 2024 +0800
feat: add llm wenxinyiyan & config util & spo_triple_extract (#27)
1. add llm wenxinyiyan
2. add spo_triple_extract & CommitSPOToKg
3. add config util
4. code style format
5. Update README.md
---------
Co-authored-by: imbajin <[email protected]>
---
.github/workflows/pylint.yml | 5 +-
README.md | 7 +
hugegraph-llm/examples/build_kg_test.py | 23 +-
hugegraph-llm/examples/graph_rag_test.py | 62 ++---
hugegraph-llm/requirements.txt | 2 +-
.../utils_op/__init__.py => config/config.ini} | 15 +-
.../{operators/utils_op => }/__init__.py | 3 -
hugegraph-llm/src/hugegraph_llm/llms/base.py | 10 +-
hugegraph-llm/src/hugegraph_llm/llms/ernie_bot.py | 87 +++++++
hugegraph-llm/src/hugegraph_llm/llms/init_llm.py | 44 ++++
.../llms/{openai_llm.py => openai.py} | 11 +-
.../hugegraph_llm/operators/build_kg_operator.py | 65 -----
.../operators/{utils_op => common_op}/__init__.py | 3 -
.../{utils_op => common_op}/nltk_helper.py | 5 +-
.../__init__.py => common_op/print_result.py} | 12 +-
.../{graph_rag_operator.py => graph_rag_task.py} | 30 +--
...commit_data_to_kg.py => commit_to_hugegraph.py} | 50 +++-
.../operators/hugegraph_op/graph_rag_query.py | 40 ++-
.../operators/kg_construction_task.py | 66 +++++
.../operators/llm_op/answer_synthesize.py | 30 +--
.../hugegraph_llm/operators/llm_op/info_extract.py | 274 +++++++++++++++++++++
.../operators/llm_op/keyword_extract.py | 21 +-
.../operators/llm_op/parse_text_to_data.py | 208 ----------------
.../operators/llm_op/unstructured_data_utils.py | 1 +
.../{operators/utils_op => utils}/__init__.py | 3 -
hugegraph-llm/src/hugegraph_llm/utils/config.py | 68 +++++
.../utils_op/__init__.py => utils/constants.py} | 4 +-
.../src/pyhugegraph/api/graph.py | 28 ++-
.../src/pyhugegraph/api/graphs.py | 6 +-
.../src/pyhugegraph/api/gremlin.py | 6 +-
.../src/pyhugegraph/api/variable.py | 26 +-
.../src/pyhugegraph/structure/gremlin_data.py | 10 +-
.../src/tests/api/test_gremlin.py | 33 ++-
.../src/tests/api/test_variable.py | 31 ++-
hugegraph-python-client/src/tests/client_utils.py | 37 +--
style/code_format_and_analysis.sh | 42 ++++
pylint.conf => style/pylint.conf | 0
37 files changed, 868 insertions(+), 500 deletions(-)
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index 7467c4f..8ccaf8d 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -26,7 +26,4 @@ jobs:
pip install -r ./hugegraph-python-client/requirements.txt
- name: Analysing the code with pylint
run: |
- export
PYTHONPATH=$(pwd)/hugegraph-llm/src:$(pwd)/hugegraph-python-client/src
- echo ${PYTHONPATH}
- pylint --rcfile=./pylint.conf hugegraph-llm
- pylint --rcfile=./pylint.conf hugegraph-python-client
+ bash ./style/code_format_and_analysis.sh -p
diff --git a/README.md b/README.md
index de99ab9..3209f58 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,13 @@ to seamlessly connect with third-party graph-related ML
frameworks.
It is used to define graph structures and perform CRUD operations on graph
data. Both the `hugegraph-llm` and `hugegraph-ml`
modules will depend on this foundational library.
+## Contributing
+- Contributions to HugeGraph are welcome; please see the [Guidelines](https://hugegraph.apache.org/docs/contribution-guidelines/) for more information.
+- Note: It's recommended to use [GitHub Desktop](https://desktop.github.com/)
to greatly simplify the PR and commit process.
+- Code format: Please run
[`./style/code_format_and_analysis.sh`](style/code_format_and_analysis.sh) to
format your code before submitting a PR.
+- Thank you to all the people who have already contributed to HugeGraph!
+
+[contributors graph](https://github.com/apache/incubator-hugegraph-ai/graphs/contributors)
## License
hugegraph-ai is licensed under `Apache 2.0` License.
diff --git a/hugegraph-llm/examples/build_kg_test.py
b/hugegraph-llm/examples/build_kg_test.py
index ece0048..d9b56f6 100644
--- a/hugegraph-llm/examples/build_kg_test.py
+++ b/hugegraph-llm/examples/build_kg_test.py
@@ -16,21 +16,12 @@
# under the License.
-import os
-from hugegraph_llm.operators.build_kg_operator import KgBuilder
-from hugegraph_llm.llms.openai_llm import OpenAIChat
+from hugegraph_llm.llms.init_llm import LLMs
+from hugegraph_llm.operators.kg_construction_task import KgBuilder
-if __name__ == "__main__":
- # If you need a proxy to access OpenAI's API, please set your HTTP proxy
here
- os.environ["http_proxy"] = ""
- os.environ["https_proxy"] = ""
- API_KEY = ""
- default_llm = OpenAIChat(
- api_key=API_KEY,
- model_name="gpt-3.5-turbo-16k",
- max_tokens=4000,
- )
+if __name__ == "__main__":
+ default_llm = LLMs().get_llm()
TEXT = (
"Meet Sarah, a 30-year-old attorney, and her roommate, James, whom
she's shared a home with"
" since 2010. James, in his professional life, works as a journalist.
Additionally, Sarah"
@@ -41,8 +32,11 @@ if __name__ == "__main__":
" varied interests and experiences."
)
builder = KgBuilder(default_llm)
+
+ # spo triple extract
+
builder.extract_spo_triple(TEXT).print_result().commit_to_hugegraph(spo=True).run()
# build kg with only text
-
builder.parse_text_to_data(TEXT).disambiguate_data().commit_data_to_kg().run()
+
builder.extract_nodes_relationships(TEXT).disambiguate_word_sense().commit_to_hugegraph().run()
# build kg with text and schemas
nodes_schemas = [
{
@@ -74,6 +68,7 @@ if __name__ == "__main__":
"properties": {},
},
]
+
(
builder.parse_text_to_data_with_schemas(TEXT, nodes_schemas,
relationships_schemas)
.disambiguate_data_with_schemas()
diff --git a/hugegraph-llm/examples/graph_rag_test.py
b/hugegraph-llm/examples/graph_rag_test.py
index 5c163fe..bbd6862 100644
--- a/hugegraph-llm/examples/graph_rag_test.py
+++ b/hugegraph-llm/examples/graph_rag_test.py
@@ -18,19 +18,18 @@
import os
-from hugegraph_llm.operators.graph_rag_operator import GraphRAG
+from hugegraph_llm.operators.graph_rag_task import GraphRAG
from pyhugegraph.client import PyHugeClient
def prepare_data():
- client = PyHugeClient(
- "127.0.0.1", 18080, "hugegraph", "admin", "admin"
- )
+ client = PyHugeClient("127.0.0.1", 8080, "hugegraph", "admin", "admin")
schema = client.schema()
schema.propertyKey("name").asText().ifNotExist().create()
schema.propertyKey("birthDate").asText().ifNotExist().create()
- schema.vertexLabel("Person").properties("name", "birthDate") \
- .useCustomizeStringId().ifNotExist().create()
+ schema.vertexLabel("Person").properties(
+ "name", "birthDate"
+ ).useCustomizeStringId().ifNotExist().create()
schema.vertexLabel("Movie").properties("name").useCustomizeStringId().ifNotExist().create()
schema.indexLabel("PersonByName").onV("Person").by("name").secondary().ifNotExist().create()
schema.indexLabel("MovieByName").onV("Movie").by("name").secondary().ifNotExist().create()
@@ -39,11 +38,17 @@ def prepare_data():
graph = client.graph()
graph.addVertex("Person", {"name": "Al Pacino", "birthDate":
"1940-04-25"}, id="Al Pacino")
graph.addVertex(
- "Person", {"name": "Robert De Niro", "birthDate": "1943-08-17"},
id="Robert De Niro")
+ "Person",
+ {"name": "Robert De Niro", "birthDate": "1943-08-17"},
+ id="Robert De Niro",
+ )
graph.addVertex("Movie", {"name": "The Godfather"}, id="The Godfather")
graph.addVertex("Movie", {"name": "The Godfather Part II"}, id="The
Godfather Part II")
- graph.addVertex("Movie", {"name": "The Godfather Coda The Death of Michael
Corleone"},
- id="The Godfather Coda The Death of Michael Corleone")
+ graph.addVertex(
+ "Movie",
+ {"name": "The Godfather Coda The Death of Michael Corleone"},
+ id="The Godfather Coda The Death of Michael Corleone",
+ )
graph.addEdge("ActedIn", "Al Pacino", "The Godfather", {})
graph.addEdge("ActedIn", "Al Pacino", "The Godfather Part II", {})
@@ -53,7 +58,7 @@ def prepare_data():
graph.close()
-if __name__ == '__main__':
+if __name__ == "__main__":
os.environ["http_proxy"] = ""
os.environ["https_proxy"] = ""
os.environ["OPENAI_API_KEY"] = ""
@@ -70,45 +75,40 @@ if __name__ == '__main__':
"user": "admin", # default to "admin" if not set
"pwd": "admin", # default to "admin" if not set
"graph": "hugegraph", # default to "hugegraph" if not set
-
# query question
"query": "Tell me about Al Pacino.", # must be set
-
# keywords extraction
"max_keywords": 5, # default to 5 if not set
"language": "english", # default to "english" if not set
-
# graph rag query
"prop_to_match": "name", # default to None if not set
"max_deep": 2, # default to 2 if not set
"max_items": 30, # default to 30 if not set
-
# print intermediate processes result
"verbose": True, # default to False if not set
}
- result = graph_rag \
- .extract_keyword() \
- .query_graph_for_rag() \
- .synthesize_answer() \
- .run(**context)
+ result =
graph_rag.extract_keyword().query_graph_for_rag().synthesize_answer().run(**context)
print(f"Query:\n- {context['query']}")
print(f"Answer:\n- {result['answer']}")
print("--------------------------------------------------------")
# configure operator with parameters
- graph_client = PyHugeClient(
- "127.0.0.1", 18080, "hugegraph", "admin", "admin"
+ graph_client = PyHugeClient("127.0.0.1", 18080, "hugegraph", "admin",
"admin")
+ result = (
+ graph_rag.extract_keyword(
+ text="Tell me about Al Pacino.",
+ max_keywords=5, # default to 5 if not set
+ language="english", # default to "english" if not set
+ )
+ .query_graph_for_rag(
+ graph_client=graph_client,
+ max_deep=2, # default to 2 if not set
+ max_items=30, # default to 30 if not set
+ prop_to_match=None, # default to None if not set
+ )
+ .synthesize_answer()
+ .run(verbose=True)
)
- result = graph_rag.extract_keyword(
- text="Tell me about Al Pacino.",
- max_keywords=5, # default to 5 if not set
- language="english", # default to "english" if not set
- ).query_graph_for_rag(
- graph_client=graph_client,
- max_deep=2, # default to 2 if not set
- max_items=30, # default to 30 if not set
- prop_to_match=None, # default to None if not set
- ).synthesize_answer().run(verbose=True)
print("Query:\n- Tell me about Al Pacino.")
print(f"Answer:\n- {result['answer']}")
diff --git a/hugegraph-llm/requirements.txt b/hugegraph-llm/requirements.txt
index e8cd6d0..03bba7f 100644
--- a/hugegraph-llm/requirements.txt
+++ b/hugegraph-llm/requirements.txt
@@ -1,4 +1,4 @@
openai==0.28.1
retry==0.9.2
tiktoken==0.5.1
-nltk==3.8.1
\ No newline at end of file
+nltk==3.8.1
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
b/hugegraph-llm/src/config/config.ini
similarity index 73%
copy from hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
copy to hugegraph-llm/src/config/config.ini
index 309b3ca..a45d1f0 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
+++ b/hugegraph-llm/src/config/config.ini
@@ -16,4 +16,17 @@
# under the License.
-from .nltk_helper import nltk_helper
+[hugegraph]
+ip = 127.0.0.1
+port = 8080
+user = admin
+pwd = admin
+graph = hugegraph
+
+[llm]
+type = openai
+api_key = xxx
+secret_key = xxx
+ernie_url =
https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/eb-instant?access_token=
+model_name = gpt-3.5-turbo-16k
+max_token = 4000
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
b/hugegraph-llm/src/hugegraph_llm/__init__.py
similarity index 95%
copy from hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
copy to hugegraph-llm/src/hugegraph_llm/__init__.py
index 309b3ca..13a8339 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
+++ b/hugegraph-llm/src/hugegraph_llm/__init__.py
@@ -14,6 +14,3 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-
-
-from .nltk_helper import nltk_helper
diff --git a/hugegraph-llm/src/hugegraph_llm/llms/base.py
b/hugegraph-llm/src/hugegraph_llm/llms/base.py
index 17b51be..0e05156 100644
--- a/hugegraph-llm/src/hugegraph_llm/llms/base.py
+++ b/hugegraph-llm/src/hugegraph_llm/llms/base.py
@@ -32,7 +32,7 @@ class BaseLLM(ABC):
"""Comment"""
@abstractmethod
- async def generate_streaming(
+ def generate_streaming(
self,
messages: Optional[List[Dict[str, Any]]] = None,
prompt: Optional[str] = None,
@@ -41,14 +41,18 @@ class BaseLLM(ABC):
"""Comment"""
@abstractmethod
- async def num_tokens_from_string(
+ def num_tokens_from_string(
self,
string: str,
) -> str:
"""Given a string returns the number of tokens the given string
consists of"""
@abstractmethod
- async def max_allowed_token_length(
+ def max_allowed_token_length(
self,
) -> int:
"""Returns the maximum number of tokens the LLM can handle"""
+
+ @abstractmethod
+ def get_llm_type(self) -> str:
+ """Returns the type of the LLM"""
diff --git a/hugegraph-llm/src/hugegraph_llm/llms/ernie_bot.py
b/hugegraph-llm/src/hugegraph_llm/llms/ernie_bot.py
new file mode 100644
index 0000000..1ab5613
--- /dev/null
+++ b/hugegraph-llm/src/hugegraph_llm/llms/ernie_bot.py
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import json
+from typing import Optional, List, Dict, Any, Callable
+
+import requests
+from retry import retry
+
+from hugegraph_llm.llms.base import BaseLLM
+from hugegraph_llm.utils.config import Config
+from hugegraph_llm.utils.constants import Constants
+
+
+class ErnieBotClient(BaseLLM):
+ def __init__(self):
+ self.c = Config(section=Constants.LLM_CONFIG)
+ self.api_key = self.c.get_llm_api_key()
+ self.secret_key = self.c.get_llm_secret_key()
+ self.base_url = self.c.get_llm_ernie_url()
+ self.get_access_token()
+
+ def get_access_token(self):
+ url = "https://aip.baidubce.com/oauth/2.0/token"
+ params = {
+ "grant_type": "client_credentials",
+ "client_id": self.api_key,
+ "client_secret": self.secret_key,
+ }
+ return str(requests.post(url, params=params,
timeout=2).json().get("access_token"))
+
+ @retry(tries=3, delay=1)
+ def generate(
+ self,
+ messages: Optional[List[Dict[str, Any]]] = None,
+ prompt: Optional[str] = None,
+ ) -> str:
+ if messages is None:
+ assert prompt is not None, "Messages or prompt must be provided."
+ messages = [{"role": "user", "content": prompt}]
+ url = self.base_url + self.get_access_token()
+ # parameter check failed, temperature range is (0, 1.0]
+ payload = json.dumps({"messages": messages, "temperature":
0.00000000001})
+ headers = {"Content-Type": "application/json"}
+ response = requests.request("POST", url, headers=headers,
data=payload, timeout=10)
+ if response.status_code != 200:
+ raise Exception(
+ f"Request failed with code {response.status_code}, message:
{response.text}"
+ )
+ return response.text
+
+ def generate_streaming(
+ self,
+ messages: Optional[List[Dict[str, Any]]] = None,
+ prompt: Optional[str] = None,
+ on_token_callback: Callable = None,
+ ) -> str:
+ return self.generate(messages, prompt)
+
+ def num_tokens_from_string(self, string: str) -> int:
+ return len(string)
+
+ def max_allowed_token_length(self) -> int:
+ return 6000
+
+ def get_llm_type(self) -> str:
+ return "ernie"
+
+
+if __name__ == "__main__":
+ client = ErnieBotClient()
+ print(client.generate(prompt="What is the capital of China?"))
+ print(client.generate(messages=[{"role": "user", "content": "What is the
capital of China?"}]))
diff --git a/hugegraph-llm/src/hugegraph_llm/llms/init_llm.py
b/hugegraph-llm/src/hugegraph_llm/llms/init_llm.py
new file mode 100644
index 0000000..f8e7138
--- /dev/null
+++ b/hugegraph-llm/src/hugegraph_llm/llms/init_llm.py
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from hugegraph_llm.llms.openai import OpenAIChat
+from hugegraph_llm.llms.ernie_bot import ErnieBotClient
+from hugegraph_llm.utils.config import Config
+from hugegraph_llm.utils.constants import Constants
+
+
+class LLMs:
+ def __init__(self):
+ self.config = Config(section=Constants.LLM_CONFIG)
+ self.config.get_llm_type()
+
+ def get_llm(self):
+ if self.config.get_llm_type() == "ernie":
+ return ErnieBotClient()
+ if self.config.get_llm_type() == "openai":
+ return OpenAIChat(
+ api_key=self.config.get_llm_api_key(),
+ model_name=self.config.get_llm_model_name(),
+ max_tokens=self.config.get_llm_max_token(),
+ )
+ raise Exception("llm type is not supported !")
+
+
+if __name__ == "__main__":
+ client = LLMs().get_llm()
+ print(client.generate(prompt="What is the capital of China?"))
+ print(client.generate(messages=[{"role": "user", "content": "What is the
capital of China?"}]))
diff --git a/hugegraph-llm/src/hugegraph_llm/llms/openai_llm.py
b/hugegraph-llm/src/hugegraph_llm/llms/openai.py
similarity index 94%
rename from hugegraph-llm/src/hugegraph_llm/llms/openai_llm.py
rename to hugegraph-llm/src/hugegraph_llm/llms/openai.py
index 47735b6..c9b753c 100644
--- a/hugegraph-llm/src/hugegraph_llm/llms/openai_llm.py
+++ b/hugegraph-llm/src/hugegraph_llm/llms/openai.py
@@ -68,7 +68,7 @@ class OpenAIChat(BaseLLM):
print(f"Retrying LLM call {e}")
raise e
- async def generate_streaming(
+ def generate_streaming(
self,
messages: Optional[List[Dict[str, Any]]] = None,
prompt: Optional[str] = None,
@@ -91,16 +91,19 @@ class OpenAIChat(BaseLLM):
delta = message["choices"][0]["delta"]
if "content" in delta:
result += delta["content"]
- await on_token_callback(message)
+ on_token_callback(message)
return result
- async def num_tokens_from_string(self, string: str) -> int:
+ def num_tokens_from_string(self, string: str) -> int:
"""Get token count from string."""
encoding = tiktoken.encoding_for_model(self.model)
num_tokens = len(encoding.encode(string))
return num_tokens
- async def max_allowed_token_length(self) -> int:
+ def max_allowed_token_length(self) -> int:
"""Get max-allowed token length"""
# TODO: list all models and their max tokens from api
return 2049
+
+ def get_llm_type(self) -> str:
+ return "openai"
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/build_kg_operator.py
b/hugegraph-llm/src/hugegraph_llm/operators/build_kg_operator.py
deleted file mode 100644
index 7e4b51c..0000000
--- a/hugegraph-llm/src/hugegraph_llm/operators/build_kg_operator.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-from hugegraph_llm.llms.base import BaseLLM
-from hugegraph_llm.operators.hugegraph_op.commit_data_to_kg import
CommitDataToKg
-from hugegraph_llm.operators.llm_op.disambiguate_data import DisambiguateData
-from hugegraph_llm.operators.llm_op.parse_text_to_data import (
- ParseTextToData,
- ParseTextToDataWithSchemas,
-)
-
-
-class KgBuilder:
- def __init__(self, llm: BaseLLM):
- self.parse_text_to_kg = []
- self.llm = llm
- self.data = {}
-
- def parse_text_to_data(self, text: str):
- self.parse_text_to_kg.append(ParseTextToData(llm=self.llm, text=text))
- return self
-
- def parse_text_to_data_with_schemas(self, text: str, nodes_schemas,
relationships_schemas):
- self.parse_text_to_kg.append(
- ParseTextToDataWithSchemas(
- llm=self.llm,
- text=text,
- nodes_schema=nodes_schemas,
- relationships_schemas=relationships_schemas,
- )
- )
- return self
-
- def disambiguate_data(self):
- self.parse_text_to_kg.append(DisambiguateData(llm=self.llm,
is_user_schema=False))
- return self
-
- def disambiguate_data_with_schemas(self):
- self.parse_text_to_kg.append(DisambiguateData(llm=self.llm,
is_user_schema=True))
- return self
-
- def commit_data_to_kg(self):
- self.parse_text_to_kg.append(CommitDataToKg())
- return self
-
- def run(self):
- result = ""
- for i in self.parse_text_to_kg:
- result = i.run(result)
- print(result)
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
b/hugegraph-llm/src/hugegraph_llm/operators/common_op/__init__.py
similarity index 95%
copy from hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
copy to hugegraph-llm/src/hugegraph_llm/operators/common_op/__init__.py
index 309b3ca..13a8339 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/common_op/__init__.py
@@ -14,6 +14,3 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-
-
-from .nltk_helper import nltk_helper
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/nltk_helper.py
b/hugegraph-llm/src/hugegraph_llm/operators/common_op/nltk_helper.py
similarity index 97%
rename from hugegraph-llm/src/hugegraph_llm/operators/utils_op/nltk_helper.py
rename to hugegraph-llm/src/hugegraph_llm/operators/common_op/nltk_helper.py
index 35aa921..944328e 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/nltk_helper.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/common_op/nltk_helper.py
@@ -26,7 +26,6 @@ from nltk.corpus import stopwords
class NLTKHelper:
-
_stopwords: Dict[str, Optional[List[str]]] = {
"english": None,
"chinese": None,
@@ -69,9 +68,7 @@ class NLTKHelper:
# Windows (hopefully)
else:
- local = os.environ.get("LOCALAPPDATA", None) or os.path.expanduser(
- "~\\AppData\\Local"
- )
+ local = os.environ.get("LOCALAPPDATA", None) or
os.path.expanduser("~\\AppData\\Local")
path = Path(local, "hugegraph_llm")
if not os.path.exists(path):
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
b/hugegraph-llm/src/hugegraph_llm/operators/common_op/print_result.py
similarity index 78%
copy from hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
copy to hugegraph-llm/src/hugegraph_llm/operators/common_op/print_result.py
index 309b3ca..d08de78 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/common_op/print_result.py
@@ -16,4 +16,14 @@
# under the License.
-from .nltk_helper import nltk_helper
+from typing import Any
+
+
+class PrintResult:
+ def __init__(self):
+ self.result = None
+
+ def run(self, data: Any) -> Any:
+ self.result = data
+ print(self.result)
+ return self.result
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_operator.py
b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
similarity index 80%
rename from hugegraph-llm/src/hugegraph_llm/operators/graph_rag_operator.py
rename to hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
index e3c27bd..0088119 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_operator.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/graph_rag_task.py
@@ -19,7 +19,7 @@
from typing import Dict, Any, Optional, List
from hugegraph_llm.llms.base import BaseLLM
-from hugegraph_llm.llms.openai_llm import OpenAIChat
+from hugegraph_llm.llms.init_llm import LLMs
from hugegraph_llm.operators.hugegraph_op.graph_rag_query import GraphRAGQuery
from hugegraph_llm.operators.llm_op.answer_synthesize import AnswerSynthesize
from hugegraph_llm.operators.llm_op.keyword_extract import KeywordExtract
@@ -28,16 +28,16 @@ from pyhugegraph.client import PyHugeClient
class GraphRAG:
def __init__(self, llm: Optional[BaseLLM] = None):
- self._llm = llm or OpenAIChat()
+ self._llm = llm or LLMs().get_llm()
self._operators: List[Any] = []
def extract_keyword(
- self,
- text: Optional[str] = None,
- max_keywords: int = 5,
- language: str = 'english',
- extract_template: Optional[str] = None,
- expand_template: Optional[str] = None,
+ self,
+ text: Optional[str] = None,
+ max_keywords: int = 5,
+ language: str = "english",
+ extract_template: Optional[str] = None,
+ expand_template: Optional[str] = None,
):
self._operators.append(
KeywordExtract(
@@ -51,11 +51,11 @@ class GraphRAG:
return self
def query_graph_for_rag(
- self,
- graph_client: Optional[PyHugeClient] = None,
- max_deep: int = 2,
- max_items: int = 30,
- prop_to_match: Optional[str] = None,
+ self,
+ graph_client: Optional[PyHugeClient] = None,
+ max_deep: int = 2,
+ max_items: int = 30,
+ prop_to_match: Optional[str] = None,
):
self._operators.append(
GraphRAGQuery(
@@ -68,8 +68,8 @@ class GraphRAG:
return self
def synthesize_answer(
- self,
- prompt_template: Optional[str] = None,
+ self,
+ prompt_template: Optional[str] = None,
):
self._operators.append(
AnswerSynthesize(
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_data_to_kg.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
similarity index 80%
rename from
hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_data_to_kg.py
rename to
hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
index 6d6764b..350fb95 100644
---
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_data_to_kg.py
+++
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_to_hugegraph.py
@@ -16,7 +16,8 @@
# under the License.
-import os
+from hugegraph_llm.utils.config import Config
+from hugegraph_llm.utils.constants import Constants
from pyhugegraph.client import PyHugeClient
@@ -144,15 +145,17 @@ def generate_relationships(data):
class CommitDataToKg:
def __init__(self):
+ config = Config(section=Constants.HUGEGRAPH_CONFIG)
self.client = PyHugeClient(
- "127.0.0.1", "8080", user="admin", pwd="admin", graph="hugegraph"
+ config.get_graph_ip(),
+ config.get_graph_port(),
+ config.get_graph_user(),
+ config.get_graph_pwd(),
+ config.get_graph_name(),
)
self.schema = self.client.schema()
def run(self, data: dict):
- # If you are using a http proxy, you can run the following code to
unset http proxy
- os.environ.pop("http_proxy")
- os.environ.pop("https_proxy")
nodes = data["nodes"]
relationships = data["relationships"]
nodes_schemas = data["nodes_schemas"]
@@ -186,3 +189,40 @@ class CommitDataToKg:
relationships_schemas = generate_relationships(new_relationships)
for relationship in relationships_schemas:
exec(relationship)
+
+
+class CommitSpoToKg:
+ def __init__(self):
+ config = Config(section=Constants.HUGEGRAPH_CONFIG)
+ self.client = PyHugeClient(
+ config.get_graph_ip(),
+ config.get_graph_port(),
+ config.get_graph_name(),
+ config.get_graph_user(),
+ config.get_graph_pwd(),
+ )
+ self.schema = self.client.schema()
+
+ def run(self, data: dict):
+ self.schema.propertyKey("name").asText().ifNotExist().create()
+ self.schema.vertexLabel("vertex").useCustomizeStringId().properties(
+ "name"
+ ).ifNotExist().create()
+
self.schema.edgeLabel("edge").sourceLabel("vertex").targetLabel("vertex").properties(
+ "name"
+ ).ifNotExist().create()
+
+ self.schema.indexLabel("vertexByName").onV("vertex").by(
+ "name"
+ ).secondary().ifNotExist().create()
+ self.schema.indexLabel("edgeByName").onE("edge").by(
+ "name"
+ ).secondary().ifNotExist().create()
+
+ for item in data:
+ s = item[0]
+ p = item[1]
+ o = item[2]
+ s_id = self.client.graph().addVertex("vertex", {"name": s},
id=s).id
+ t_id = self.client.graph().addVertex("vertex", {"name": o},
id=o).id
+ self.client.graph().addEdge("edge", s_id, t_id, {"name": p})
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
index b50d1c1..a59acc1 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py
@@ -64,11 +64,11 @@ class GraphRAGQuery:
)
def __init__(
- self,
- client: Optional[PyHugeClient] = None,
- max_deep: int = 2,
- max_items: int = 30,
- prop_to_match: Optional[str] = None,
+ self,
+ client: Optional[PyHugeClient] = None,
+ max_deep: int = 2,
+ max_items: int = 30,
+ prop_to_match: Optional[str] = None,
):
self._client = client
self._max_deep = max_deep
@@ -86,9 +86,7 @@ class GraphRAGQuery:
graph = context.get("graph") or "hugegraph"
user = context.get("user") or "admin"
pwd = context.get("pwd") or "admin"
- self._client = PyHugeClient(
- ip=ip, port=port, graph=graph, user=user, pwd=pwd
- )
+ self._client = PyHugeClient(ip=ip, port=port, graph=graph,
user=user, pwd=pwd)
assert self._client is not None, "No graph for query."
keywords = context.get("keywords")
@@ -132,9 +130,7 @@ class GraphRAGQuery:
)
result: List[Any] =
self._client.gremlin().exec(gremlin=rag_gremlin_query)["data"]
- knowledge: Set[str] = self._format_knowledge_from_query_result(
- query_result=result
- )
+ knowledge: Set[str] =
self._format_knowledge_from_query_result(query_result=result)
context["synthesize_context_body"] = list(knowledge)
context["synthesize_context_head"] = (
@@ -152,8 +148,8 @@ class GraphRAGQuery:
return context
def _format_knowledge_from_query_result(
- self,
- query_result: List[Any],
+ self,
+ query_result: List[Any],
) -> Set[str]:
use_id_to_match = self._prop_to_match is None
knowledge = set()
@@ -166,9 +162,7 @@ class GraphRAGQuery:
for i, item in enumerate(raw_flat_rel):
if i % 2 == 0:
matched_str = (
- item["id"]
- if use_id_to_match
- else item["props"][self._prop_to_match]
+ item["id"] if use_id_to_match else
item["props"][self._prop_to_match]
)
if matched_str in node_cache:
flat_rel = flat_rel[:-prior_edge_str_len]
@@ -199,21 +193,17 @@ class GraphRAGQuery:
def _extract_labels_from_schema(self) -> Tuple[List[str], List[str]]:
schema = self._get_graph_schema()
node_props_str, edge_props_str = schema.split("\n")[:2]
- node_props_str = (
- node_props_str[len("Node properties: "):].strip("[").strip("]")
- )
- edge_props_str = (
- edge_props_str[len("Edge properties: "):].strip("[").strip("]")
- )
+ node_props_str = node_props_str[len("Node properties: ")
:].strip("[").strip("]")
+ edge_props_str = edge_props_str[len("Edge properties: ")
:].strip("[").strip("]")
node_labels = self._extract_label_names(node_props_str)
edge_labels = self._extract_label_names(edge_props_str)
return node_labels, edge_labels
@staticmethod
def _extract_label_names(
- source: str,
- head: str = "name: ",
- tail: str = ", ",
+ source: str,
+ head: str = "name: ",
+ tail: str = ", ",
) -> List[str]:
result = []
for s in source.split(head):
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
b/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
new file mode 100644
index 0000000..5b8d019
--- /dev/null
+++ b/hugegraph-llm/src/hugegraph_llm/operators/kg_construction_task.py
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+from hugegraph_llm.llms.base import BaseLLM
+from hugegraph_llm.operators.common_op.print_result import PrintResult
+from hugegraph_llm.operators.hugegraph_op.commit_to_hugegraph import (
+ CommitDataToKg,
+ CommitSpoToKg,
+)
+from hugegraph_llm.operators.llm_op.disambiguate_data import DisambiguateData
+from hugegraph_llm.operators.llm_op.info_extract import InfoExtract
+
+
+class KgBuilder:
+ def __init__(self, llm: BaseLLM):
+ self.operators = []
+ self.llm = llm
+ self.result = None
+
+ def extract_nodes_relationships(
+ self, text: str, nodes_schemas=None, relationships_schemas=None
+ ):
+ if nodes_schemas and relationships_schemas:
+ self.operators.append(InfoExtract(self.llm, text, nodes_schemas,
relationships_schemas))
+ else:
+ self.operators.append(InfoExtract(self.llm, text))
+ return self
+
+ def extract_spo_triple(self, text: str):
+ self.operators.append(InfoExtract(self.llm, text, spo=True))
+ return self
+
+ def disambiguate_word_sense(self, with_schemas=False):
+ self.operators.append(DisambiguateData(self.llm, with_schemas))
+ return self
+
+ def commit_to_hugegraph(self, spo=False):
+ if spo:
+ self.operators.append(CommitSpoToKg())
+ else:
+ self.operators.append(CommitDataToKg())
+ return self
+
+ def print_result(self):
+ self.operators.append(PrintResult())
+ return self
+
+ def run(self):
+ result = ""
+ for operator in self.operators:
+ result = operator.run(result)
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
index 7a24ebc..b08adb3 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/answer_synthesize.py
@@ -19,7 +19,7 @@
from typing import Any, Dict, Optional
from hugegraph_llm.llms.base import BaseLLM
-from hugegraph_llm.llms.openai_llm import OpenAIChat
+from hugegraph_llm.llms.init_llm import LLMs
DEFAULT_ANSWER_SYNTHESIZE_TEMPLATE_TMPL = (
"Context information is below.\n"
@@ -34,18 +34,16 @@ DEFAULT_ANSWER_SYNTHESIZE_TEMPLATE_TMPL = (
class AnswerSynthesize:
def __init__(
- self,
- llm: Optional[BaseLLM] = None,
- prompt_template: Optional[str] = None,
- question: Optional[str] = None,
- context_body: Optional[str] = None,
- context_head: Optional[str] = None,
- context_tail: Optional[str] = None,
+ self,
+ llm: Optional[BaseLLM] = None,
+ prompt_template: Optional[str] = None,
+ question: Optional[str] = None,
+ context_body: Optional[str] = None,
+ context_head: Optional[str] = None,
+ context_tail: Optional[str] = None,
):
self._llm = llm
- self._prompt_template = (
- prompt_template or DEFAULT_ANSWER_SYNTHESIZE_TEMPLATE_TMPL
- )
+ self._prompt_template = prompt_template or
DEFAULT_ANSWER_SYNTHESIZE_TEMPLATE_TMPL
self._question = question
self._context_body = context_body
self._context_head = context_head
@@ -53,7 +51,7 @@ class AnswerSynthesize:
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
if self._llm is None:
- self._llm = context.get("llm") or OpenAIChat()
+ self._llm = context.get("llm") or LLMs().get_llm()
if context.get("llm") is None:
context["llm"] = self._llm
@@ -78,11 +76,9 @@ class AnswerSynthesize:
context_head_str = context.get("synthesize_context_head") or
self._context_head or ""
context_tail_str = context.get("synthesize_context_tail") or
self._context_tail or ""
- context_str = (
- f"{context_head_str}\n"
- f"{context_body_str}\n"
- f"{context_tail_str}"
- ).strip("\n")
+ context_str = (f"{context_head_str}\n" f"{context_body_str}\n"
f"{context_tail_str}").strip(
+ "\n"
+ )
prompt = self._prompt_template.format(
context_str=context_str,
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/info_extract.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/info_extract.py
new file mode 100644
index 0000000..7386fd0
--- /dev/null
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/info_extract.py
@@ -0,0 +1,274 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from typing import List, Any, Dict
+
+from hugegraph_llm.llms.base import BaseLLM
+from hugegraph_llm.operators.llm_op.unstructured_data_utils import (
+ nodes_text_to_list_of_dict,
+ nodes_schemas_text_to_list_of_dict,
+ relationships_schemas_text_to_list_of_dict,
+ relationships_text_to_list_of_dict,
+)
+
+
+def generate_system_message() -> str:
+ return """You are a data scientist working for a company that is building
a graph database.
+ Your task is to extract information from data and convert it into a graph
database. Provide a
+ set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of
relationships in the form
+ [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES] and a set of
NodesSchemas in the form [
+ ENTITY_TYPE, PRIMARY_KEY, PROPERTIES] and a set of RelationshipsSchemas in
the form [
+ ENTITY_TYPE_1, RELATIONSHIP, ENTITY_TYPE_2, PROPERTIES] It is important
that the ENTITY_ID_1
+ and ENTITY_ID_2 exists as nodes with a matching ENTITY_ID. If you can't
pair a relationship
+ with a pair of nodes don't add it. When you find a node or relationship
you want to add try
+ to create a generic TYPE for it that describes the entity you can also
think of it as a label.
+
+ Here is an example The input you will be given: Data: Alice lawyer and is
25 years old and Bob
+ is her roommate since 2001. Bob works as a journalist. Alice owns a the
webpage www.alice.com
+ and Bob owns the webpage www.bob.com. The output you need to provide:
Nodes: ["Alice", "Person",
+ {"age": 25, "occupation": "lawyer", "name": "Alice"}], ["Bob", "Person",
{"occupation":
+ "journalist", "name": "Bob"}], ["alice.com", "Webpage", {"name":
"alice.com",
+ "url": "www.alice.com"}], ["bob.com", "Webpage", {"name": "bob.com",
"url": "www.bob.com"}]
+ Relationships: [{"Person": "Alice"}, "roommate", {"Person": "Bob"},
{"start": 2021}],
+ [{"Person": "Alice"}, "owns", {"Webpage": "alice.com"}, {}], [{"Person":
"Bob"}, "owns",
+ {"Webpage": "bob.com"}, {}] NodesSchemas: ["Person", "name", {"age":
"int",
+ "name": "text", "occupation":
+ "text"}], ["Webpage", "name", {"name": "text", "url": "text"}]
RelationshipsSchemas :["Person",
+ "roommate", "Person", {"start": "int"}], ["Person", "owns", "Webpage",
{}]"""
+
+
+def generate_ernie_prompt_spo(data) -> str:
+ return f"""Extract subject-verb-object (SPO) triples from text strictly
according to the
+ following format, each structure has only three elements: ("vertex_1",
"edge", "vertex_2").
+ for example:
+ Alice lawyer and is 25 years old and Bob is her roommate since 2001. Bob
works as a journalist.
+ Alice owns a the webpage www.alice.com and Bob owns the webpage www.bob.com
+ output:[("Alice", "Age", "25"),("Alice", "Profession", "lawyer"),("Bob",
"Job", "journalist"),
+ ("Alice", "Roommate of", "Bob"),("Alice", "Owns", "http://www.alice.com"),
+ ("Bob", "Owns", "http://www.bob.com")]
+
+ The extracted text is: {data}"""
+
+
+def generate_ernie_message(data) -> str:
+ return (
+ """You are a data scientist working for a company that is building a
graph database.
+ Your task is to extract information from data and convert it into a
graph database. Provide
+ a set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of
relationships in the
+ form [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES] and a set of
NodesSchemas in the
+ form [ENTITY_TYPE, PRIMARY_KEY, PROPERTIES] and a set of
RelationshipsSchemas in the form [
+ ENTITY_TYPE_1, RELATIONSHIP, ENTITY_TYPE_2, PROPERTIES] It is
important that the ENTITY_ID_1
+ and ENTITY_ID_2 exists as nodes with a matching ENTITY_ID. If you
can't pair a relationship
+ with a pair of nodes don't add it. When you find a node or
relationship you want to add try
+ to create a generic TYPE for it that describes the entity you can
also think of it as a
+ label.
+
+ Here is an example The input you will be given: Data: Alice lawyer and
is 25 years old and
+ Bob is her roommate since 2001. Bob works as a journalist. Alice owns
a the webpage
+ www.alice.com and Bob owns the webpage www.bob.com. The output you
need to provide:
+ Nodes: ["Alice", "Person", {"age": 25, "occupation": "lawyer", "name":
"Alice"}],
+ ["Bob", "Person", {"occupation":
+ "journalist", "name": "Bob"}], ["alice.com", "Webpage", {"name":
"alice.com",
+ "url": "www.alice.com"}], ["bob.com", "Webpage", {"name": "bob.com",
"url": "www.bob.com"}]
+ Relationships: [{"Person": "Alice"}, "roommate", {"Person": "Bob"},
{"start": 2021}],
+ [{"Person": "Alice"}, "owns", {"Webpage": "alice.com"}, {}],
[{"Person": "Bob"}, "owns",
+ {"Webpage": "bob.com"}, {}] NodesSchemas: ["Person", "name", {"age":
"int", "name":
+ "text", "occupation": "text"}], ["Webpage", "name", {"name": "text",
"url": "text"}]
+ RelationshipsSchemas :["Person", "roommate", "Person", {"start":
"int"}],
+ ["Person", "owns", "Webpage", {}]
+
+ Now extract information from the following data:
+ """
+ + data
+ )
+
+
+def generate_system_message_with_schemas() -> str:
+ return """You are a data scientist working for a company that is building
a graph database.
+ Your task is to extract information from data and convert it into a graph
database. Provide a
+ set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of
relationships in the form
+ [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES] and a set of
NodesSchemas in the form [
+ ENTITY_TYPE, PRIMARY_KEY, PROPERTIES] and a set of RelationshipsSchemas in
the form [
+ ENTITY_TYPE_1, RELATIONSHIP, ENTITY_TYPE_2, PROPERTIES] It is important
that the ENTITY_ID_1
+ and ENTITY_ID_2 exists as nodes with a matching ENTITY_ID. If you can't
pair a relationship
+ with a pair of nodes don't add it. When you find a node or relationship
you want to add try
+ to create a generic TYPE for it that describes the entity you can also
think of it as a label.
+
+ Here is an example The input you will be given: Data: Alice lawyer and is
25 years old and Bob
+ is her roommate since 2001. Bob works as a journalist. Alice owns a the
webpage www.alice.com
+ and Bob owns the webpage www.bob.com. NodesSchemas: ["Person", "name",
{"age": "int",
+ "name": "text", "occupation": "text"}], ["Webpage", "name", {"name":
"text", "url": "text"}]
+ RelationshipsSchemas :["Person", "roommate", "Person", {"start": "int"}],
["Person", "owns",
+ "Webpage", {}] The output you need to provide: Nodes: ["Alice", "Person",
{"age": 25,
+ "occupation": "lawyer", "name": "Alice"}], ["Bob", "Person",
{"occupation": "journalist",
+ "name": "Bob"}], ["alice.com", "Webpage", {"name": "alice.com", "url":
"www.alice.com"}],
+ ["bob.com", "Webpage", {"name": "bob.com", "url": "www.bob.com"}]
Relationships: [{"Person":
+ "Alice"}, "roommate", {"Person": "Bob"}, {"start": 2021}], [{"Person":
"Alice"}, "owns",
+ {"Webpage": "alice.com"}, {}], [{"Person": "Bob"}, "owns", {"Webpage":
"bob.com"},
+ {}] NodesSchemas: ["Person", "name", {"age": "int", "name": "text",
"occupation": "text"}],
+ ["Webpage", "name", {"name": "text", "url": "text"}] RelationshipsSchemas
:["Person",
+ "roommate", "Person", {"start": "int"}], ["Person", "owns", "Webpage", {}]
+ """
+
+
+def generate_prompt(data) -> str:
+ return f"""
+ Data: {data}
+ """
+
+
+def generate_prompt_with_schemas(data, nodes_schemas, relationships_schemas)
-> str:
+ return f"""
+ Data: {data}
+ NodesSchemas: {nodes_schemas}
+ RelationshipsSchemas: {relationships_schemas}
+ """
+
+
+def split_string(string, max_length) -> List[str]:
+ return [string[i : i + max_length] for i in range(0, len(string),
max_length)]
+
+
+def split_string_to_fit_token_space(
+ llm: BaseLLM, string: str, token_use_per_string: int
+) -> List[str]:
+ allowed_tokens = llm.max_allowed_token_length() - token_use_per_string
+ chunked_data = split_string(string, 500)
+ combined_chunks = []
+ current_chunk = ""
+ for chunk in chunked_data:
+ if (
+ llm.num_tokens_from_string(current_chunk) +
llm.num_tokens_from_string(chunk)
+ < allowed_tokens
+ ):
+ current_chunk += chunk
+ else:
+ combined_chunks.append(current_chunk)
+ current_chunk = chunk
+ combined_chunks.append(current_chunk)
+
+ return combined_chunks
+
+
+def get_spo_from_result(result):
+ res = []
+ for row in result:
+ row = row.replace("\\n", "").replace("\\", "")
+ pattern = r'\("(.*?)", "(.*?)", "(.*?)"\)'
+ res += re.findall(pattern, row)
+ return res
+
+
+def get_nodes_and_relationships_from_result(result):
+ regex = (
+
r"Nodes:\s+(.*?)\s?\s?Relationships:\s+(.*?)\s?\s?NodesSchemas:\s+(.*?)\s?\s?\s?"
+ r"RelationshipsSchemas:\s?\s?(.*)"
+ )
+ internal_regex = r"\[(.*?)\]"
+ nodes = []
+ relationships = []
+ nodes_schemas = []
+ relationships_schemas = []
+ for row in result:
+ row = row.replace("\n", "")
+ parsing = re.search(regex, row, flags=re.S)
+ if parsing is None:
+ continue
+ raw_nodes = str(parsing.group(1))
+ raw_relationships = parsing.group(2)
+ raw_nodes_schemas = parsing.group(3)
+ raw_relationships_schemas = parsing.group(4)
+ nodes.extend(re.findall(internal_regex, raw_nodes))
+ relationships.extend(re.findall(internal_regex, raw_relationships))
+ nodes_schemas.extend(re.findall(internal_regex, raw_nodes_schemas))
+ relationships_schemas.extend(re.findall(internal_regex,
raw_relationships_schemas))
+ result = {
+ "nodes": [],
+ "relationships": [],
+ "nodes_schemas": [],
+ "relationships_schemas": [],
+ }
+ result["nodes"].extend(nodes_text_to_list_of_dict(nodes))
+
result["relationships"].extend(relationships_text_to_list_of_dict(relationships))
+
result["nodes_schemas"].extend(nodes_schemas_text_to_list_of_dict(nodes_schemas))
+ result["relationships_schemas"].extend(
+ relationships_schemas_text_to_list_of_dict(relationships_schemas)
+ )
+ return result
+
+
+class InfoExtract:
+ def __init__(
+ self,
+ llm: BaseLLM,
+ text: str,
+ nodes_schemas=None,
+ relationships_schemas=None,
+ spo=False,
+ ) -> None:
+ self.llm = llm
+ self.text = text
+ self.nodes_schemas = nodes_schemas
+ self.relationships_schemas = relationships_schemas
+ self.spo = spo
+
+ def process(self, chunk):
+ if self.llm.get_llm_type() == "openai":
+ messages = [
+ {"role": "system", "content": self.generate_system_message()},
+ {"role": "user", "content": self.generate_prompt(chunk)},
+ ]
+ elif self.llm.get_llm_type() == "ernie":
+ if self.spo:
+ messages = [{"role": "user", "content":
generate_ernie_prompt_spo(chunk)}]
+ else:
+ messages = [{"role": "user", "content":
generate_ernie_message(chunk)}]
+ else:
+ raise Exception("llm type is not supported !")
+ output = self.llm.generate(messages)
+ return output
+
+ def generate_system_message(self) -> str:
+ if self.nodes_schemas and self.relationships_schemas:
+ return generate_system_message_with_schemas()
+ return generate_system_message()
+
+ def generate_prompt(self, data) -> str:
+ if self.nodes_schemas and self.relationships_schemas:
+ return generate_prompt_with_schemas(
+ data, self.nodes_schemas, self.relationships_schemas
+ )
+ return generate_prompt(data)
+
+ def run(self, data: Dict) -> Dict[str, List[Any]]:
+ token_usage_per_prompt = self.llm.num_tokens_from_string(
+ self.generate_system_message() + self.generate_prompt("")
+ )
+ chunked_data = split_string_to_fit_token_space(
+ llm=self.llm, string=self.text,
token_use_per_string=token_usage_per_prompt
+ )
+
+ results = []
+ for chunk in chunked_data:
+ proceeded_chunk = self.process(chunk)
+ results.append(proceeded_chunk)
+ if self.spo:
+ results = get_spo_from_result(results)
+ else:
+ results = get_nodes_and_relationships_from_result(results)
+ return results
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
index 60d816a..9a94a11 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
+++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
@@ -20,9 +20,8 @@ import re
from typing import Set, Dict, Any, Optional
from hugegraph_llm.llms.base import BaseLLM
-from hugegraph_llm.llms.openai_llm import OpenAIChat
-from hugegraph_llm.operators.utils_op import nltk_helper
-
+from hugegraph_llm.llms.init_llm import LLMs
+from hugegraph_llm.operators.common_op.nltk_helper import NLTKHelper
DEFAULT_KEYWORDS_EXTRACT_TEMPLATE_TMPL = (
"A question is provided below. Given the question, "
@@ -55,15 +54,13 @@ class KeywordExtract:
max_keywords: int = 5,
extract_template: Optional[str] = None,
expand_template: Optional[str] = None,
- language: str = 'english',
+ language: str = "english",
):
self._llm = llm
self._query = text
self._language = language.lower()
self._max_keywords = max_keywords
- self._extract_template = (
- extract_template or DEFAULT_KEYWORDS_EXTRACT_TEMPLATE_TMPL
- )
+ self._extract_template = extract_template or
DEFAULT_KEYWORDS_EXTRACT_TEMPLATE_TMPL
self._expand_template = expand_template or
DEFAULT_KEYWORDS_EXPAND_TEMPLATE_TMPL
def run(self, context: Dict[str, Any]) -> Dict[str, Any]:
@@ -74,13 +71,13 @@ class KeywordExtract:
context["query"] = self._query
if self._llm is None:
- self._llm = context.get("llm") or OpenAIChat()
+ self._llm = context.get("llm") or LLMs().get_llm()
assert isinstance(self._llm, BaseLLM), "Invalid LLM Object."
if context.get("llm") is None:
context["llm"] = self._llm
- if isinstance(context.get('language'), str):
- self._language = context['language'].lower()
+ if isinstance(context.get("language"), str):
+ self._language = context["language"].lower()
else:
context["language"] = self._language
@@ -126,7 +123,7 @@ class KeywordExtract:
response = response.strip() # Strip newlines from responses.
if response.startswith(start_token):
- response = response[len(start_token):]
+ response = response[len(start_token) :]
for k in response.split(","):
rk = k
@@ -142,7 +139,7 @@ class KeywordExtract:
sub_tokens = re.findall(r"\w+", token)
if len(sub_tokens) > 1:
results.update(
- {w for w in sub_tokens if w not in
nltk_helper.stopwords(lang=self._language)}
+ {w for w in sub_tokens if w not in
NLTKHelper().stopwords(lang=self._language)}
)
return results
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/parse_text_to_data.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/parse_text_to_data.py
deleted file mode 100644
index 05d41b2..0000000
--- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/parse_text_to_data.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-import re
-from typing import List, Any, Dict
-
-from hugegraph_llm.llms.base import BaseLLM
-from hugegraph_llm.operators.llm_op.unstructured_data_utils import (
- nodes_text_to_list_of_dict,
- nodes_schemas_text_to_list_of_dict,
- relationships_schemas_text_to_list_of_dict,
- relationships_text_to_list_of_dict,
-)
-
-
-def generate_system_message() -> str:
- return """
-You are a data scientist working for a company that is building a graph
database. Your task is to extract information from data and convert it into a
graph database.
-Provide a set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of
relationships in the form [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES]
and a set of NodesSchemas in the form [ENTITY_TYPE, PRIMARY_KEY, PROPERTIES]
and a set of RelationshipsSchemas in the form [ENTITY_TYPE_1, RELATIONSHIP,
ENTITY_TYPE_2, PROPERTIES]
-It is important that the ENTITY_ID_1 and ENTITY_ID_2 exists as nodes with a
matching ENTITY_ID. If you can't pair a relationship with a pair of nodes don't
add it.
-When you find a node or relationship you want to add try to create a generic
TYPE for it that describes the entity you can also think of it as a label.
-
-Here is an example
-The input you will be given:
-Data: Alice lawyer and is 25 years old and Bob is her roommate since 2001. Bob
works as a journalist. Alice owns a the webpage www.alice.com and Bob owns the
webpage www.bob.com.
-The output you need to provide:
-Nodes: ["Alice", "Person", {"age": 25, "occupation": "lawyer", "name":
"Alice"}], ["Bob", "Person", {"occupation": "journalist", "name": "Bob"}],
["alice.com", "Webpage", {"name": "alice.com", "url": "www.alice.com"}],
["bob.com", "Webpage", {"name": "bob.com", "url": "www.bob.com"}]
-Relationships: [{"Person": "Alice"}, "roommate", {"Person": "Bob"}, {"start":
2021}], [{"Person": "Alice"}, "owns", {"Webpage": "alice.com"}, {}],
[{"Person": "Bob"}, "owns", {"Webpage": "bob.com"}, {}]
-NodesSchemas: ["Person", "name", {"age": "int", "name": "text", "occupation":
"text"}], ["Webpage", "name", {"name": "text", "url": "text"}]
-RelationshipsSchemas :["Person", "roommate", "Person", {"start": "int"}],
["Person", "owns", "Webpage", {}]
-"""
-
-
-def generate_system_message_with_schemas() -> str:
- return """
-You are a data scientist working for a company that is building a graph
database. Your task is to extract information from data and convert it into a
graph database.
-Provide a set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of
relationships in the form [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES]
and a set of NodesSchemas in the form [ENTITY_TYPE, PRIMARY_KEY, PROPERTIES]
and a set of RelationshipsSchemas in the form [ENTITY_TYPE_1, RELATIONSHIP,
ENTITY_TYPE_2, PROPERTIES]
-It is important that the ENTITY_ID_1 and ENTITY_ID_2 exists as nodes with a
matching ENTITY_ID. If you can't pair a relationship with a pair of nodes don't
add it.
-When you find a node or relationship you want to add try to create a generic
TYPE for it that describes the entity you can also think of it as a label.
-
-Here is an example
-The input you will be given:
-Data: Alice lawyer and is 25 years old and Bob is her roommate since 2001. Bob
works as a journalist. Alice owns a the webpage www.alice.com and Bob owns the
webpage www.bob.com.
-NodesSchemas: ["Person", "name", {"age": "int", "name": "text", "occupation":
"text"}], ["Webpage", "name", {"name": "text", "url": "text"}]
-RelationshipsSchemas :["Person", "roommate", "Person", {"start": "int"}],
["Person", "owns", "Webpage", {}]
-The output you need to provide:
-Nodes: ["Alice", "Person", {"age": 25, "occupation": "lawyer", "name":
"Alice"}], ["Bob", "Person", {"occupation": "journalist", "name": "Bob"}],
["alice.com", "Webpage", {"name": "alice.com", "url": "www.alice.com"}],
["bob.com", "Webpage", {"name": "bob.com", "url": "www.bob.com"}]
-Relationships: [{"Person": "Alice"}, "roommate", {"Person": "Bob"}, {"start":
2021}], [{"Person": "Alice"}, "owns", {"Webpage": "alice.com"}, {}],
[{"Person": "Bob"}, "owns", {"Webpage": "bob.com"}, {}]
-NodesSchemas: ["Person", "name", {"age": "int", "name": "text", "occupation":
"text"}], ["Webpage", "name", {"name": "text", "url": "text"}]
-RelationshipsSchemas :["Person", "roommate", "Person", {"start": "int"}],
["Person", "owns", "Webpage", {}]
-"""
-
-
-def generate_prompt(data) -> str:
- return f"""
-Data: {data}"""
-
-
-def generate_prompt_with_schemas(data, nodes_schemas, relationships_schemas)
-> str:
- return f"""
-Data: {data}
-NodesSchemas: {nodes_schemas}
-RelationshipsSchemas: {relationships_schemas}"""
-
-
-def split_string(string, max_length) -> List[str]:
- return [string[i: i + max_length] for i in range(0, len(string),
max_length)]
-
-
-def split_string_to_fit_token_space(
- llm: BaseLLM, string: str, token_use_per_string: int
-) -> List[str]:
- allowed_tokens = llm.max_allowed_token_length() - token_use_per_string
- chunked_data = split_string(string, 500)
- combined_chunks = []
- current_chunk = ""
- for chunk in chunked_data:
- if (
- llm.num_tokens_from_string(current_chunk) +
llm.num_tokens_from_string(chunk)
- < allowed_tokens
- ):
- current_chunk += chunk
- else:
- combined_chunks.append(current_chunk)
- current_chunk = chunk
- combined_chunks.append(current_chunk)
-
- return combined_chunks
-
-
-def get_nodes_and_relationships_from_result(result):
- regex = (
-
r"Nodes:\s+(.*?)\s?\s?Relationships:\s+(.*?)\s?\s?NodesSchemas:\s+(.*?)\s?\s?\s?"
- r"RelationshipsSchemas:\s?\s?(.*)"
- )
- internal_regex = r"\[(.*?)\]"
- nodes = []
- relationships = []
- nodes_schemas = []
- relationships_schemas = []
- for row in result:
- parsing = re.match(regex, row, flags=re.S)
- if parsing is None:
- continue
- raw_nodes = str(parsing.group(1))
- raw_relationships = parsing.group(2)
- raw_nodes_schemas = parsing.group(3)
- raw_relationships_schemas = parsing.group(4)
- nodes.extend(re.findall(internal_regex, raw_nodes))
- relationships.extend(re.findall(internal_regex, raw_relationships))
- nodes_schemas.extend(re.findall(internal_regex, raw_nodes_schemas))
- relationships_schemas.extend(re.findall(internal_regex,
raw_relationships_schemas))
- result = {"nodes": [], "relationships": [], "nodes_schemas": [],
"relationships_schemas": []}
- result["nodes"].extend(nodes_text_to_list_of_dict(nodes))
-
result["relationships"].extend(relationships_text_to_list_of_dict(relationships))
-
result["nodes_schemas"].extend(nodes_schemas_text_to_list_of_dict(nodes_schemas))
- result["relationships_schemas"].extend(
- relationships_schemas_text_to_list_of_dict(relationships_schemas)
- )
- return result
-
-
-class ParseTextToData:
- llm: BaseLLM
-
- def __init__(self, llm: BaseLLM, text: str) -> None:
- self.llm = llm
- self.text = text
-
- def process(self, chunk):
- messages = [
- {"role": "system", "content": generate_system_message()},
- {"role": "user", "content": generate_prompt(chunk)},
- ]
-
- output = self.llm.generate(messages)
- return output
-
- def run(self, data: Dict) -> Dict[str, List[Any]]:
- system_message = generate_system_message()
- prompt_string = generate_prompt("")
- token_usage_per_prompt =
self.llm.num_tokens_from_string(system_message + prompt_string)
- chunked_data = split_string_to_fit_token_space(
- llm=self.llm, string=self.text,
token_use_per_string=token_usage_per_prompt
- )
-
- results = []
- for chunk in chunked_data:
- proceeded_chunk = self.process(chunk)
- results.append(proceeded_chunk)
- results = get_nodes_and_relationships_from_result(results)
-
- return results
-
-
-class ParseTextToDataWithSchemas:
- llm: BaseLLM
-
- def __init__(self, llm: BaseLLM, text: str, nodes_schema,
relationships_schemas) -> None:
- self.llm = llm
- self.text = text
- self.data = {}
- self.nodes_schemas = nodes_schema
- self.relationships_schemas = relationships_schemas
-
- def process_with_schemas(self, chunk):
- messages = [
- {"role": "system", "content":
generate_system_message_with_schemas()},
- {
- "role": "user",
- "content": generate_prompt_with_schemas(
- chunk, self.nodes_schemas, self.relationships_schemas
- ),
- },
- ]
-
- output = self.llm.generate(messages)
- return output
-
- def run(self) -> Dict[str, List[Any]]:
- system_message = generate_system_message_with_schemas()
- prompt_string = generate_prompt_with_schemas("", "", "")
- token_usage_per_prompt =
self.llm.num_tokens_from_string(system_message + prompt_string)
- chunked_data = split_string_to_fit_token_space(
- llm=self.llm, string=self.text,
token_use_per_string=token_usage_per_prompt
- )
-
- results = []
- for chunk in chunked_data:
- proceeded_chunk = self.process_with_schemas(chunk)
- results.append(proceeded_chunk)
- results = get_nodes_and_relationships_from_result(results)
- return results
diff --git
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/unstructured_data_utils.py
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/unstructured_data_utils.py
index f02b9cf..38eabb1 100644
---
a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/unstructured_data_utils.py
+++
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/unstructured_data_utils.py
@@ -45,6 +45,7 @@ def nodes_text_to_list_of_dict(nodes):
else:
properties = properties.group(0)
properties = properties.replace("True", "true")
+ properties = properties.replace("\\", "")
try:
properties = json.loads(properties)
except json.decoder.JSONDecodeError:
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
b/hugegraph-llm/src/hugegraph_llm/utils/__init__.py
similarity index 95%
copy from hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
copy to hugegraph-llm/src/hugegraph_llm/utils/__init__.py
index 309b3ca..13a8339 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/__init__.py
@@ -14,6 +14,3 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-
-
-from .nltk_helper import nltk_helper
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/config.py
b/hugegraph-llm/src/hugegraph_llm/utils/config.py
new file mode 100644
index 0000000..d7ec13f
--- /dev/null
+++ b/hugegraph-llm/src/hugegraph_llm/utils/config.py
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import configparser
+import os
+
+
+class Config:
+ def __init__(self, config_file=None, section=None):
+ if config_file is None:
+ root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+ config_file = os.path.join(root_dir, "config", "config.ini")
+ if section is None:
+ raise Exception("config section cannot be none !")
+ self.config_file = config_file
+ self.config = configparser.ConfigParser()
+ self.config.read(self.config_file)
+ self.section = section
+
+ def get_config(self):
+ return self.config
+
+ def get_graph_ip(self):
+ return self.config.get(self.section, "ip")
+
+ def get_graph_port(self):
+ return self.config.get(self.section, "port")
+
+ def get_graph_user(self):
+ return self.config.get(self.section, "user")
+
+ def get_graph_pwd(self):
+ return self.config.get(self.section, "pwd")
+
+ def get_graph_name(self):
+ return self.config.get(self.section, "graph")
+
+ def get_llm_api_key(self):
+ return self.config.get(self.section, "api_key")
+
+ def get_llm_secret_key(self):
+ return self.config.get(self.section, "secret_key")
+
+ def get_llm_ernie_url(self):
+ return self.config.get(self.section, "ernie_url")
+
+ def get_llm_type(self):
+ return self.config.get(self.section, "type")
+
+ def get_llm_model_name(self):
+ return self.config.get(self.section, "model_name")
+
+ def get_llm_max_token(self):
+ return self.config.get(self.section, "max_token")
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py b/hugegraph-llm/src/hugegraph_llm/utils/constants.py
similarity index 90%
rename from hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
rename to hugegraph-llm/src/hugegraph_llm/utils/constants.py
index 309b3ca..263bb9b 100644
--- a/hugegraph-llm/src/hugegraph_llm/operators/utils_op/__init__.py
+++ b/hugegraph-llm/src/hugegraph_llm/utils/constants.py
@@ -16,4 +16,6 @@
# under the License.
-from .nltk_helper import nltk_helper
+class Constants(str):
+ HUGEGRAPH_CONFIG = "hugegraph"
+ LLM_CONFIG = "llm"
diff --git a/hugegraph-python-client/src/pyhugegraph/api/graph.py b/hugegraph-python-client/src/pyhugegraph/api/graph.py
index 393f0e4..1b3fefa 100644
--- a/hugegraph-python-client/src/pyhugegraph/api/graph.py
+++ b/hugegraph-python-client/src/pyhugegraph/api/graph.py
@@ -21,8 +21,17 @@ from pyhugegraph.utils.huge_requests import HugeSession
from pyhugegraph.api.common import HugeParamsBase
from pyhugegraph.structure.vertex_data import VertexData
from pyhugegraph.structure.edge_data import EdgeData
-from pyhugegraph.utils.exceptions import NotFoundError, CreateError, RemoveError, UpdateError
-from pyhugegraph.utils.util import create_exception, check_if_authorized, check_if_success
+from pyhugegraph.utils.exceptions import (
+ NotFoundError,
+ CreateError,
+ RemoveError,
+ UpdateError,
+)
+from pyhugegraph.utils.util import (
+ create_exception,
+ check_if_authorized,
+ check_if_success,
+)
class GraphManager(HugeParamsBase):
@@ -175,7 +184,12 @@ class GraphManager(HugeParamsBase):
def addEdge(self, edge_label, out_id, in_id, properties):
url = f"{self._host}/graphs/{self._graph_name}/graph/edges"
- data = {"label": edge_label, "outV": out_id, "inV": in_id, "properties": properties}
+ data = {
+ "label": edge_label,
+ "outV": out_id,
+ "inV": in_id,
+ "properties": properties,
+ }
response = self.session.post(
url,
data=json.dumps(data),
@@ -256,7 +270,13 @@ class GraphManager(HugeParamsBase):
return res
def getEdgeByPage(
- self, label=None, vertex_id=None, direction=None, limit=0, page=None, properties=None
+ self,
+ label=None,
+ vertex_id=None,
+ direction=None,
+ limit=0,
+ page=None,
+ properties=None,
):
url = f"{self._host}/graphs/{self._graph_name}/graph/edges?"
diff --git a/hugegraph-python-client/src/pyhugegraph/api/graphs.py b/hugegraph-python-client/src/pyhugegraph/api/graphs.py
index 9db1b56..6d427ad 100644
--- a/hugegraph-python-client/src/pyhugegraph/api/graphs.py
+++ b/hugegraph-python-client/src/pyhugegraph/api/graphs.py
@@ -59,8 +59,10 @@ class GraphsManager(HugeParamsBase):
return ""
def clear_graph_all_data(self):
- url = f"{self._host}/graphs/{self._graph_name}/clear?confirm_message=" \
- f"{Constants.CONFORM_MESSAGE}"
+ url = (
+ f"{self._host}/graphs/{self._graph_name}/clear?confirm_message="
+ f"{Constants.CONFORM_MESSAGE}"
+ )
response = self.session.delete(url, auth=self._auth, headers=self._headers)
if check_if_success(response, NotFoundError(response.content)):
return str(response.content)
diff --git a/hugegraph-python-client/src/pyhugegraph/api/gremlin.py b/hugegraph-python-client/src/pyhugegraph/api/gremlin.py
index fcffd98..daa22f1 100644
--- a/hugegraph-python-client/src/pyhugegraph/api/gremlin.py
+++ b/hugegraph-python-client/src/pyhugegraph/api/gremlin.py
@@ -42,15 +42,15 @@ class GremlinManager(HugeParamsBase):
url = f"{self._host}/gremlin"
gremlin_data = GremlinData(gremlin)
gremlin_data.aliases = {
- 'graph': self._graph_name,
- 'g': '__g_' + self._graph_name
+ "graph": self._graph_name,
+ "g": "__g_" + self._graph_name,
}
response = self.session.post(
url,
data=gremlin_data.to_json(),
auth=self._auth,
headers=self._headers,
- timeout=self._timeout
+ timeout=self._timeout,
)
error = NotFoundError(f"Gremlin can't get results: {response.content}")
if check_if_success(response, error):
diff --git a/hugegraph-python-client/src/pyhugegraph/api/variable.py b/hugegraph-python-client/src/pyhugegraph/api/variable.py
index 16abe61..cf1549e 100644
--- a/hugegraph-python-client/src/pyhugegraph/api/variable.py
+++ b/hugegraph-python-client/src/pyhugegraph/api/variable.py
@@ -24,7 +24,6 @@ from pyhugegraph.utils.util import check_if_success
class VariableManager(HugeParamsBase):
-
def __init__(self, graph_instance):
super().__init__(graph_instance)
self.session = self.set_session(HugeSession.new_session())
@@ -38,8 +37,8 @@ class VariableManager(HugeParamsBase):
self.session.close()
def set(self, key, value):
- url = f'{self._host}/graphs/{self._graph_name}/variables/{key}'
- data = {'data': value}
+ url = f"{self._host}/graphs/{self._graph_name}/variables/{key}"
+ data = {"data": value}
response = self.session.put(
url,
@@ -53,38 +52,29 @@ class VariableManager(HugeParamsBase):
return {}
def get(self, key):
- url = f'{self._host}/graphs/{self._graph_name}/variables/{key}'
+ url = f"{self._host}/graphs/{self._graph_name}/variables/{key}"
response = self.session.get(
- url,
- auth=self._auth,
- headers=self._headers,
- timeout=self._timeout
+ url, auth=self._auth, headers=self._headers, timeout=self._timeout
)
if check_if_success(response, NotFoundError(response.content)):
return response.json()
return {}
def all(self):
- url = f'{self._host}/graphs/{self._graph_name}/variables'
+ url = f"{self._host}/graphs/{self._graph_name}/variables"
response = self.session.get(
- url,
- auth=self._auth,
- headers=self._headers,
- timeout=self._timeout
+ url, auth=self._auth, headers=self._headers, timeout=self._timeout
)
if check_if_success(response, NotFoundError(response.content)):
return response.json()
return {}
def remove(self, key):
- url = f'{self._host}/graphs/{self._graph_name}/variables/{key}'
+ url = f"{self._host}/graphs/{self._graph_name}/variables/{key}"
response = self.session.delete(
- url,
- auth=self._auth,
- headers=self._headers,
- timeout=self._timeout
+ url, auth=self._auth, headers=self._headers, timeout=self._timeout
)
check_if_success(response, NotFoundError(response.content))
diff --git a/hugegraph-python-client/src/pyhugegraph/structure/gremlin_data.py b/hugegraph-python-client/src/pyhugegraph/structure/gremlin_data.py
index 9440b84..81068bf 100644
--- a/hugegraph-python-client/src/pyhugegraph/structure/gremlin_data.py
+++ b/hugegraph-python-client/src/pyhugegraph/structure/gremlin_data.py
@@ -19,7 +19,6 @@ import json
class GremlinData:
-
def __init__(self, gremlin):
self.__gremlin = gremlin
self.__bindings = {}
@@ -59,8 +58,10 @@ class GremlinData:
self.__aliases = _aliases
def __repr__(self):
- res = f"gremlin: {self.__gremlin}, bindings: {self.__bindings}," \
- f"language: {self.__language}, aliases: {self.__aliases}"
+ res = (
+ f"gremlin: {self.__gremlin}, bindings: {self.__bindings},"
+ f"language: {self.__language}, aliases: {self.__aliases}"
+ )
return res
def to_json(self):
@@ -68,6 +69,5 @@ class GremlinData:
class GremlinDataEncoder(json.JSONEncoder):
-
def default(self, o):
- return {k.split('__')[1]: v for k, v in vars(o).items()}
+ return {k.split("__")[1]: v for k, v in vars(o).items()}
diff --git a/hugegraph-python-client/src/tests/api/test_gremlin.py b/hugegraph-python-client/src/tests/api/test_gremlin.py
index 45d36ad..8bd6389 100644
--- a/hugegraph-python-client/src/tests/api/test_gremlin.py
+++ b/hugegraph-python-client/src/tests/api/test_gremlin.py
@@ -24,7 +24,6 @@ from tests.client_utils import ClientUtils
class TestGremlin(unittest.TestCase):
-
client = None
gremlin = None
@@ -49,29 +48,29 @@ class TestGremlin(unittest.TestCase):
pass
def test_query_all_vertices(self):
- vertices = self.gremlin.exec('g.V()')
- lst = vertices.get('data', [])
+ vertices = self.gremlin.exec("g.V()")
+ lst = vertices.get("data", [])
assert 6 == len(lst)
- self.gremlin.exec('g.V().drop()')
- vertices = self.gremlin.exec('g.V()')
- lst = vertices.get('data', [])
+ self.gremlin.exec("g.V().drop()")
+ vertices = self.gremlin.exec("g.V()")
+ lst = vertices.get("data", [])
assert 0 == len(lst)
def test_query_all_edges(self):
- edges = self.gremlin.exec('g.E()')
- lst = edges.get('data', [])
+ edges = self.gremlin.exec("g.E()")
+ lst = edges.get("data", [])
assert 6 == len(lst)
- self.gremlin.exec('g.E().drop()')
- edges = self.gremlin.exec('g.E()')
- lst = edges.get('data', [])
+ self.gremlin.exec("g.E().drop()")
+ edges = self.gremlin.exec("g.E()")
+ lst = edges.get("data", [])
assert 0 == len(lst)
def test_primitive_object(self):
- result = self.gremlin.exec('1 + 2')
+ result = self.gremlin.exec("1 + 2")
print(result)
- result_set = result.get('data', [])
+ result_set = result.get("data", [])
assert 1 == len(result_set)
data = result_set[0]
@@ -79,14 +78,14 @@ class TestGremlin(unittest.TestCase):
assert 3 == data
def test_empty_result_set(self):
- result = self.gremlin.exec('g.V().limit(0)')
- lst = result.get('data', [])
+ result = self.gremlin.exec("g.V().limit(0)")
+ lst = result.get("data", [])
assert 0 == len(lst)
def test_invalid_gremlin(self):
with pytest.raises(NotFoundError):
- assert self.gremlin.exec('g.V2()')
+ assert self.gremlin.exec("g.V2()")
def test_security_operation(self):
with pytest.raises(NotFoundError):
- assert self.gremlin.exec('System.exit(-1)')
+ assert self.gremlin.exec("System.exit(-1)")
diff --git a/hugegraph-python-client/src/tests/api/test_variable.py b/hugegraph-python-client/src/tests/api/test_variable.py
index cf73b99..e50bb75 100644
--- a/hugegraph-python-client/src/tests/api/test_variable.py
+++ b/hugegraph-python-client/src/tests/api/test_variable.py
@@ -24,7 +24,6 @@ from tests.client_utils import ClientUtils
class TestVariable(unittest.TestCase):
-
client = None
variable = None
@@ -45,38 +44,38 @@ class TestVariable(unittest.TestCase):
def test_all(self):
assert 0 == len(self.variable.all())
- self.variable.set('student', 'mary')
- self.variable.set('price', 20.86)
+ self.variable.set("student", "mary")
+ self.variable.set("price", 20.86)
dic = self.variable.all()
assert 2 == len(dic)
- assert 'mary' == dic.get('student', None)
- assert 20.86 == dic.get('price', None)
+ assert "mary" == dic.get("student", None)
+ assert 20.86 == dic.get("price", None)
def test_remove(self):
- self.variable.set('lang', 'java')
+ self.variable.set("lang", "java")
dic = self.variable.all()
assert 1 == len(dic)
- assert 'java' == dic.get('lang', None)
+ assert "java" == dic.get("lang", None)
- self.variable.remove('lang')
+ self.variable.remove("lang")
dic = self.variable.all()
assert 0 == len(dic)
- assert dic.get('lang', None) is None
+ assert dic.get("lang", None) is None
def test_set_and_get(self):
- self.variable.set('name', 'tom')
- self.variable.set('age', 18)
+ self.variable.set("name", "tom")
+ self.variable.set("age", 18)
assert 2 == len(self.variable.all())
- name = self.variable.get('name').get('name', None)
- assert 'tom' == name
- age = self.variable.get('age').get('age', None)
+ name = self.variable.get("name").get("name", None)
+ assert "tom" == name
+ age = self.variable.get("age").get("age", None)
assert 18 == age
def test_get_key_not_exist(self):
with pytest.raises(NotFoundError):
- assert self.variable.get('id').get('id') is None
+ assert self.variable.get("id").get("id") is None
def test_remove_key_not_exist(self):
- self.variable.remove('id')
+ self.variable.remove("id")
diff --git a/hugegraph-python-client/src/tests/client_utils.py b/hugegraph-python-client/src/tests/client_utils.py
index 166b526..9eacccb 100644
--- a/hugegraph-python-client/src/tests/client_utils.py
+++ b/hugegraph-python-client/src/tests/client_utils.py
@@ -22,13 +22,14 @@ class ClientUtils:
IP = "127.0.0.1"
PORT = 8080
GRAPH = "hugegraph"
- USERNAME = 'admin'
- PASSWORD = 'admin'
+ USERNAME = "admin"
+ PASSWORD = "admin"
TIMEOUT = 10
def __init__(self):
- self.client = PyHugeClient(self.IP, self.PORT, user=self.USERNAME,
- pwd=self.PASSWORD, graph=self.GRAPH)
+ self.client = PyHugeClient(
+ self.IP, self.PORT, user=self.USERNAME, pwd=self.PASSWORD, graph=self.GRAPH
+ )
assert self.client is not None
self.schema = self.client.schema()
@@ -49,20 +50,26 @@ class ClientUtils:
def init_vertex_label(self):
schema = self.schema
- schema.vertexLabel("person").properties("name", "age", "city").primaryKeys("name") \
- .nullableKeys("city").ifNotExist().create()
- schema.vertexLabel("software").properties("name", "lang", "price").primaryKeys("name") \
- .nullableKeys("price").ifNotExist().create()
- schema.vertexLabel("book").useCustomizeStringId().properties("name", "price") \
- .nullableKeys("price").ifNotExist().create()
+ schema.vertexLabel("person").properties("name", "age", "city").primaryKeys(
+ "name"
+ ).nullableKeys("city").ifNotExist().create()
+ schema.vertexLabel("software").properties("name", "lang", "price").primaryKeys(
+ "name"
+ ).nullableKeys("price").ifNotExist().create()
+ schema.vertexLabel("book").useCustomizeStringId().properties("name", "price").nullableKeys(
+ "price"
+ ).ifNotExist().create()
def init_edge_label(self):
schema = self.schema
- schema.edgeLabel("knows").sourceLabel("person").targetLabel("person") \
- .multiTimes().properties("date", "city").sortKeys("date") \
- .nullableKeys("city").ifNotExist().create()
- schema.edgeLabel("created").sourceLabel("person").targetLabel("software") \
- .properties("date", "city").nullableKeys("city").ifNotExist().create()
+ schema.edgeLabel("knows").sourceLabel("person").targetLabel(
+ "person"
+ ).multiTimes().properties("date", "city").sortKeys("date").nullableKeys(
+ "city"
+ ).ifNotExist().create()
+ schema.edgeLabel("created").sourceLabel("person").targetLabel("software").properties(
+ "date", "city"
+ ).nullableKeys("city").ifNotExist().create()
def init_vertices(self):
graph = self.graph
diff --git a/style/code_format_and_analysis.sh b/style/code_format_and_analysis.sh
new file mode 100644
index 0000000..225e751
--- /dev/null
+++ b/style/code_format_and_analysis.sh
@@ -0,0 +1,42 @@
+#! /bin/bash
+
+BLACK=false
+PYLINT=false
+ROOT_DIR=$(pwd)
+echo ${ROOT_DIR}
+# Parse command line arguments
+while getopts ":bp" opt; do
+ case ${opt} in
+ b )
+ BLACK=true
+ ;;
+ p )
+ PYLINT=true
+ ;;
+ \? )
+ echo "Usage: cmd [-b] [-p]"
+ ;;
+ esac
+done
+
+# If no arguments were provided, run both BLACK and PYLINT
+if [ "$BLACK" = false ] && [ "$PYLINT" = false ]; then
+ BLACK=true
+ PYLINT=true
+fi
+
+# Run BLACK if -b is specified
+if [ "$BLACK" = true ] ; then
+ echo "[black] Start to check code style and auto format"
+ # https://github.com/psf/BLACK/issues/1802
+ black --line-length=100 ${ROOT_DIR}
+fi
+
+# Run PYLINT if -p is specified
+if [ "$PYLINT" = true ] ; then
+ echo "[pylint] Start code analysis and check,
+ we need to manually fix all the warnings mentioned below before commit! "
+ export PYTHONPATH=${ROOT_DIR}/hugegraph-llm/src:${ROOT_DIR}/hugegraph-python-client/src
+ pylint --rcfile=${ROOT_DIR}/style/pylint.conf ${ROOT_DIR}/hugegraph-llm
+ #pylint --rcfile=${ROOT_DIR}/style/pylint.conf ${ROOT_DIR}/hugegraph-python-client
+fi
diff --git a/pylint.conf b/style/pylint.conf
similarity index 100%
rename from pylint.conf
rename to style/pylint.conf