javeme commented on code in PR #7:
URL:
https://github.com/apache/incubator-hugegraph-ai/pull/7#discussion_r1359759616
##########
hugegraph-llm/api/src/llm/basellm.py:
##########
@@ -0,0 +1,36 @@
+from abc import ABC, abstractmethod
+from typing import (
+ Any,
+ List,
+)
+
+
+def raise_(ex):
Review Comment:
can we move to a file like exceptions.py
##########
hugegraph-llm/api/src/text2kg/data_to_data.py:
##########
@@ -0,0 +1,207 @@
+import json
+import re
+import time
+from itertools import groupby
+
+from text2kg.unstructured_data_utils import nodesTextToListOfDict,
nodesschemasTextToListOfDict, \
+ relationshipTextToListOfDict, relationshipschemaTextToListOfDict
+
+
+def generate_system_message_for_nodes() -> str:
+ return """Your task is to identify if there are duplicated nodes and if so
merge them into one nod. Only merge the nodes that refer to the same entity.
+You will be given different datasets of nodes and some of these nodes may be
duplicated or refer to the same entity.
+The datasets contains nodes in the form [ENTITY_ID, TYPE, PROPERTIES]. When
you have completed your task please give me the
+resulting nodes in the same format. Only return the nodes and relationships no
other text. If there is no duplicated nodes return the original nodes.
+
+Here is an example of the input you will be given:
+["Alice", "Person", {"age" : 25, "occupation": "lawyer", "name":"Alice"}],
["Bob", "Person", {"occupation": "journalist", "name": "Bob"}], ["alice.com",
"Webpage", {"url": "www.alice.com"}], ["bob.com", "Webpage", {"url":
"www.bob.com"}]
+"""
+
+
+def generate_system_message_for_relationships() -> str:
+ return """
Review Comment:
make it the same style as generate_system_message_for_nodes?
##########
hugegraph-llm/api/src/text2kg/data_to_kg.py:
##########
@@ -0,0 +1,169 @@
+import os
+from itertools import groupby
+
+from hugegraph.connection import PyHugeGraph
+
+
+
+def generate_new_relationships(nodes_schemas_data, relationships_data):
+ labelId = dict()
+ i = 1
+ old_label = []
+ for item in nodes_schemas_data:
+ label = item["label"]
+ if label in old_label:
+ continue
+ else:
+ labelId[label] = i
+ i += 1
+ old_label.append(label)
+ new_relationships_data = []
+
+ for relationship in relationships_data:
+ start = relationship['start']
+ end = relationship['end']
+ type = relationship['type']
+ properties = relationship['properties']
+ new_start = []
+ new_end = []
+ for key, value in labelId.items():
+ for key1, value1 in start.items():
+ if key1 == key:
+ new_start = f'{value}' + ':' + f'{value1}'
+ for key1, value1 in end.items():
+ if key1 == key:
+ new_end = f'{value}' + ':' + f'{value1}'
+ relationships_data = dict()
+ relationships_data["start"] = new_start
+ relationships_data["end"] = new_end
+ relationships_data["type"] = type
+ relationships_data["properties"] = properties
+ new_relationships_data.append(relationships_data)
+ return new_relationships_data
+
+
+def generate_schema_properties(data):
+ schema_properties_statements = []
+ if len(data) == 3:
+ for item in data:
+ properties = item['properties']
+ for key, value in properties.items():
+ if value == 'int':
+
schema_properties_statements.append(f"schema.propertyKey('{key}').asInt().ifNotExist().create()")
+ elif value == 'text':
+
schema_properties_statements.append(f"schema.propertyKey('{key}').asText().ifNotExist().create()")
+ else:
+ for item in data:
+ properties = item['properties']
+ for key, value in properties.items():
+ if value == 'int':
+
schema_properties_statements.append(f"schema.propertyKey('{key}').asInt().ifNotExist().create()")
+ elif value == 'text':
+
schema_properties_statements.append(f"schema.propertyKey('{key}').asText().ifNotExist().create()")
+ return schema_properties_statements
+
+
+def generate_schema_nodes(data):
+ schema_nodes_statements = []
+ for item in data:
+ label = item['label']
+ primaryKey = item['primaryKey']
+ properties = item['properties']
+
+ schema_statement = f"schema.vertexLabel('{label}').properties("
+ schema_statement += ', '.join(f"'{prop}'" for prop in
properties.keys())
+ schema_statement += f").nullableKeys("
+ schema_statement += ', '.join(f"'{prop}'" for prop in
properties.keys() if prop != primaryKey)
+ schema_statement +=
f").usePrimaryKeyId().primaryKeys('{primaryKey}').ifNotExist().create()"
+ schema_nodes_statements.append(schema_statement)
+ return schema_nodes_statements
+
+
+def generate_schema_relationships(data):
+ schema_relstionships_statements = []
+ for item in data:
+ start = item['start']
+ end = item['end']
+ type = item['type']
+ properties = item['properties']
+ schema_statement =
f"schema.edgeLabel('{type}').sourceLabel('{start}').targetLabel('{end}').properties("
+ schema_statement += ', '.join(f"'{prop}'" for prop in
properties.keys())
+ schema_statement += f").nullableKeys("
+ schema_statement += ', '.join(f"'{prop}'" for prop in
properties.keys())
+ schema_statement += f").ifNotExist().create()"
+ schema_relstionships_statements.append(schema_statement)
+ return schema_relstionships_statements
+
+
+def generate_nodes(data):
+ nodes = []
+ for item in data:
+ label = item['label']
+ properties = item['properties']
+ nodes.append(f"g.addVertex('{label}', {properties})")
+ return nodes
+
+
+def generate_relationships(data):
+ relationships = []
+ for item in data:
+ start = item['start']
+ end = item['end']
+ type = item['type']
+ properties = item['properties']
+ relationships.append(f"g.addEdge('{type}', '{start}', '{end}',
{properties})")
+ return relationships
+
+
+class DataToKg():
+ def __init__(self):
+ self.client = PyHugeGraph("127.0.0.1", "8080", user="admin",
pwd="admin", graph="hugegraph")
+ self.schema = self.client.schema()
+
+ def run(self, data: dict):
+ os.environ.pop("http_proxy")
+ os.environ.pop("https_proxy")
+ nodes = data["nodes"]
+ relationships = data["relationships"]
+ nodes_schemas = data["nodesschemas"]
+ relationships_schemas = data["relationshipsschemas"]
+ schema = self.schema
+ # properties schema
+ schema_nodes_properties = generate_schema_properties(nodes_schemas)
+ schema_relationships_properties =
generate_schema_properties(relationships_schemas)
+ for schema_nodes_property in schema_nodes_properties:
+ print(schema_nodes_property)
+ exec(schema_nodes_property)
+
+ for schema_relationships_property in schema_relationships_properties:
+ print(schema_relationships_property)
Review Comment:
prefer moving the print into exec()
##########
hugegraph-llm/api/src/llm/basellm.py:
##########
@@ -0,0 +1,36 @@
+from abc import ABC, abstractmethod
+from typing import (
+ Any,
+ List,
+)
+
+
+def raise_(ex):
+ raise ex
+
+
+class BaseLLM(ABC):
+ """LLM wrapper should take in a prompt and return a string."""
+
+ @abstractmethod
+ def generate(self, messages: List[str]) -> str:
+ """Comment"""
+
+ @abstractmethod
+ async def generateStreaming(
+ self, messages: List[str], onTokenCallback
+ ) -> List[Any]:
+ """Comment"""
Review Comment:
ditto
##########
hugegraph-llm/api/src/llm/basellm.py:
##########
@@ -0,0 +1,36 @@
+from abc import ABC, abstractmethod
Review Comment:
expect a license header
##########
hugegraph-llm/api/src/llm/openai.py:
##########
@@ -0,0 +1,81 @@
+from typing import (
+ Callable,
+ List,
+)
+
+import openai
+import tiktoken
+from retry import retry
+
+from api.src.llm.basellm import BaseLLM
+
+
+class OpenAIChat(BaseLLM):
+ """Wrapper around OpenAI Chat large language models."""
+
+ def __init__(
+ self,
+ openai_api_key: str,
+ model_name: str = "gpt-3.5-turbo",
+ max_tokens: int = 1000,
+ temperature: float = 0.0,
+ ) -> None:
+ openai.api_key = openai_api_key
+ self.model = model_name
+ self.max_tokens = max_tokens
+ self.temperature = temperature
+
+ # 定义了一个generate方法,用于生成文本
Review Comment:
to be translated
##########
hugegraph-llm/api/src/llm/basellm.py:
##########
@@ -0,0 +1,36 @@
+from abc import ABC, abstractmethod
+from typing import (
+ Any,
+ List,
+)
+
+
+def raise_(ex):
+ raise ex
+
+
+class BaseLLM(ABC):
+ """LLM wrapper should take in a prompt and return a string."""
+
+ @abstractmethod
+ def generate(self, messages: List[str]) -> str:
+ """Comment"""
Review Comment:
to be improved?
##########
hugegraph-llm/api/src/llm/openai.py:
##########
@@ -0,0 +1,81 @@
+from typing import (
+ Callable,
+ List,
+)
+
+import openai
+import tiktoken
+from retry import retry
+
+from api.src.llm.basellm import BaseLLM
+
+
+class OpenAIChat(BaseLLM):
+ """Wrapper around OpenAI Chat large language models."""
+
+ def __init__(
+ self,
+ openai_api_key: str,
+ model_name: str = "gpt-3.5-turbo",
+ max_tokens: int = 1000,
+ temperature: float = 0.0,
+ ) -> None:
+ openai.api_key = openai_api_key
+ self.model = model_name
+ self.max_tokens = max_tokens
+ self.temperature = temperature
+
+ # 定义了一个generate方法,用于生成文本
+ @retry(tries=3, delay=1)
+ def generate(
+ self,
+ messages: List[str],
+ ) -> str:
+ try:
+ completions = openai.ChatCompletion.create(
+ model=self.model,
+ temperature=self.temperature,
+ max_tokens=self.max_tokens,
+ messages=messages,
+ )
+ return completions.choices[0].message.content
+ # catch context length / do not retry
+ except openai.error.InvalidRequestError as e:
+ return str(f"Error: {e}")
+ # catch authorization errors / do not retry
+ except openai.error.AuthenticationError as e:
+ return "Error: The provided OpenAI API key is invalid"
+ except Exception as e:
+ print(f"Retrying LLM call {e}")
+ raise Exception()
+
+ async def generateStreaming(
+ self,
+ messages: List[str],
+ onTokenCallback=Callable[[str], None],
+ ) -> str:
+ result = []
+ completions = openai.ChatCompletion.create(
+ model=self.model,
+ temperature=self.temperature,
+ max_tokens=self.max_tokens,
+ messages=messages,
+ stream=True,
+ )
+ result = []
+ for message in completions:
+ # Process the streamed messages or perform any other desired action
+ delta = message["choices"][0]["delta"]
+ if "content" in delta:
+ result.append(delta["content"])
+ await onTokenCallback(message)
+ return result
+
+ def num_tokens_from_string(self, string: str) -> int:
+ encoding = tiktoken.encoding_for_model(self.model)
+ num_tokens = len(encoding.encode(string))
+ return num_tokens
+
+ def max_allowed_token_length(self) -> int:
+ # TODO: list all models and their max tokens from api
+ return 2049
Review Comment:
expect a blank line
##########
hugegraph-llm/api/src/text2kg/text_to_kg.py:
##########
@@ -0,0 +1,52 @@
+import os
+
+
+from api.src.llm.openai import OpenAIChat
+from text2kg.data_to_data import DataDisambiguation
+from text2kg.data_to_kg import DataToKg
+from text2kg.text_to_data import TextToData
+
+
+class TextToKg:
+ def __init__(self, name):
+ self.name = name
+ self.text2kg = []
+
+ def text2data(self, llm):
+ self.text2kg.append(TextToData(llm=llm))
+ return self
+
+ def data2data(self, llm):
Review Comment:
can we provide another name to distinguish the second data?
like method names:
1. parse_text_to_data
2. disambiguate_data
3. commit_data_to_kg
##########
hugegraph-llm/api/src/text2kg/text_to_kg.py:
##########
@@ -0,0 +1,52 @@
+import os
+
+
+from api.src.llm.openai import OpenAIChat
+from text2kg.data_to_data import DataDisambiguation
+from text2kg.data_to_kg import DataToKg
+from text2kg.text_to_data import TextToData
+
+
+class TextToKg:
+ def __init__(self, name):
+ self.name = name
+ self.text2kg = []
+
+ def text2data(self, llm):
+ self.text2kg.append(TextToData(llm=llm))
+ return self
+
+ def data2data(self, llm):
+ self.text2kg.append(DataDisambiguation(llm=llm))
+ return self
+
+ def data2kg(self):
+ self.text2kg.append(DataToKg())
+ return self
+
+ def run(self, result):
+ for i in self.text2kg:
+ result = i.run(result)
+
+
+
Review Comment:
keep the 2 blank lines?
##########
hugegraph-llm/api/src/text2kg/data_to_data.py:
##########
@@ -0,0 +1,207 @@
+import json
+import re
+import time
+from itertools import groupby
+
+from text2kg.unstructured_data_utils import nodesTextToListOfDict,
nodesschemasTextToListOfDict, \
+ relationshipTextToListOfDict, relationshipschemaTextToListOfDict
+
+
+def generate_system_message_for_nodes() -> str:
+ return """Your task is to identify if there are duplicated nodes and if so
merge them into one nod. Only merge the nodes that refer to the same entity.
+You will be given different datasets of nodes and some of these nodes may be
duplicated or refer to the same entity.
+The datasets contains nodes in the form [ENTITY_ID, TYPE, PROPERTIES]. When
you have completed your task please give me the
+resulting nodes in the same format. Only return the nodes and relationships no
other text. If there is no duplicated nodes return the original nodes.
+
+Here is an example of the input you will be given:
+["Alice", "Person", {"age" : 25, "occupation": "lawyer", "name":"Alice"}],
["Bob", "Person", {"occupation": "journalist", "name": "Bob"}], ["alice.com",
"Webpage", {"url": "www.alice.com"}], ["bob.com", "Webpage", {"url":
"www.bob.com"}]
+"""
+
+
+def generate_system_message_for_relationships() -> str:
+ return """
+Your task is to identify if a set of relationships make sense.
+If they do not make sense please remove them from the dataset.
+Some relationships may be duplicated or refer to the same entity.
+Please merge relationships that refer to the same entity.
+The datasets contains relationships in the form [{"ENTITY_TYPE_1":
"ENTITY_ID_1"}, RELATIONSHIP, {"ENTITY_TYPE_2": "ENTITY_ID_2"}, PROPERTIES].
+You will also be given a set of ENTITY_IDs that are valid.
+Some relationships may use ENTITY_IDs that are not in the valid set but refer
to a entity in the valid set.
+If a relationships refer to a ENTITY_ID in the valid set please change the ID
so it matches the valid ID.
+When you have completed your task please give me the valid relationships in
the same format. Only return the relationships no other text.
+
+Here is an example of the input you will be given:
+[{"Person": "Alice"}, "roommate", {"Person": "bob"}, {"start": 2021}],
[{"Person": "Alice"}, "owns", {"Webpage": "alice.com"}, {}], [{"Person":
"Bob"}, "owns", {"Webpage": "bob.com"}, {}]
+"""
+
+
+def generate_system_message_for_nodesSchemas() -> str:
+ return """Your task is to identify if there are duplicated nodes schemas
and if so merge them into one nod. Only merge the nodes schemas that refer to
the same entty_types.
+You will be given different node schemas, some of which may duplicate or
reference the same entty_types. Note: For node schemas with the same
entty_types, you need to merge them while merging all properties of the
entty_types.
+The datasets contains nodes schemas in the form [ENTITY_TYPE, PRIMARYKEY,
PROPERTIES]. When you have completed your task please give me the
+resulting nodes schemas in the same format. Only return the nodes schemas no
other text. If there is no duplicated nodes return the original nodes schemas.
+
+Here is an example of the input you will be given:
+["Person", "name", {"age": "int", "name": "text", "occupation": "text"}],
["Webpage", "url", {url: "text"}]
+The output:
+["Person", "name", {"age": "int", "name": "text", "occupation": "text"}],
["Webpage", "url", {url: "text"}]
+"""
+
+
+def generate_system_message_for_relationshipsSchemas() -> str:
+ return """
+Your task is to identify if a set of relationships schemas make sense.
+If they do not make sense please remove them from the dataset.
+Some relationships may be duplicated or refer to the same label.
+Please merge relationships that refer to the same label.
+The datasets contains relationships in the form [LABEL_ID_1, RELATIONSHIP,
LABEL_ID_2, PROPERTIES].
+You will also be given a set of LABELS_IDs that are valid.
+Some relationships may use LABELS_IDs that are not in the valid set but refer
to a LABEL in the valid set.
+If a relationships refer to a LABELS_IDs in the valid set please change the ID
so it matches the valid ID.
+When you have completed your task please give me the valid relationships in
the same format. Only return the relationships no other text.
+
+Here is an example of the input you will be given:
+["Person", "roommate", "Person", {"start": 2021}], ["Person", "owns",
"Webpage", {}]
+"""
+
+
+def generate_prompt(data) -> str:
+ return f""" Here is the data:
+{data}
+"""
+
+
+internalRegex = "\[(.*?)\]"
+
+
+class DataDisambiguation():
+ def __init__(self, llm) -> None:
+ self.llm = llm
+
+ def run(self, data: dict) -> dict[str, list[any]]:
+ nodes = sorted(data["nodes"], key=lambda x: x.get("label", ""))
+ relationships = data["relationships"]
+ nodes_schemas = data["nodesschemas"]
+ relationships_schemas = data["relationshipsschemas"]
+ new_nodes = []
+ new_relationships = []
+ new_nodes_schemas = []
+ new_relationships_schemas = []
+
+ node_groups = groupby(nodes, lambda x: x["label"])
+ for group in node_groups:
+ disString = ""
+ nodes_in_group = list(group[1])
+ if len(nodes_in_group) == 1:
+ new_nodes.extend(nodes_in_group)
+ continue
+
+ for node in nodes_in_group:
+ disString += (
+ '["'
+ + node["name"]
+ + '", "'
+ + node["label"]
+ + '", '
+ + json.dumps(node["properties"])
+ + "]\n"
+ )
+
+ messages = [
+ {"role": "system", "content":
generate_system_message_for_nodes()},
+ {"role": "user", "content": generate_prompt(disString)},
+ ]
+ rawNodes = self.llm.generate(messages)
+
+ n = re.findall(internalRegex, rawNodes)
+
+ new_nodes.extend(nodesTextToListOfDict(n))
+
+ time.sleep(20)
+
+ nodes_schemas_data = ""
+ for node_schema in nodes_schemas:
+ nodes_schemas_data += (
+ '["'
+ + node_schema["label"]
+ + '", '
+ + node_schema["primaryKey"]
+ + '", '
+ + json.dumps(node_schema["properties"])
+ + "]\n"
+ )
+
+ messages = [
+ {"role": "system", "content":
generate_system_message_for_nodesSchemas()},
+ {"role": "user", "content": generate_prompt(nodes_schemas_data)},
+ ]
+ rawNodesSchemas = self.llm.generate(messages)
+
+ n = re.findall(internalRegex, rawNodesSchemas)
+
+ new_nodes_schemas.extend(nodesschemasTextToListOfDict(n))
+
+ relationship_data = ""
+ for relation in relationships:
+ relationship_data += (
+ '["'
+ + json.dumps(relation["start"])
+ + '", "'
+ + relation["type"]
+ + '", "'
+ + json.dumps(relation["end"])
+ + '", '
+ + json.dumps(relation["properties"])
+ + "]\n"
+ )
+
+ node_labels = [node["name"] for node in new_nodes]
+ relationship_data += "Valid Nodes:\n" + "\n".join(node_labels)
+
+ messages = [
+ {
+ "role": "system",
+ "content": generate_system_message_for_relationships(),
+ },
+ {"role": "user", "content": generate_prompt(relationship_data)},
+ ]
+ rawRelationships = self.llm.generate(messages)
+ rels = re.findall(internalRegex, rawRelationships)
+ new_relationships.extend(relationshipTextToListOfDict(rels))
+
+
+ relationships_schemas_data = ""
+ for relationships_schema in relationships_schemas:
+ relationships_schemas_data += (
+ '["'
+ + relationships_schema["start"]
+ + '", "'
+ + relationships_schema["type"]
+ + '", "'
+ + relationships_schema["end"]
+ + '", '
+ + json.dumps(relationships_schema["properties"])
+ + "]\n"
+ )
+
+ node_schemas_labels = [nodes_schemas["label"] for nodes_schemas in
new_nodes_schemas]
+ relationships_schemas_data += "Valid Labels:\n" +
"\n".join(node_schemas_labels)
+
+ messages = [
+ {
+ "role": "system",
+ "content": generate_system_message_for_relationshipsSchemas(),
+ },
+ {"role": "user", "content":
generate_prompt(relationships_schemas_data)},
+ ]
+ rawRelationshipsSchema = self.llm.generate(messages)
+ schemaRels = re.findall(internalRegex, rawRelationshipsSchema)
+
new_relationships_schemas.extend(relationshipschemaTextToListOfDict(schemaRels))
+
+ print(2)
+ print("data2data-result: ")
+ print(new_nodes)
+ print(new_relationships)
+ print(new_nodes_schemas)
+ print(new_relationships_schemas)
+ return {"nodes": new_nodes, "relationships": new_relationships,
"nodesschemas": new_nodes_schemas, "relationshipsschemas":
new_relationships_schemas}
Review Comment:
nodesschemas => nodes_schemas
##########
hugegraph-llm/api/src/llm/openai.py:
##########
@@ -0,0 +1,81 @@
+from typing import (
+ Callable,
+ List,
+)
+
+import openai
+import tiktoken
+from retry import retry
+
+from api.src.llm.basellm import BaseLLM
+
+
+class OpenAIChat(BaseLLM):
+ """Wrapper around OpenAI Chat large language models."""
+
+ def __init__(
+ self,
+ openai_api_key: str,
+ model_name: str = "gpt-3.5-turbo",
+ max_tokens: int = 1000,
+ temperature: float = 0.0,
+ ) -> None:
+ openai.api_key = openai_api_key
+ self.model = model_name
+ self.max_tokens = max_tokens
+ self.temperature = temperature
+
+ # 定义了一个generate方法,用于生成文本
+ @retry(tries=3, delay=1)
+ def generate(
+ self,
+ messages: List[str],
+ ) -> str:
+ try:
+ completions = openai.ChatCompletion.create(
+ model=self.model,
+ temperature=self.temperature,
+ max_tokens=self.max_tokens,
+ messages=messages,
+ )
+ return completions.choices[0].message.content
+ # catch context length / do not retry
+ except openai.error.InvalidRequestError as e:
+ return str(f"Error: {e}")
+ # catch authorization errors / do not retry
+ except openai.error.AuthenticationError as e:
+ return "Error: The provided OpenAI API key is invalid"
+ except Exception as e:
+ print(f"Retrying LLM call {e}")
+ raise Exception()
+
+ async def generateStreaming(
Review Comment:
can we keep "generate_streaming" style?
##########
hugegraph-llm/api/src/text2kg/data_to_kg.py:
##########
@@ -0,0 +1,169 @@
+import os
+from itertools import groupby
+
+from hugegraph.connection import PyHugeGraph
+
+
+
+def generate_new_relationships(nodes_schemas_data, relationships_data):
+ labelId = dict()
+ i = 1
+ old_label = []
+ for item in nodes_schemas_data:
+ label = item["label"]
+ if label in old_label:
+ continue
+ else:
+ labelId[label] = i
+ i += 1
+ old_label.append(label)
+ new_relationships_data = []
+
+ for relationship in relationships_data:
+ start = relationship['start']
+ end = relationship['end']
+ type = relationship['type']
+ properties = relationship['properties']
+ new_start = []
+ new_end = []
+ for key, value in labelId.items():
+ for key1, value1 in start.items():
+ if key1 == key:
+ new_start = f'{value}' + ':' + f'{value1}'
+ for key1, value1 in end.items():
+ if key1 == key:
+ new_end = f'{value}' + ':' + f'{value1}'
+ relationships_data = dict()
+ relationships_data["start"] = new_start
+ relationships_data["end"] = new_end
+ relationships_data["type"] = type
+ relationships_data["properties"] = properties
+ new_relationships_data.append(relationships_data)
+ return new_relationships_data
+
+
+def generate_schema_properties(data):
+ schema_properties_statements = []
+ if len(data) == 3:
+ for item in data:
+ properties = item['properties']
+ for key, value in properties.items():
+ if value == 'int':
+
schema_properties_statements.append(f"schema.propertyKey('{key}').asInt().ifNotExist().create()")
+ elif value == 'text':
+
schema_properties_statements.append(f"schema.propertyKey('{key}').asText().ifNotExist().create()")
+ else:
+ for item in data:
+ properties = item['properties']
+ for key, value in properties.items():
+ if value == 'int':
+
schema_properties_statements.append(f"schema.propertyKey('{key}').asInt().ifNotExist().create()")
+ elif value == 'text':
+
schema_properties_statements.append(f"schema.propertyKey('{key}').asText().ifNotExist().create()")
+ return schema_properties_statements
+
+
+def generate_schema_nodes(data):
+ schema_nodes_statements = []
+ for item in data:
+ label = item['label']
+ primaryKey = item['primaryKey']
+ properties = item['properties']
+
+ schema_statement = f"schema.vertexLabel('{label}').properties("
+ schema_statement += ', '.join(f"'{prop}'" for prop in
properties.keys())
+ schema_statement += f").nullableKeys("
+ schema_statement += ', '.join(f"'{prop}'" for prop in
properties.keys() if prop != primaryKey)
+ schema_statement +=
f").usePrimaryKeyId().primaryKeys('{primaryKey}').ifNotExist().create()"
+ schema_nodes_statements.append(schema_statement)
+ return schema_nodes_statements
+
+
+def generate_schema_relationships(data):
+ schema_relstionships_statements = []
+ for item in data:
+ start = item['start']
+ end = item['end']
+ type = item['type']
+ properties = item['properties']
+ schema_statement =
f"schema.edgeLabel('{type}').sourceLabel('{start}').targetLabel('{end}').properties("
+ schema_statement += ', '.join(f"'{prop}'" for prop in
properties.keys())
+ schema_statement += f").nullableKeys("
+ schema_statement += ', '.join(f"'{prop}'" for prop in
properties.keys())
+ schema_statement += f").ifNotExist().create()"
+ schema_relstionships_statements.append(schema_statement)
+ return schema_relstionships_statements
+
+
+def generate_nodes(data):
+ nodes = []
+ for item in data:
+ label = item['label']
+ properties = item['properties']
+ nodes.append(f"g.addVertex('{label}', {properties})")
+ return nodes
+
+
+def generate_relationships(data):
+ relationships = []
+ for item in data:
+ start = item['start']
+ end = item['end']
+ type = item['type']
+ properties = item['properties']
+ relationships.append(f"g.addEdge('{type}', '{start}', '{end}',
{properties})")
+ return relationships
+
+
+class DataToKg():
+ def __init__(self):
+ self.client = PyHugeGraph("127.0.0.1", "8080", user="admin",
pwd="admin", graph="hugegraph")
+ self.schema = self.client.schema()
+
+ def run(self, data: dict):
+ os.environ.pop("http_proxy")
+ os.environ.pop("https_proxy")
+ nodes = data["nodes"]
+ relationships = data["relationships"]
+ nodes_schemas = data["nodesschemas"]
+ relationships_schemas = data["relationshipsschemas"]
+ schema = self.schema
+ # properties schema
+ schema_nodes_properties = generate_schema_properties(nodes_schemas)
+ schema_relationships_properties =
generate_schema_properties(relationships_schemas)
+ for schema_nodes_property in schema_nodes_properties:
+ print(schema_nodes_property)
+ exec(schema_nodes_property)
+
+ for schema_relationships_property in schema_relationships_properties:
+ print(schema_relationships_property)
+ exec(schema_relationships_property)
+
+ # nodes schema
+ schema_nodes = generate_schema_nodes(nodes_schemas)
+ for schema_node in schema_nodes:
+ print(schema)
+ exec(schema_node)
+
+ # relationships schema
+ schema_relationships =
generate_schema_relationships(relationships_schemas)
+ for schema_relationship in schema_relationships:
+ print(schema_relationship)
+ exec(schema_relationship)
+
+ g = self.client.graph()
+ # nodes
+ nodes = generate_nodes(nodes)
+ for node in nodes:
+ print(node)
+ exec(node)
+
+ # relationships
+ new_relationships = generate_new_relationships(nodes_schemas,
relationships)
+ relationships_schemas = generate_relationships(new_relationships)
+ for relationship in relationships_schemas:
+ print(relationship)
+ exec(relationship)
+
+
+#
Review Comment:
unused?
##########
hugegraph-llm/api/src/text2kg/text_to_data.py:
##########
@@ -0,0 +1,132 @@
+import re
+import os
+from typing import List, Dict, Any
+
+
+from api.src.llm.basellm import BaseLLM
+from text2kg.unstructured_data_utils import nodesTextToListOfDict,
relationshipTextToListOfDict, \
+ nodesschemasTextToListOfDict, relationshipschemaTextToListOfDict
+
+
+def generate_system_message() -> str:
+ return """
+You are a data scientist working for a company that is building a graph
database. Your task is to extract information from data and convert it into a
graph database.
+Provide a set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of
relationships in the form [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES]
and a set of NodesSchemas in the form [ENTITY_TYPE, PRIMARYKEY, PROPERTIES] and
a set of RelationshipsSchemas in the form [ENTITY_TYPE_1, RELATIONSHIP,
ENTITY_TYPE_2, PROPERTIES]
+It is important that the ENTITY_ID_1 and ENTITY_ID_2 exists as nodes with a
matching ENTITY_ID. If you can't pair a relationship with a pair of nodes don't
add it.
+When you find a node or relationship you want to add try to create a generic
TYPE for it that describes the entity you can also think of it as a label.
+
+Example:
+Data: Alice lawyer and is 25 years old and Bob is her roommate since 2001. Bob
works as a journalist. Alice owns a the webpage www.alice.com and Bob owns the
webpage www.bob.com.
+Nodes: ["Alice", "Person", {"age": 25, "occupation": "lawyer", "name":
"Alice"}], ["Bob", "Person", {"occupation": "journalist", "name": "Bob"}],
["alice.com", "Webpage", {"name": "alice.com", "url": "www.alice.com"}],
["bob.com", "Webpage", {"name": "bob.com", "url": "www.bob.com"}]
+Relationships: [{"Person": "Alice"}, "roommate", {"Person": "Bob"}, {"start":
2021}], [{"Person": "Alice"}, "owns", {"Webpage": "alice.com"}, {}],
[{"Person": "Bob"}, "owns", {"Webpage": "bob.com"}, {}]
+NodesSchemas: ["Person", "name", {"age": "int", "name": "text", "occupation":
"text"}], ["Webpage", "name", {"name": "text", "url": "text"}]
+RelationshipsSchemas :["Person", "roommate", "Person", {"start": "int"}],
["Person", "owns", "Webpage", {}]
+"""
+
+
+
+def generate_prompt(data) -> str:
+ return f"""
+Data: {data}"""
+
+
+
+def splitString(string, max_length) -> List[str]:
+ return [string[i : i + max_length] for i in range(0, len(string),
max_length)]
+
+
+def splitStringToFitTokenSpace(
+ llm: BaseLLM, string: str, token_use_per_string: int
+) -> List[str]:
+ allowed_tokens = llm.max_allowed_token_length() - token_use_per_string
+ chunked_data = splitString(string, 500)
+ combined_chunks = []
+ current_chunk = ""
+ for chunk in chunked_data:
+ if (
+ llm.num_tokens_from_string(current_chunk)
+ + llm.num_tokens_from_string(chunk)
+ < allowed_tokens
+ ):
+ current_chunk += chunk
+ else:
+ combined_chunks.append(current_chunk)
+ current_chunk = chunk
+ combined_chunks.append(current_chunk)
+
+ return combined_chunks
+
+
+def getNodesAndRelationshipsFromResult(result):
+ regex =
"Nodes:\s+(.*?)\s?\s?Relationships:\s+(.*?)\s?\s?NodesSchemas:\s+(.*?)\s?\s?\s?RelationshipsSchemas:\s?\s?(.*)"
+ internalRegex = "\[(.*?)\]"
+ nodes = []
+ relationships = []
+ nodesSchema = []
+ relationshipsSchemas = []
+ for row in result:
+ parsing = re.match(regex, row, flags=re.S)
+ if parsing == None:
+ continue
+ rawNodes = str(parsing.group(1))
+ rawRelationships = parsing.group(2)
+ rawNodesSchemas = parsing.group(3)
+ rawRelationshipsSchemas = parsing.group(4)
+ nodes.extend(re.findall(internalRegex, rawNodes))
+ relationships.extend(re.findall(internalRegex, rawRelationships))
+ nodesSchema.extend(re.findall(internalRegex, rawNodesSchemas))
+ relationshipsSchemas.extend(re.findall(internalRegex,
rawRelationshipsSchemas))
+
+
+ result = dict()
+ result["nodes"] = []
+ result["relationships"] = []
+ result["nodesschemas"] = []
+ result["relationshipsschemas"] = []
+ result["nodes"].extend(nodesTextToListOfDict(nodes))
+ result["relationships"].extend(relationshipTextToListOfDict(relationships))
+ result["nodesschemas"].extend(nodesschemasTextToListOfDict(nodesSchema))
+
result["relationshipsschemas"].extend(relationshipschemaTextToListOfDict(relationshipsSchemas))
+ print(result["nodes"])
+ print(result["relationships"])
+ print(result["nodesschemas"])
+ print(result["relationshipsschemas"])
+ return result
+
+
+class TextToData():
+ llm: BaseLLM
+
+ def __init__(self, llm: BaseLLM) -> None:
+ self.llm = llm
+
+
+ def process(self, chunk):
+ messages = [
+ {"role": "system", "content": generate_system_message()},
+ {"role": "user", "content": generate_prompt(chunk)},
+ ]
+ output = self.llm.generate(messages)
+ return output
+
+ def run(self, data:str) -> dict():
+ system_message = generate_system_message()
+ prompt_string = generate_prompt("")
+ token_usage_per_prompt = self.llm.num_tokens_from_string(
+ system_message + prompt_string
+ )
+ chunked_data = splitStringToFitTokenSpace(
+ llm=self.llm, string=data,
token_use_per_string=token_usage_per_prompt
+ )
+
+ results = []
+ for chunk in chunked_data:
+ proceededChunk = self.process(chunk)
+ results.append(proceededChunk)
+ print("111111111")
+ print("text2data-result: ")
+
+ return getNodesAndRelationshipsFromResult(results)
+
+
+
Review Comment:
ditto
##########
hugegraph-llm/api/src/text2kg/unstructured_data_utils.py:
##########
@@ -0,0 +1,108 @@
+import json
+import re
+
+regex =
"Nodes:\s+(.*?)\s?\s?Relationships:\s?\s?NodesSchemas:\s+(.*?)\s?\s?RelationshipsSchemas:\s?\s?(.*)"
+internalRegex = "\[(.*?)\]"
+jsonRegex = "\{.*\}"
+jsonRegex_relationships = "\{.*?\}"
+
+
+def nodesTextToListOfDict(nodes):
Review Comment:
can we keep `nodes_text...` style?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]