This is an automated email from the ASF dual-hosted git repository. jin pushed a commit to branch fix-template in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
commit d21a7d56b44c89ceef1dccfdf279287d3c9d6f90 Author: imbajin <[email protected]> AuthorDate: Mon Jul 29 17:28:50 2024 +0800 fix: use a proper template for default case TODO: we need refactor it to keep consistent with Schema-API --- .../src/hugegraph_llm/demo/rag_web_demo.py | 20 ++--- .../operators/llm_op/disambiguate_data.py | 2 +- .../hugegraph_llm/operators/llm_op/info_extract.py | 2 +- .../operators/llm_op/property_graph_extract.py | 88 ++++++++-------------- 4 files changed, 43 insertions(+), 69 deletions(-) diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py index 02bb2af..d6107d7 100644 --- a/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py +++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_web_demo.py @@ -338,20 +338,22 @@ if __name__ == "__main__": """ ) + # TODO: we need refactor the schema to a common way (keep same format with Graph REST-API) SCHEMA = """{ "vertices": [ { + "id":1, "vertex_label": "person", - "properties": [ - "name", - "age", - "occupation"] + "id_strategy":"PRIMARY_KEY", + "primary_keys":["name"], + "properties": ["name","age","occupation"] }, { + "id":2, "vertex_label": "webpage", - "properties": [ - "name", - "url"] + "id_strategy":"PRIMARY_KEY", + "primary_keys":["name"], + "properties": ["name","url"] } ], "edges": [ @@ -359,13 +361,13 @@ if __name__ == "__main__": "edge_label": "roommate", "source_vertex_label": "person", "target_vertex_label": "person", - "properties": {} + "properties": ["date"] }, { "edge_label": "link", "source_vertex_label": "webpage", "target_vertex_label": "person", - "properties": {} + "properties": [] } ] }""" diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py index a0511d4..e34b637 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py +++ 
b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py @@ -52,5 +52,5 @@ class DisambiguateData: llm_output = self.llm.generate(prompt=prompt) data["triples"] = [] extract_triples_by_regex(llm_output, data) - print(f"LLM input:{prompt} \n output: {llm_output} \n data: {data}") + print(f"LLM {self.__class__.__name__} input:{prompt} \n output: {llm_output} \n data: {data}") return data diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/info_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/info_extract.py index d001f5e..6f0e3af 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/info_extract.py +++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/info_extract.py @@ -152,7 +152,7 @@ class InfoExtract: for sentence in chunks: proceeded_chunk = self.extract_triples_by_llm(schema, sentence) - log.debug("[LLM] input: %s \n output:%s", sentence, proceeded_chunk) + log.debug("[LLM] %s input: %s \n output:%s", self.__class__.__name__, sentence, proceeded_chunk) if schema: extract_triples_by_regex_with_schema(schema, proceeded_chunk, context) else: diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py index be44c7a..293f3be 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py +++ b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/property_graph_extract.py @@ -24,73 +24,45 @@ from hugegraph_llm.models.llms.base import BaseLLM from hugegraph_llm.document.chunk_split import ChunkSplitter from hugegraph_llm.utils.log import log -SCHEMA_EXAMPLE_PROMPT = """Main Task -Given the following graph schema and a piece of text, your task is to analyze the text and extract information that fits into the schema’s structure, formatting the information into vertices and edges as specified. 
+SCHEMA_EXAMPLE_PROMPT = """## Main Task +Given the following graph schema and a piece of text, your task is to analyze the text and extract information that fits into the schema's structure, formatting the information into vertices and edges as specified. -Basic Rules -Schema Format +## Basic Rules +### Schema Format Graph Schema: +- Vertices: [List of vertex labels and their properties] +- Edges: [List of edge labels, their source and target vertex labels, and properties] -Vertices: [List of vertex labels and their properties] -Edges: [List of edge labels, their source and target vertex labels, and properties] -Content Rule +### Content Rule Please read the provided text carefully and identify any information that corresponds to the vertices and edges defined in the schema. For each piece of information that matches a vertex or edge, format it according to the following JSON structures: - -Vertex Format: -{“label”:“vertexLabel”,“type”:“vertex”,“properties”:{“propertyName”:“propertyValue”,…}} - -Edge Format: -{“label”:“edgeLabel”,“type”:“edge”,“outV”:“sourceVertexId”,“outVLabel”:“sourceVertexLabel”,“inV”:“targetVertexId”,“inVLabel”:“targetVertexLabel”,“properties”:{“propertyName”:“propertyValue”,…}} - -Also follow the rules: - -Don’t extract attribute/property fields that do not exist in the given schema -Ensure the extract property is in the same type as the schema (like ‘age’ should be a number) -Translate the given schema filed into Chinese if the given text is Chinese but the schema is in English (Optional) -Your output should be a list of such JSON objects, each representing either a vertex or an edge, extracted and formatted based on the text and the provided schema. 
-PrimaryKey ID Generate Rule - -vertexLabel的id生成策略为:id:primaryKey1!primaryKey2 - -Example -Input example: -text -道路交通事故认定书 -鱼公交认字[2013]第00478号 -天气:小雨 -交通事故时间:2013车11月24日18时09分 -交通事故地点:251省避清河菜市场路口 -当事人、车辆、道路和交通环境等基本情况: -1、当事人基本情况: -张小虎,男,1972年1月3日出生,山东省鱼台县清河镇清河村62号,系驾驶鲁H72886号小型轿车,驾驶证号:370827197201032316,档案号:370800767691,准驾车型:C1E,电话:15606376419. -于海洋,男,1952年3月12日出生,山东省鱼台县号清河镇于屯村77号、身份证:370827195203122316,步行,电话:15092699426。 -2、车辆情况: -鲁H7Z886小型轿车,入户车主:谢彪。有交通事故责任强制保险。保险单号:PDZA20133708T000075766,保险公司:中国人民产保险股份有限公司济宁市分公司。 -3、道路和咬通环境等基本情况: -事故现场位于251省道鱼台县清河镇菜市场路口,251省道呈南北走向,道路平坦,沥青路面,视线一般,有交通标志、标线,有中心隔离带,两侧为商业店铺 -道路交通事故发生经过: -2013年日月24日18时09分,张小虎驾驶鲁H72886号小型斩车,沿251省道自北向南行业至鱼台县清河镇菜市场路口处时与自西向东步行过公路的于海洋相撞,致于海洋受伤入院,经鱼台县人民医院抢教无效,于洋于2013车11月27日死亡,车辆损坏造成道路交通事故。张小虎肇事后驾车逃逸。 -道略交通事故证据及事故形成原因分折: -根据现场勘查、当事人陈述证实:张小虎因观察不够,措施不当违反《中华人民共和国道路交通安全法》第三十八条“车辆、行人应当按照交通信号通行:遇有交通警察现场指挥时,应当按照交道警察的指挥通行:在没有交通信号的道路上,应当在确保安全、畅通的原则下通行。”之规定,因酒后驾车,违反《中华人民共和国道路交通安全法》第二十二条第二款“饮酒,服用国家管制的精神药品或者醉药品,或者患有妨碍安全驾驶杭动车的疾病,或者过度劳影响安全驾驶的,不得买驶机动车,”是事故发生的原因,且肇事后驾车逸逸。 -当事人导致交通事故的过错及责任或者意外原因: -根据《中华人民共和国道路交通全法实施条例》第九十二条第一款和《道路交通事故处理程序规定》第四十六条的规定,认定当事人张小虎担地次事教的全部贡任。当事人于海洋无责任。 -交通警察: -刘爱军HZ402 -二0一四年一月二日 - - -graph schema -{"vertexLabels":[{"id":3,"name":"法条","id_strategy":"PRIMARY_KEY","primary_keys":["法典名","法条索引"],"nullable_keys":["法章名","法条内容"],"properties":["法典名","法章名","法条索引","法条内容"]},{"id":7,"name":"事故","id_strategy":"PRIMARY_KEY","primary_keys":["事故认定书编号","事故认定书单位"],"nullable_keys":[],"properties":["事故发生时间","事故认定书编号","事故认定书单位"]},{"id":11,"name":"发生地点","id_strategy":"PRIMARY_KEY","primary_keys":["城市","所属路段"],"nullable_keys":["走向","材质","路面情况","道路状况"],"properties":["城市","走向","材质","路面情况","道路状况","所属路段"]},{ [...] 
- -Output example: -[{"label":"事故","type":"vertex","properties":{"事故发生时间":"2013-11-24 18:09:00.000","事故认定书编号":"鱼公交认字[2013]第00478号","事故认定书单位":"道路交通事故认定书"}},{"label":"发生地点","type":"vertex","properties":{"城市":"山东省鱼台县","所属路段":"251省道清河菜市场路口","走向":"南北","材质":"沥青","路面情况":"平坦","道路状况":"视线一般"}},{"label":"当事人","type":"vertex","properties":{"身份证号":"370827197201032316","姓名":"张小虎","性别":"男","年龄":"1972-01-03","驾照":"C1E"}},{"label":"当事人","type":"vertex","properties":{"身份证号":"370827195203122316","姓名":"于海洋","性别":"男","年龄":"1952 [...] +#### Vertex Format: +{"id":"vertexLabel-entityName","label":"vertexLabel","type":"vertex","properties":{"propertyName":"propertyValue",...}} + +#### Edge Format: +{"label":"edgeLabel","type":"edge","outV":"sourceVertexId","outVLabel":"sourceVertexLabel","inV":"targetVertexId","inVLabel":"targetVertexLabel","properties":{"propertyName":"propertyValue",...}} + +Also follow the rules: +1. Don't extract attribute/property fields that do not exist in the given schema +2. Ensure the extracted property is in the same type as the schema (like 'age' should be a number) +3. Translate the given schema field into Chinese if the given text is Chinese but the schema is in English (Optional) +4. Your output should be a list of such JSON objects, each representing either a vertex or an edge, extracted and formatted based on the text and the provided schema. + +## Example +### Input example: +#### text +Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, in his professional life, works as a journalist. 
+#### graph schema +{"vertices":[{"vertex_label":"person","properties":["name","age","occupation"]}], "edges":[{"edge_label":"roommate", "source_vertex_label":"person","target_vertex_label":"person","properties":["date"]}]} + +### Output example: +[{"id":"1:Sarah","label":"person","type":"vertex","properties":{"name":"Sarah","age":30,"occupation":"attorney"}},{"id":"1:James","label":"person","type":"vertex","properties":{"name":"James","occupation":"journalist"}},{"label":"roommate","type":"edge","outV":"1:Sarah","outVLabel":"person","inV":"1:James","inVLabel":"person","properties":{"date":"2010"}}] """ def generate_extract_property_graph_prompt(text, schema=None) -> str: return f"""--- -请根据上面的完整指令, 尝试根据下面给定的 schema, 提取下面的文本, 只需要输出 json 结果: +请根据上面的完整指令,尝试根据下面给定的 schema, 提取下面的文本,只需要输出 json 结果: ## Text: {text} ## Graph schema: @@ -123,7 +95,7 @@ class PropertyGraphExtract: items = [] for chunk in chunks: proceeded_chunk = self.extract_property_graph_by_llm(schema, chunk) - log.debug("[LLM] input: %s \n output:%s", chunk, proceeded_chunk) + log.debug("[LLM] %s input: %s \n output:%s", self.__class__.__name__, chunk, proceeded_chunk) items.extend(self._extract_and_filter_label(schema, proceeded_chunk)) items = self.filter_item(schema, items) for item in items:
