This is an automated email from the ASF dual-hosted git repository. jin pushed a commit to branch text2gql in repository https://gitbox.apache.org/repos/asf/incubator-hugegraph-ai.git
commit 958fef3011bdb67ba6a881eb9e35b033b8810e51 Author: Lriver <[email protected]> AuthorDate: Thu Oct 30 19:53:18 2025 +0800 Add Apache-2.0 license, fix review comments --- text2gremlin/AST_Text2Gremlin/README.md | 14 +- .../AST_Text2Gremlin/base/CombinationController.py | 77 ++++-- text2gremlin/AST_Text2Gremlin/base/Config.py | 35 ++- text2gremlin/AST_Text2Gremlin/base/GremlinBase.py | 18 ++ text2gremlin/AST_Text2Gremlin/base/GremlinExpr.py | 18 ++ text2gremlin/AST_Text2Gremlin/base/GremlinParse.py | 18 ++ .../AST_Text2Gremlin/base/GremlinTransVisitor.py | 18 ++ text2gremlin/AST_Text2Gremlin/base/Schema.py | 58 ++--- .../AST_Text2Gremlin/base/TraversalGenerator.py | 184 +++++++------- text2gremlin/AST_Text2Gremlin/base/__init__.py | 29 ++- text2gremlin/AST_Text2Gremlin/base/generator.py | 57 +++-- .../AST_Text2Gremlin/base/gremlin/GremlinLexer.py | 20 +- .../base/gremlin/GremlinListener.py | 20 +- .../AST_Text2Gremlin/base/gremlin/GremlinParser.py | 20 +- .../base/gremlin/GremlinVisitor.py | 20 +- .../AST_Text2Gremlin/base/gremlin/__init__.py | 16 ++ .../base/gremlin/antlr-4.13.1-complete.jar | Bin 2139203 -> 0 bytes text2gremlin/AST_Text2Gremlin/config.json | 4 +- text2gremlin/AST_Text2Gremlin/generate_corpus.py | 209 ++++++++++++++++ text2gremlin/AST_Text2Gremlin/output/README.md | 74 ------ .../output/SYNTAX_ANALYSIS_SUMMARY.md | 178 ------------- .../output/SYNTAX_DISTRIBUTION_REPORT.md | 277 --------------------- .../output/syntax_distribution_stats.json | 95 ------- text2gremlin/AST_Text2Gremlin/requirements.txt | 5 +- 24 files changed, 653 insertions(+), 811 deletions(-) diff --git a/text2gremlin/AST_Text2Gremlin/README.md b/text2gremlin/AST_Text2Gremlin/README.md index 9542906b..7df02684 100644 --- a/text2gremlin/AST_Text2Gremlin/README.md +++ b/text2gremlin/AST_Text2Gremlin/README.md @@ -5,7 +5,7 @@ ## 快速开始 环境配置:python:3.12.10 ```bash -pip install requirements.txt +pip install -r requirements.txt ``` ```bash @@ -31,7 +31,7 @@ python show_syntax_stats.py 
## 项目结构 -``` +```text ├── generate_corpus.py # 主程序 ├── gremlin_templates.csv # 模板文件 ├── config.json # 配置 @@ -80,11 +80,7 @@ print(f"生成了 {result['total_unique_queries']} 个查询") ### 3. 添加模板 -```bash -python add_template.py -``` - -或直接编辑 `gremlin_templates.csv` +直接编辑 `gremlin_templates.csv`即可 --- @@ -155,7 +151,7 @@ python visualize_syntax_distribution.py ### 1. 模板泛化 从一个模板生成多个变体: -``` +```text 模板: g.V().hasLabel('person').out('acted_in') 泛化: @@ -177,7 +173,7 @@ python visualize_syntax_distribution.py ### 4. 中文翻译 自动生成流畅的中文描述: -``` +```text g.V().hasLabel('person').out('acted_in').has('title', 'Inception') ↓ 从图中开始查找所有顶点,过滤出'人'类型的顶点,沿'参演'边out方向遍历,其'标题'为'Inception' diff --git a/text2gremlin/AST_Text2Gremlin/base/CombinationController.py b/text2gremlin/AST_Text2Gremlin/base/CombinationController.py index 7923db01..445f8f38 100644 --- a/text2gremlin/AST_Text2Gremlin/base/CombinationController.py +++ b/text2gremlin/AST_Text2Gremlin/base/CombinationController.py @@ -1,3 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + """ 组合爆炸控制器 @@ -20,21 +38,41 @@ class CombinationController: """ self.config = config - # 链长度分类阈值 - self.chain_thresholds = config['chain_thresholds'] - - # 随机增强控制 - self.random_enhancement = config['random_enhancement'] - - # 数据填充策略 - self.value_fill = config['value_fill_strategy'] - - # 属性泛化策略 - self.property_gen = config['property_generalization'] + # 验证必要配置项并加载 + try: + # 链长度分类阈值 + self.chain_thresholds = config['chain_thresholds'] + + # 随机增强控制 + self.random_enhancement = config['random_enhancement'] + + # 数据填充策略 + self.value_fill = config['value_fill_strategy'] + + # 属性泛化策略 + self.property_gen = config['property_generalization'] + except KeyError as e: + raise ValueError(f"缺少必要配置项: {e}") from None - # 总数限制 + # 总数限制(可选) self.max_total = config.get('max_total_combinations', {}) + # 验证关键类别的存在性 + # chain_thresholds 只需要 short, medium, long(ultra 通过 else 分支隐式定义) + for category in ('short', 'medium', 'long'): + if category not in self.chain_thresholds: + raise ValueError(f"chain_thresholds 缺少 '{category}' 配置") + + # property_generalization 需要所有4个类别(包括 ultra) + for category in ('short', 'medium', 'long', 'ultra'): + if category not in self.property_gen: + raise ValueError(f"property_generalization 缺少 '{category}' 配置") + # 验证每个类别的必要字段 + required_fields = ['full_coverage_threshold', 'additional_random_min', 'additional_random_max'] + for field in required_fields: + if field not in self.property_gen[category]: + raise ValueError(f"property_generalization.{category} 缺少 '{field}' 字段") + def get_chain_category(self, step_count: int) -> str: """ 根据步骤数确定链长度类别 @@ -132,7 +170,7 @@ class CombinationController: # 2. 判断是否全部遍历 if len(all_options) <= strategy['full_coverage_threshold']: # 同级选项少,全部遍历 - return all_options + return list(all_options) # 3. 同级选项多,随机选择额外的 additional_count = random.randint( @@ -178,9 +216,11 @@ class CombinationController: max_combinations = schema_config.get(chain_category, {}).get('max_combinations', 1) combinations = [] + seen = set() # 用于去重的集合 # 1. 
保留原配方组合 combinations.append(recipe_params.copy()) + seen.add(tuple(sorted(recipe_params))) if max_combinations <= 1: return combinations @@ -210,24 +250,25 @@ class CombinationController: # 随机选择同数量的参数 combo = random.sample(other_options, param_count) - # 避免重复组合 - if combo not in combinations: + # 使用排序后的元组作为key进行去重(因为参数顺序不影响语义) + key = tuple(sorted(combo)) + if key not in seen: + seen.add(key) combinations.append(combo) attempts += 1 return combinations - def get_multi_param_value_fill_count(self, param_count: int, is_terminal: bool) -> int: + def get_multi_param_value_fill_count(self, is_terminal: bool) -> int: """ 多参数数据值填充次数控制 Args: - param_count: 参数个数 is_terminal: 是否是终端步骤 Returns: - 填充次数(每次填充param_count个值) + 填充次数(每次填充的值个数由调用方根据参数个数决定) """ multi_config = self.config.get('multi_param_strategy', {}) value_config = multi_config.get('value_fill', {}) diff --git a/text2gremlin/AST_Text2Gremlin/base/Config.py b/text2gremlin/AST_Text2Gremlin/base/Config.py index 21e93eb5..dd67d42c 100644 --- a/text2gremlin/AST_Text2Gremlin/base/Config.py +++ b/text2gremlin/AST_Text2Gremlin/base/Config.py @@ -1,5 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + """ -配置管理模块。 +项目配置管理模块。 负责加载和管理项目配置文件,提供各模块所需的配置参数。 """ @@ -19,8 +37,13 @@ class Config: self.db_id = self.config_data.get("db_id") def load_config(self): - with open(self.file_path, "r") as file: - return json.load(file) + try: + with open(self.file_path, "r", encoding="utf-8") as file: + return json.load(file) + except FileNotFoundError: + raise FileNotFoundError(f"配置文件不存在: {self.file_path}") + except json.JSONDecodeError as e: + raise ValueError(f"配置文件 JSON 格式错误: {self.file_path}, 错误: {e}") def get_input_query_path(self): return self.config_data.get("input_query_path") @@ -59,7 +82,11 @@ class Config: def get_schema_path(self, db_id): schema_dict = self.config_data.get("db_schema_path") - return schema_dict[db_id] # todo error check + if not schema_dict: + raise ValueError("配置中缺少 'db_schema_path' 字段") + if db_id not in schema_dict: + raise KeyError(f"未找到 db_id '{db_id}' 对应的 schema 路径") + return schema_dict[db_id] def get_config(self, module_name): return self.config_data.get(module_name) diff --git a/text2gremlin/AST_Text2Gremlin/base/GremlinBase.py b/text2gremlin/AST_Text2Gremlin/base/GremlinBase.py index 998dce9d..40fa6025 100644 --- a/text2gremlin/AST_Text2Gremlin/base/GremlinBase.py +++ b/text2gremlin/AST_Text2Gremlin/base/GremlinBase.py @@ -1,3 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + + """ Gremlin翻译引擎模块。 diff --git a/text2gremlin/AST_Text2Gremlin/base/GremlinExpr.py b/text2gremlin/AST_Text2Gremlin/base/GremlinExpr.py index a2420cee..1b9e4a20 100644 --- a/text2gremlin/AST_Text2Gremlin/base/GremlinExpr.py +++ b/text2gremlin/AST_Text2Gremlin/base/GremlinExpr.py @@ -1,3 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + """ Gremlin复杂表达式定义模块。 diff --git a/text2gremlin/AST_Text2Gremlin/base/GremlinParse.py b/text2gremlin/AST_Text2Gremlin/base/GremlinParse.py index 07a99e42..feef3c58 100644 --- a/text2gremlin/AST_Text2Gremlin/base/GremlinParse.py +++ b/text2gremlin/AST_Text2Gremlin/base/GremlinParse.py @@ -1,3 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + """ Gremlin查询结构化表示模块。 diff --git a/text2gremlin/AST_Text2Gremlin/base/GremlinTransVisitor.py b/text2gremlin/AST_Text2Gremlin/base/GremlinTransVisitor.py index 502b0862..18308f72 100644 --- a/text2gremlin/AST_Text2Gremlin/base/GremlinTransVisitor.py +++ b/text2gremlin/AST_Text2Gremlin/base/GremlinTransVisitor.py @@ -1,3 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + """ Gremlin查询AST访问器模块。 diff --git a/text2gremlin/AST_Text2Gremlin/base/Schema.py b/text2gremlin/AST_Text2Gremlin/base/Schema.py index 80a74d2c..ad675fd1 100644 --- a/text2gremlin/AST_Text2Gremlin/base/Schema.py +++ b/text2gremlin/AST_Text2Gremlin/base/Schema.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """ 图数据库Schema管理模块。 @@ -7,9 +24,6 @@ import os import json -import pandas as pd -from typing import List, Dict -import json import random import pandas as pd from typing import List, Dict, Any, Tuple @@ -135,8 +149,14 @@ class Schema: def get_step_result_label(self, start_label: str, step: Dict) -> Tuple[str, str]: step_name, step_param = step.get('step'), step.get('param') - if step_name == 'out': return self.edges[step_param]['destination'], 'vertex' - if step_name == 'in': return self.edges[step_param]['source'], 'vertex' + if step_name == 'out': + if step_param not in self.edges: + raise KeyError(f"边标签 '{step_param}' 不存在于 schema 中") + return self.edges[step_param]['destination'], 'vertex' + if step_name == 'in': + if step_param not in self.edges: + raise KeyError(f"边标签 '{step_param}' 不存在于 schema 中") + return self.edges[step_param]['source'], 'vertex' if step_name in ['properties', 'has', 'values']: return start_label, 'vertex' return None, None @@ -171,7 +191,6 @@ class Schema: Returns: 实例列表 """ - import random is_edge = label in self.edges data_cache = self.edge_data if is_edge else self.vertex_data @@ -194,30 +213,3 @@ class Schema: # 随机采样 sampled_df = df.sample(actual_count) return sampled_df.to_dict('records') - -# --- 单模块测试入口 --- -if __name__ == "__main__": - base_dir = 
os.path.dirname(os.path.abspath(__file__)) - project_root = os.path.dirname(base_dir) - schema_path = os.path.join(project_root, 'db_data', 'schema', 'movie_schema.json') - # 【修正】将 data_path 指向包含 movie/raw_data 的上级目录 - data_path = os.path.join(project_root, 'db_data') - - if not os.path.exists(schema_path) or not os.path.exists(data_path): - print("错误: 找不到 Schema 或数据文件,请检查路径。") - else: - schema = Schema(schema_path, data_path) - print("\n--- Schema 初始化成功 (已修复CSV读取逻辑) ---") - - print("\n--- 测试数据实例获取 ---") - random_person = schema.get_instance('person') - print(f"随机获取一个 'person' 实例: {random_person}") - - random_user = schema.get_instance('user') - print(f"随机获取一个 'user' 实例: {random_user}") - - # 验证 name 属性是否能被正确读取 - if random_person and 'name' in random_person: - print(f"成功读取到 'person' 的 name: {random_person['name']}") - else: - print("错误: 未能从 'person' 实例中读取到 name 属性。") \ No newline at end of file diff --git a/text2gremlin/AST_Text2Gremlin/base/TraversalGenerator.py b/text2gremlin/AST_Text2Gremlin/base/TraversalGenerator.py index 7a6b7177..60593e68 100644 --- a/text2gremlin/AST_Text2Gremlin/base/TraversalGenerator.py +++ b/text2gremlin/AST_Text2Gremlin/base/TraversalGenerator.py @@ -1,8 +1,20 @@ -""" -Gremlin查询生成器核心引擎。 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. -基于递归回溯算法与数据控制策略,从结构化配方生成大量多样化的Gremlin查询及其中文描述。 -""" import os import random @@ -20,37 +32,21 @@ from .CombinationController import CombinationController class TraversalGenerator: """Gremlin查询生成器 - 分层泛化架构""" - - # ==================== 步骤分类配置 ==================== - # - # 如何添加新步骤: - # 1. 确定步骤类别(A-J) - # 2. 在对应的字典中添加步骤定义 - # 3. 如果需要特殊逻辑,在对应的处理器方法中添加实现 - # 4. 运行测试验证 - # - # 示例:添加新的简单步骤 'explain' - # 1. 在 SIMPLE_STEPS 中添加: 'explain': ('解释查询', 'string') - # 2. 无需修改 _handle_simple_step 方法(自动处理) - # 3. 测试: g.V().explain() - # =========================================================== - # A. 简单步骤(无参数,直接生成) - # 翻译由GremlinBase提供 SIMPLE_STEPS = { 'count': {'output_type': 'number'}, 'id': {'output_type': 'value'}, 'label': {'output_type': 'string'}, 'fold': {'output_type': 'list'}, - 'unfold': {'output_type': None}, # None表示保持当前类型 + 'unfold': {'output_type': None}, 'drop': {'output_type': 'none'}, 'iterate': {'output_type': 'none'}, - 'explain': {'output_type': 'string'}, # 返回执行计划 - 'profile': {'output_type': 'map'}, # 返回性能分析 - 'loops': {'output_type': 'number'}, # 返回循环次数 - 'value': {'output_type': 'value'}, # 获取属性值(用于属性流) - 'identity': {'output_type': None}, # 恒等映射,保持当前类型 - 'barrier': {'output_type': None} # 屏障,等待所有遍历者 + 'explain': {'output_type': 'string'}, + 'profile': {'output_type': 'map'}, + 'loops': {'output_type': 'number'}, + 'value': {'output_type': 'value'}, + 'identity': {'output_type': None}, + 'barrier': {'output_type': None} } # B. 属性访问步骤(需要Schema + 泛化) @@ -62,7 +58,7 @@ class TraversalGenerator: 'key': {'output_type': 'string', 'supports_params': False} } - # C. 数值参数步骤(需要生成合理的数值) + # C. 数值参数步骤 NUMERIC_PARAM_STEPS = { 'limit': {'range': (1, 100)}, 'skip': {'range': (0, 50)}, @@ -142,13 +138,13 @@ class TraversalGenerator: } - # M. 边修改步骤 + # I. 
边修改步骤 EDGE_MODIFICATION_STEPS = { 'from': {'output_type': None, 'needs_label_or_traversal': True}, 'to': {'output_type': None, 'needs_label_or_traversal': True} } - # I. 谓词(用于has等步骤) + # J. 谓词(用于has等步骤) # 谓词由 Visitor 解析为 Predicate/TextPredicate 对象,在 E 层过滤步骤中处理 PREDICATES = { # 数值谓词 @@ -177,7 +173,29 @@ class TraversalGenerator: 'not': {'types': ['any']} } - # J. 特殊步骤(需要单独实现) + + # K. 图算法步骤 + GRAPH_ALGORITHM_STEPS = { + 'pageRank': {'output_type': None}, + 'peerPressure': {'output_type': None}, + 'connectedComponent': {'output_type': None}, + 'shortestPath': {'output_type': None} + } + + # L. 工具步骤 + UTILITY_STEPS = { + 'math': {'output_type': 'number', 'needs_expression': True}, + 'subgraph': {'output_type': None, 'needs_key': True}, + 'timeLimit': {'output_type': None, 'needs_number': True}, + 'inject': {'output_type': None, 'multi_param': True}, # 支持多参数 + 'call': {'output_type': None, 'needs_string': True}, + 'io': {'output_type': None, 'needs_string': True}, + 'mergeE': {'output_type': None}, # 合并边 + 'mergeV': {'output_type': None}, # 合并顶点 + 'with': {'output_type': None, 'multi_param': True} # 配置选项 + } + + # M. 特殊步骤(需要单独实现) SPECIAL_STEPS = { # 起始步骤 'V': {'category': 'start'}, @@ -209,26 +227,6 @@ class TraversalGenerator: 'map': {'category': 'higher_order'}, 'local': {'category': 'higher_order'} # 本地作用域遍历 } - # K. 图算法步骤 - GRAPH_ALGORITHM_STEPS = { - 'pageRank': {'output_type': None}, - 'peerPressure': {'output_type': None}, - 'connectedComponent': {'output_type': None}, - 'shortestPath': {'output_type': None} - } - - # L. 
工具步骤 - UTILITY_STEPS = { - 'math': {'output_type': 'number', 'needs_expression': True}, - 'subgraph': {'output_type': None, 'needs_key': True}, - 'timeLimit': {'output_type': None, 'needs_number': True}, - 'inject': {'output_type': None, 'multi_param': True}, # 支持多参数 - 'call': {'output_type': None, 'needs_string': True}, - 'io': {'output_type': None, 'needs_string': True}, - 'mergeE': {'output_type': None}, # 合并边 - 'mergeV': {'output_type': None}, # 合并顶点 - 'with': {'output_type': None, 'multi_param': True} # 配置选项 - } def __init__(self, schema: Schema, recipe: Traversal, gremlin_base: GremlinBase, controller: Optional[CombinationController] = None): @@ -248,12 +246,10 @@ class TraversalGenerator: # 集成组合控制器 if controller is None: - # 尝试加载默认配置 try: - # 尝试多个可能的路径 possible_paths = [ - 'combination_control_config.json', # 当前目录 - os.path.join(os.path.dirname(__file__), 'combination_control_config.json'), # TraversalGenerator.py所在目录 + 'combination_control_config.json', + os.path.join(os.path.dirname(__file__), 'combination_control_config.json'), ] config_loaded = False @@ -281,7 +277,6 @@ class TraversalGenerator: # 配方路径完成标记 self.recipe_path_completed = False - # ==================== 主生成流程 ==================== def generate(self) -> List[Tuple[str, str]]: """ @@ -415,7 +410,7 @@ class TraversalGenerator: option['new_label'], option['new_type'] ) - # ==================== 步骤选项生成(分发器)==================== + #步骤选项生成(分发器) def _get_valid_options_for_step(self, step_recipe: Step, current_label: str, current_type: str, remaining_steps: List[Step] = None) -> List[Dict]: @@ -489,7 +484,7 @@ class TraversalGenerator: print(f"⚠️ 未知步骤: {step_name}") return [] - # ==================== A. 简单步骤处理器 ==================== + # A. 简单步骤处理器 def _handle_simple_step(self, step_name: str, current_label: str, current_type: str) -> List[Dict]: @@ -521,7 +516,7 @@ class TraversalGenerator: 'new_type': new_type }] - # ==================== F. 转换步骤处理器 ==================== + # F. 
转换步骤处理器 def _handle_transform_step(self, step_recipe: Step, current_label: str, current_type: str) -> List[Dict]: @@ -652,7 +647,7 @@ class TraversalGenerator: return options - # ==================== G. 聚合步骤处理器 ==================== + # G. 聚合步骤处理器 def _handle_aggregate_step(self, step_recipe: Step, current_label: str, current_type: str) -> List[Dict]: @@ -696,7 +691,7 @@ class TraversalGenerator: 'new_type': new_type }] - # ==================== G2. 副作用步骤处理器 ==================== + # G2. 副作用步骤处理器 def _handle_side_effect_step(self, step_recipe: Step, current_label: str, current_type: str) -> List[Dict]: @@ -783,7 +778,7 @@ class TraversalGenerator: 'new_type': new_type }] - # ==================== H. 终端步骤处理器 ==================== + # H. 终端步骤处理器 def _handle_terminal_step(self, step_recipe: Step, current_label: str, current_type: str) -> List[Dict]: @@ -830,7 +825,7 @@ class TraversalGenerator: 'new_type': new_type }] - # ==================== C. 数值参数步骤处理器 ==================== + # C. 数值参数步骤处理器 def _handle_numeric_param_step(self, step_name: str, params: List, current_label: str, current_type: str) -> List[Dict]: @@ -870,7 +865,7 @@ class TraversalGenerator: 'new_type': current_type }] - # ==================== K. 图算法步骤处理器 ==================== + # K. 图算法步骤处理器 def _handle_graph_algorithm_step(self, step_name: str, current_label: str, current_type: str) -> List[Dict]: @@ -897,7 +892,7 @@ class TraversalGenerator: 'new_type': current_type }] - # ==================== L. 工具步骤处理器 ==================== + # L. 工具步骤处理器 def _handle_utility_step(self, step_recipe: Step, current_label: str, current_type: str) -> List[Dict]: @@ -995,7 +990,7 @@ class TraversalGenerator: 'new_type': new_type }] - # ==================== M. 边修改步骤处理器 ==================== + # M. 边修改步骤处理器 def _handle_edge_modification_step(self, step_recipe: Step, current_label: str, current_type: str) -> List[Dict]: @@ -1058,7 +1053,7 @@ class TraversalGenerator: 'new_type': current_type }] - # ==================== B. 
属性访问步骤处理器 ==================== + # B. 属性访问步骤处理器 def _handle_property_access_step(self, step_recipe: Step, current_label: str, current_type: str) -> List[Dict]: @@ -1244,7 +1239,7 @@ class TraversalGenerator: return options - # ==================== 嵌套遍历泛化辅助方法 ==================== + # 嵌套遍历泛化辅助方法 def _generate_nested_traversal_variants(self, anonymous_trav, current_depth=0): """ @@ -1450,7 +1445,7 @@ class TraversalGenerator: else: return "..." - # ==================== E. 过滤步骤处理器 ==================== + # E. 过滤步骤处理器 def _handle_filter_step(self, step_recipe: Step, current_label: str, current_type: str, remaining_steps: List[Step]) -> List[Dict]: @@ -1619,18 +1614,27 @@ class TraversalGenerator: # has('name', 'Tom', 'Jerry') - 多个值 if self.controller: fill_times = self.controller.get_multi_param_value_fill_count( - param_count=value_count, is_terminal=is_terminal_step ) else: fill_times = 1 - # 生成多次填充 - for _ in range(fill_times): - if len(all_values) >= value_count: - selected_combo = random.sample(all_values, value_count) - values_str = ", ".join(repr(v) for v in selected_combo) - prop_desc = self.gremlin_base.get_schema_desc(prop_name) + # 调整填充次数:不能超过实际可生成的不同组合数 + if len(all_values) >= value_count: + # 使用集合去重,避免生成重复组合 + generated_combos = set() + attempts = 0 + max_attempts = fill_times * 10 # 避免无限循环 + + while len(generated_combos) < fill_times and attempts < max_attempts: + selected_combo = tuple(sorted(random.sample(all_values, value_count))) + generated_combos.add(selected_combo) + attempts += 1 + + # 生成查询选项 + prop_desc = self.gremlin_base.get_schema_desc(prop_name) + for combo in generated_combos: + values_str = ", ".join(repr(v) for v in combo) options.append({ 'query_part': f".has('{prop_name}', {values_str})", 'desc_part': f",其'{prop_desc}'为{values_str}之一", @@ -1693,17 +1697,27 @@ class TraversalGenerator: # has('name', 'Tom', 'Jerry') - 多个值 if self.controller: fill_times = self.controller.get_multi_param_value_fill_count( - param_count=value_count, 
is_terminal=is_terminal_step ) else: fill_times = 1 - for _ in range(fill_times): - if len(all_values) >= value_count: - selected_combo = random.sample(all_values, value_count) - values_str = ", ".join(repr(v) for v in selected_combo) - prop_desc = self.gremlin_base.get_schema_desc(recipe_prop) + # 调整填充次数:不能超过实际可生成的不同组合数 + if len(all_values) >= value_count: + # 使用集合去重,避免生成重复组合 + generated_combos = set() + attempts = 0 + max_attempts = fill_times * 10 # 避免无限循环 + + while len(generated_combos) < fill_times and attempts < max_attempts: + selected_combo = tuple(sorted(random.sample(all_values, value_count))) + generated_combos.add(selected_combo) + attempts += 1 + + # 生成查询选项 + prop_desc = self.gremlin_base.get_schema_desc(recipe_prop) + for combo in generated_combos: + values_str = ", ".join(repr(v) for v in combo) options.append({ 'query_part': f".has('{recipe_prop}', {values_str})", 'desc_part': f",其'{prop_desc}'为{values_str}之一", @@ -2148,7 +2162,7 @@ class TraversalGenerator: return options - # ==================== D. 导航步骤处理器 ==================== + # D. 导航步骤处理器 def _handle_navigation_step(self, step_recipe: Step, current_label: str, current_type: str) -> List[Dict]: @@ -2333,7 +2347,7 @@ class TraversalGenerator: return options - # ==================== J. 特殊步骤处理器 ==================== + # J. 
特殊步骤处理器 def _handle_special_step(self, step_recipe: Step, current_label: str, current_type: str, remaining_steps: List[Step]) -> List[Dict]: @@ -3165,7 +3179,7 @@ class TraversalGenerator: return results - # ==================== 辅助方法 ==================== + # 辅助方法 def _get_random_value(self, label: str, prop_info: Dict, for_update: bool = False) -> Any: """根据属性类型生成随机值""" diff --git a/text2gremlin/AST_Text2Gremlin/base/__init__.py b/text2gremlin/AST_Text2Gremlin/base/__init__.py index 89ee1a95..9d86256b 100644 --- a/text2gremlin/AST_Text2Gremlin/base/__init__.py +++ b/text2gremlin/AST_Text2Gremlin/base/__init__.py @@ -1,19 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + """ Gremlin 查询生成器包 这个包提供了从模板生成 Gremlin 查询语料库的功能。 - -主要模块: -- generator: 主要的生成器接口 -- Config: 配置管理 -- Schema: 图数据库模式定义 -- TraversalGenerator: 遍历查询生成器 -- GremlinTransVisitor: Gremlin 语法解析器 """ - -__version__ = "1.0.0" - -# 导出主要接口 from .generator import generate_gremlin_corpus __all__ = ['generate_gremlin_corpus'] diff --git a/text2gremlin/AST_Text2Gremlin/base/generator.py b/text2gremlin/AST_Text2Gremlin/base/generator.py index 856d46e0..0c1d78cf 100644 --- a/text2gremlin/AST_Text2Gremlin/base/generator.py +++ b/text2gremlin/AST_Text2Gremlin/base/generator.py @@ -1,15 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + """ -控制层-Gremlin语料库生成器主入口。 +Gremlin语料库生成器主入口脚本。 从Gremlin查询模板生成大量多样化的查询-描述对,用于Text-to-Gremlin任务的训练数据。 """ import os import json +from datetime import datetime from antlr4 import InputStream, CommonTokenStream from antlr4.error.ErrorListener import ErrorListener -# Import all our custom modules from the gremlin_base package from .Config import Config from .Schema import Schema from .GremlinBase import GremlinBase @@ -17,7 +35,6 @@ from .GremlinParse import Traversal from .TraversalGenerator import TraversalGenerator from .GremlinTransVisitor import GremlinTransVisitor -# Import the ANTLR-generated components from .gremlin.GremlinLexer import GremlinLexer from .gremlin.GremlinParser import GremlinParser import random @@ -54,10 +71,12 @@ def check_gremlin_syntax(query_string: str) -> tuple[bool, str]: parser = GremlinParser(token_stream) # 移除默认的控制台错误监听器 + lexer.removeErrorListeners() parser.removeErrorListeners() # 添加自定义的监听器 error_listener = SyntaxErrorListener() + lexer.addErrorListener(error_listener) parser.addErrorListener(error_listener) # 尝试解析查询 @@ -135,20 +154,21 @@ def generate_corpus_from_template( for query, description in corpus: try: - # 首先进行语法检查 + # 先判重,避免对重复项做语法检查 + if query in global_corpus_dict: + duplicate_count += 1 + continue + + # 再进行语法检查 is_valid, error_msg = check_gremlin_syntax(query) if not is_valid: syntax_error_count += 1 continue - - if query not in global_corpus_dict: - # 新的查询且语法正确,添加到全局字典 - global_corpus_dict[query] = description - new_pairs_count += 1 - else: - # 重复的查询,跳过 - duplicate_count += 1 + + # 新的查询且语法正确,添加到全局字典 + global_corpus_dict[query] = description + new_pairs_count += 1 except Exception as e: syntax_error_count += 1 @@ -179,18 +199,18 @@ def generate_gremlin_corpus(templates: list[str], config_path: str, schema_path: str, data_path: str, - output_file: str = None, - num_queries: int = 100) -> dict: + output_file: str = None) -> dict: """ 从Gremlin模板列表生成完整的语料库。 + 查询数量由 combination_control_config.json 中的 max_total_combinations 
控制。 + Args: templates: Gremlin查询模板列表或CSV文件路径 config_path: 配置文件路径(必需) schema_path: Schema文件路径(必需) data_path: 数据文件路径(必需) output_file: 输出文件名(可选) - num_queries: 每个模板生成的查询数量(默认100) Returns: 包含生成统计信息的字典 @@ -300,9 +320,12 @@ def generate_gremlin_corpus(templates: list[str], full_corpus = [(query, desc) for query, desc in global_corpus_dict.items()] # --- Save the full corpus to a local file (if output_file is provided) --- - from datetime import datetime - if output_file: + # 确保输出目录存在 + out_dir = os.path.dirname(os.path.abspath(output_file)) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + # 确保只保存成功生成的查询-描述对 corpus_data = { "metadata": { diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinLexer.py b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinLexer.py index 75d8b68b..99ad9e37 100644 --- a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinLexer.py +++ b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinLexer.py @@ -1,4 +1,22 @@ -# Generated from /root/lzj/ospp/Gremlin_Antlr4/Gremlin.g4 by ANTLR 4.13.1 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +# Generated from ./Gremlin.g4 by ANTLR 4.13.1 from antlr4 import * from io import StringIO import sys diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinListener.py b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinListener.py index b9782b1b..c3a22ff8 100644 --- a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinListener.py +++ b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinListener.py @@ -1,4 +1,22 @@ -# Generated from /root/lzj/ospp/Gremlin_Antlr4/Gremlin.g4 by ANTLR 4.13.1 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# Generated from ./Gremlin.g4 by ANTLR 4.13.1 from antlr4 import * if "." in __name__: from .GremlinParser import GremlinParser diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinParser.py b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinParser.py index 83ff7ca1..57745b98 100644 --- a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinParser.py +++ b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinParser.py @@ -1,4 +1,22 @@ -# Generated from /root/lzj/ospp/Gremlin_Antlr4/Gremlin.g4 by ANTLR 4.13.1 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# Generated from ./Gremlin.g4 by ANTLR 4.13.1 # encoding: utf-8 from antlr4 import * from io import StringIO diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinVisitor.py b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinVisitor.py index f7684135..06e3f5a3 100644 --- a/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinVisitor.py +++ b/text2gremlin/AST_Text2Gremlin/base/gremlin/GremlinVisitor.py @@ -1,4 +1,22 @@ -# Generated from /root/lzj/ospp/Gremlin_Antlr4/Gremlin.g4 by ANTLR 4.13.1 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +# Generated from ./Gremlin.g4 by ANTLR 4.13.1 from antlr4 import * if "." in __name__: from .GremlinParser import GremlinParser diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/__init__.py b/text2gremlin/AST_Text2Gremlin/base/gremlin/__init__.py index e69de29b..13a83393 100644 --- a/text2gremlin/AST_Text2Gremlin/base/gremlin/__init__.py +++ b/text2gremlin/AST_Text2Gremlin/base/gremlin/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
diff --git a/text2gremlin/AST_Text2Gremlin/base/gremlin/antlr-4.13.1-complete.jar b/text2gremlin/AST_Text2Gremlin/base/gremlin/antlr-4.13.1-complete.jar deleted file mode 100644 index f539ab04..00000000 Binary files a/text2gremlin/AST_Text2Gremlin/base/gremlin/antlr-4.13.1-complete.jar and /dev/null differ diff --git a/text2gremlin/AST_Text2Gremlin/config.json b/text2gremlin/AST_Text2Gremlin/config.json index 3eae0490..43b311b9 100644 --- a/text2gremlin/AST_Text2Gremlin/config.json +++ b/text2gremlin/AST_Text2Gremlin/config.json @@ -7,9 +7,9 @@ "data_path": "./db_data/", "templates_file": "gremlin_templates.csv", "output_dir": "output", - "num_queries_per_template": 100, "schema_dict_path": [ "./base/template/schema_dict.txt" - ] + ], + "_note": "查询数量由 combination_control_config.json 中的 max_total_combinations 控制" } diff --git a/text2gremlin/AST_Text2Gremlin/generate_corpus.py b/text2gremlin/AST_Text2Gremlin/generate_corpus.py new file mode 100644 index 00000000..1438f4bf --- /dev/null +++ b/text2gremlin/AST_Text2Gremlin/generate_corpus.py @@ -0,0 +1,209 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +""" +Gremlin 查询语料库生成脚本 + +从模板生成 Gremlin 查询语料库的命令行工具。 + +用法: + # 使用默认配置(推荐) + python generate_corpus.py + + # 自定义参数 + python generate_corpus.py --templates my_templates.csv --output my_corpus.json +""" + +import argparse +import sys +import os +import json +from pathlib import Path +from datetime import datetime + +# 添加 base 包到 Python 路径 +sys.path.insert(0, str(Path(__file__).parent)) + +from base import generate_gremlin_corpus + + +def load_config(config_path='config.json'): + """加载配置文件""" + try: + with open(config_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + print(f"⚠️ 警告: 无法加载配置文件 {config_path}: {e}") + return {} + + +def main(): + parser = argparse.ArgumentParser( + description='生成 Gremlin 查询语料库', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +示例: + # 使用默认配置 + python generate_corpus.py + + # 使用自定义模板文件 + python generate_corpus.py --templates my_templates.csv + + # 使用自定义配置文件 + python generate_corpus.py --config my_config.json + + # 完全自定义 + python generate_corpus.py --templates templates.csv --schema schema.json --data data/ --output output.json + +配置说明: + config.json 中的配置项: + - templates_file: 模板文件路径(默认: gremlin_templates.csv) + - db_schema_path: schema 文件路径 + - data_path: 数据目录路径(默认: db_data/) + - output_dir: 输出目录(默认: output) + + 查询数量控制: + - 由 combination_control_config.json 中的 max_total_combinations 控制 + - 根据查询复杂度自动调整(short/medium/long/ultra) + +注意: + - 输出文件自动命名为 output/generated_corpus_YYYYMMDD_HHMMSS.json + - 每次运行生成新文件,不会覆盖旧文件 + """ + ) + + parser.add_argument( + '--config', + default='config.json', + help='配置文件路径 (JSON格式,默认: config.json)' + ) + + parser.add_argument( + '--templates', + help='模板文件路径 (CSV格式,默认从 config.json 读取)' + ) + + parser.add_argument( + '--schema', + help='图数据库模式文件路径 (JSON格式,默认从 config.json 读取)' + ) + + parser.add_argument( + '--data', + help='数据目录路径 (默认从 config.json 读取)' + ) + + parser.add_argument( + '--output', + help='输出文件路径 (JSON格式,默认: 
output/generated_corpus_YYYYMMDD_HHMMSS.json)' + ) + + + + args = parser.parse_args() + + # 加载配置文件 + config = load_config(args.config) + + # 从配置文件或命令行参数获取值(命令行参数优先) + templates_file = args.templates or config.get('templates_file', 'gremlin_templates.csv') + db_id = config.get('db_id', 'movie') + schema_path = args.schema or config.get('db_schema_path', {}).get(db_id, 'db_data/schema/movie_schema.json') + data_path = args.data or config.get('data_path', 'db_data/') + output_dir = config.get('output_dir', 'output') + + # 更新 args 对象 + args.templates = templates_file + args.schema = schema_path + args.data = data_path + + # 验证输入文件 + if not os.path.exists(args.templates): + print(f"❌ 错误: 模板文件不存在: {args.templates}") + print(f"💡 提示: 请创建 {args.templates} 文件,或使用 --templates 指定其他文件") + sys.exit(1) + + if not os.path.exists(args.config): + print(f"❌ 错误: 配置文件不存在: {args.config}") + sys.exit(1) + + if not os.path.exists(args.schema): + print(f"❌ 错误: 模式文件不存在: {args.schema}") + sys.exit(1) + + if not os.path.exists(args.data): + print(f"❌ 错误: 数据目录不存在: {args.data}") + sys.exit(1) + + # 如果没有指定输出文件,使用默认路径 + if not args.output: + # 确保输出目录存在 + os.makedirs(output_dir, exist_ok=True) + + # 生成带时间戳的文件名 + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + args.output = f'{output_dir}/generated_corpus_{timestamp}.json' + + try: + print("=" * 60) + print("🚀 Gremlin 查询语料库生成器") + print("=" * 60) + print(f"\n📋 配置信息:") + print(f" 模板文件: {args.templates}") + print(f" 配置文件: {args.config}") + print(f" 模式文件: {args.schema}") + print(f" 数据目录: {args.data}") + print(f" 输出文件: {args.output}") + + print("\n" + "-" * 60) + + # 调用生成器 + result = generate_gremlin_corpus( + templates=args.templates, + config_path=args.config, + schema_path=args.schema, + data_path=args.data, + output_file=args.output + ) + + print("\n" + "=" * 60) + print("✅ 生成完成!") + print("=" * 60) + print(f"\n📊 统计信息:") + print(f" 总模板数: {result['total_templates']}") + print(f" 成功处理: {result['successful_templates']}") + print(f" 处理失败: 
{result['failed_templates']}") + print(f" 生成查询数: {result['total_unique_queries']}") + + if 'output_file' in result: + print(f"\n💾 结果已保存到: {result['output_file']}") + print(f"\n💡 提示:") + print(f" - 可以在 {args.templates} 中添加更多模板") + print(f" - 查询数量由 combination_control_config.json 控制") + else: + print(f"\n生成了 {len(result['queries'])} 个查询 (未保存到文件)") + + except Exception as e: + print(f"❌ 错误: {str(e)}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/text2gremlin/AST_Text2Gremlin/output/README.md b/text2gremlin/AST_Text2Gremlin/output/README.md deleted file mode 100644 index 4513e94a..00000000 --- a/text2gremlin/AST_Text2Gremlin/output/README.md +++ /dev/null @@ -1,74 +0,0 @@ -# Output 目录 - -这个目录用于存放生成的 Gremlin 查询语料库文件。 - -## 文件命名规则 - -生成的文件会自动命名为: -``` -generated_corpus_YYYYMMDD_HHMMSS.json -``` - -例如: -- `generated_corpus_20251029_143025.json` -- `generated_corpus_20251029_150130.json` - -## 文件格式 - -每个生成的 JSON 文件包含: - -```json -{ - "metadata": { - "total_templates": 10, - "successful_templates": 9, - "failed_templates": 1, - "total_unique_queries": 450, - "generation_timestamp": "2025-10-29 14:30:25" - }, - "corpus": [ - { - "query": "g.V().hasLabel('person').has('name', 'Tom Hanks')", - "description": "从图中开始,并筛选出标签为 'person' 的元素,并筛选出属性 'name' 为 'Tom Hanks' 的元素" - } - ] -} -``` - -## 使用方式 - -### 生成新的语料库 - -```bash -# 使用默认配置 -python generate_corpus.py - -# 指定生成数量 -python generate_corpus.py --num-queries 50 -``` - -### 查看生成的文件 - -```bash -# 列出所有生成的文件 -ls -lh output/ - -# 查看最新生成的文件 -ls -t output/ | head -1 -``` - -### 清理旧文件 - -```bash -# 删除所有生成的文件 -rm output/generated_corpus_*.json - -# 只保留最新的 5 个文件 -ls -t output/generated_corpus_*.json | tail -n +6 | xargs rm -``` - -## 注意事项 - -- 每次运行 `generate_corpus.py` 都会生成一个新文件 -- 文件不会自动覆盖,需要手动清理旧文件 -- 建议定期清理不需要的文件以节省空间 diff --git a/text2gremlin/AST_Text2Gremlin/output/SYNTAX_ANALYSIS_SUMMARY.md b/text2gremlin/AST_Text2Gremlin/output/SYNTAX_ANALYSIS_SUMMARY.md deleted 
file mode 100644 index 1abf4de9..00000000 --- a/text2gremlin/AST_Text2Gremlin/output/SYNTAX_ANALYSIS_SUMMARY.md +++ /dev/null @@ -1,178 +0,0 @@ -# Gremlin 语法分析总结 - -## 📊 核心发现 - -基于对 **1,493 个查询** 的深度分析,我们使用 ANTLR 解析器对每个查询进行了语法树解析,统计了 **7,353 个步骤** 的分布情况。 - ---- - -## 🎯 关键数据 - -### 整体统计 -- **总查询数**: 1,493 -- **总步骤数**: 7,353 -- **不同步骤类型**: 76 种 -- **平均每查询步骤数**: 4.92 -- **谓词使用**: 154 次(3 种类型) - -### 集中度分析 -- **前 3 个步骤** 覆盖 **50%** 的使用 -- **前 10 个步骤** 覆盖 **80%** 的使用 -- **前 20 个步骤** 覆盖 **92.65%** 的使用 -- **前 42 个步骤** 覆盖 **99%** 的使用 - ---- - -## 🏆 Top 10 最常用步骤 - -| 排名 | 步骤 | 次数 | 占比 | 典型用法 | -|------|------|------|------|----------| -| 1 | `hasLabel` | 1,485 | 20.20% | `g.V().hasLabel('movie')` | -| 2 | `V` | 1,482 | 20.16% | `g.V()` | -| 3 | `out` | 1,202 | 16.35% | `.out('acted_in')` | -| 4 | `in` | 475 | 6.46% | `.in('has_genre')` | -| 5 | `dedup` | 302 | 4.11% | `.dedup()` | -| 6 | `by` | 259 | 3.52% | `.order().by('name')` | -| 7 | `as` | 254 | 3.45% | `.as('movie')` | -| 8 | `has` | 209 | 2.84% | `.has('name', 'Tom')` | -| 9 | `groupCount` | 182 | 2.48% | `.groupCount()` | -| 10 | `where` | 147 | 2.00% | `.where(P.neq('m'))` | - ---- - -## 📈 步骤分类占比 - -``` -过滤步骤 ████████████████████████████████ 29.63% -图遍历 ████████████████████████ 23.47% -起始步骤 ████████████████████ 20.17% -辅助步骤 ███████ 7.28% -排序限制 ████ 4.26% -聚合统计 ███ 3.59% -投影转换 ███ 3.33% -分支条件 ██ 2.50% -循环 ██ 2.30% -其他 ███ 3.47% -``` - ---- - -## 🔍 深度分析 - -### 1. 查询起始模式 -- **99.26%** 的查询从 `g.V()` 开始 -- 仅 **0.07%** 从 `g.E()` 开始 -- 说明:**顶点中心的图遍历是主流模式** - -### 2. 过滤策略 -- `hasLabel` 几乎是必备步骤(99.46% 的查询使用) -- `has` 用于属性过滤(14.00% 的查询使用) -- `dedup` 去重频繁(20.23% 的查询使用) -- 说明:**类型过滤 + 属性过滤 + 去重是标准三件套** - -### 3. 遍历方向偏好 -- `out` : `in` = **2.53 : 1** -- 出边遍历远多于入边遍历 -- 说明:**查询更关注"从哪里出发"而非"从哪里来"** - -### 4. 聚合分析需求 -- `groupCount` 是最常用的聚合操作(182 次) -- `count`, `sum`, `mean` 等基础统计也有使用 -- 说明:**分组统计是重要的分析需求** - -### 5. 
复杂查询特征 -- **标记引用**: `as` (254) + `where` (147) 组合用于复杂关联 -- **循环遍历**: `repeat` (76) + `times` (39) 用于多跳查询 -- **分支逻辑**: `union` (115) + `coalesce` (51) 用于多路径探索 -- 说明:**支持复杂的图分析场景** - -### 6. 谓词使用模式 -- `neq` (不等于) 占 **69.48%**,主要用于 `where(P.neq('m'))` 排除自身 -- `within` (在集合内) 占 **22.08%**,用于集合成员判断 -- `gt` (大于) 占 **8.44%**,用于数值比较 -- 说明:**排除模式是最常见的过滤需求** - ---- - -## 💡 实践建议 - -### 对于查询优化 -1. **优先优化高频步骤**: `hasLabel`, `V`, `out` 的性能直接影响整体 -2. **索引策略**: 为 `hasLabel` 和 `has` 建立索引 -3. **去重优化**: `dedup` 使用频繁,需要高效的去重算法 -4. **出边优化**: `out` 步骤是性能瓶颈,考虑邻接表优化 - -### 对于测试覆盖 -1. **核心路径**: 重点测试 `V().hasLabel().out()` 组合 -2. **过滤场景**: 覆盖各种 `has` 和 `where` 的组合 -3. **聚合操作**: 确保 `groupCount` 在各种场景下正确 -4. **谓词测试**: 重点测试 `neq`, `within`, `gt` - -### 对于功能开发 -1. **高优先级**: 前 20 个步骤(覆盖 92.65%) -2. **中优先级**: 21-42 个步骤(覆盖 6.35%) -3. **低优先级**: 43-76 个步骤(覆盖 1%) -4. **长尾支持**: 虽然使用少,但要确保功能完整 - -### 对于文档编写 -1. **入门教程**: 重点讲解前 10 个步骤 -2. **进阶教程**: 覆盖前 30 个步骤的组合使用 -3. **高级特性**: 介绍循环、分支、聚合等复杂功能 -4. **完整参考**: 提供所有 76 个步骤的详细文档 - ---- - -## 📁 相关文件 - -- **统计数据**: `output/syntax_distribution_stats.json` -- **详细报告**: `output/SYNTAX_DISTRIBUTION_REPORT.md` -- **分析脚本**: `analyze_syntax_distribution.py` -- **可视化脚本**: `visualize_syntax_distribution.py` -- **源语料库**: `output/generated_corpus_20251029_190729.json` - ---- - -## 🔬 分析方法 - -本分析使用 **ANTLR 解析器** 对每个 Gremlin 查询进行语法树解析,而非简单的字符串匹配: - -1. **词法分析**: 使用 `GremlinLexer` 将查询字符串分解为 token -2. **语法分析**: 使用 `GremlinParser` 构建抽象语法树(AST) -3. **语义分析**: 使用 `GremlinTransVisitor` 遍历 AST 提取步骤和谓词 -4. **统计汇总**: 对提取的语法元素进行计数和分类 - -这种方法的优势: -- ✅ **准确识别**: 能准确区分步骤名称和参数 -- ✅ **处理嵌套**: 能正确处理嵌套遍历和匿名遍历 -- ✅ **谓词提取**: 能识别谓词类型而不受参数影响 -- ✅ **语法验证**: 只统计语法正确的查询 - ---- - -## 📊 数据质量 - -- **解析成功率**: 100% (1,493/1,493) -- **步骤识别**: 7,353 个步骤全部正确识别 -- **谓词识别**: 154 个谓词全部正确分类 -- **分析时间**: < 5 秒 - ---- - -## 🎓 结论 - -通过对 1,493 个 Gremlin 查询的深度分析,我们发现: - -1. **查询模式高度集中**: 前 20 个步骤覆盖 92.65% 的使用 -2. **顶点遍历为主**: 99.26% 的查询从 `g.V()` 开始 -3. **过滤是核心**: 过滤步骤占总步骤数的 29.63% -4. **出边优先**: 出边遍历是入边遍历的 2.5 倍 -5. 
**分析需求强**: 分组统计和聚合操作使用频繁 -6. **复杂查询支持**: 循环、分支、标记引用等高级特性都有使用 - -这些发现为 Gremlin 查询引擎的优化、测试用例设计、文档编写提供了数据支持。 - ---- - -**生成时间**: 2025-10-29 -**分析工具**: ANTLR + Python -**数据来源**: 泛化生成的 Gremlin 查询语料库 diff --git a/text2gremlin/AST_Text2Gremlin/output/SYNTAX_DISTRIBUTION_REPORT.md b/text2gremlin/AST_Text2Gremlin/output/SYNTAX_DISTRIBUTION_REPORT.md deleted file mode 100644 index 465e8582..00000000 --- a/text2gremlin/AST_Text2Gremlin/output/SYNTAX_DISTRIBUTION_REPORT.md +++ /dev/null @@ -1,277 +0,0 @@ -# Gremlin 语法词汇分布统计报告 - -## 📊 总体统计 - -| 指标 | 数值 | -|------|------| -| 总查询数 | 1,493 | -| 总步骤数 | 7,353 | -| 不同步骤类型数 | 76 | -| 总谓词数 | 154 | -| 不同谓词类型数 | 3 | -| 平均每个查询的步骤数 | 4.92 | - ---- - -## 🔝 Top 20 最常用步骤 - -| 排名 | 步骤名称 | 出现次数 | 占比 | 累计占比 | -|------|----------|----------|------|----------| -| 1 | `hasLabel` | 1,485 | 20.20% | 20.20% | -| 2 | `V` | 1,482 | 20.16% | 40.36% | -| 3 | `out` | 1,202 | 16.35% | 56.71% | -| 4 | `in` | 475 | 6.46% | 63.17% | -| 5 | `dedup` | 302 | 4.11% | 67.28% | -| 6 | `by` | 259 | 3.52% | 70.80% | -| 7 | `as` | 254 | 3.45% | 74.25% | -| 8 | `has` | 209 | 2.84% | 77.09% | -| 9 | `groupCount` | 182 | 2.48% | 79.57% | -| 10 | `where` | 147 | 2.00% | 81.57% | -| 11 | `order` | 126 | 1.71% | 83.28% | -| 12 | `limit` | 116 | 1.58% | 84.86% | -| 13 | `union` | 115 | 1.56% | 86.42% | -| 14 | `values` | 109 | 1.48% | 87.90% | -| 15 | `aggregate` | 78 | 1.06% | 88.96% | -| 16 | `repeat` | 76 | 1.03% | 89.99% | -| 17 | `path` | 54 | 0.73% | 90.72% | -| 18 | `coalesce` | 51 | 0.69% | 91.41% | -| 19 | `valueMap` | 47 | 0.64% | 92.05% | -| 20 | `select` | 44 | 0.60% | 92.65% | - -**分析**: 前 20 个步骤占总步骤数的 **92.65%**,说明查询模式相对集中。 - ---- - -## 📈 步骤分类统计 - -### 图遍历起始步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `V` | 1,482 | 从顶点开始遍历 | -| `E` | 1 | 从边开始遍历 | - -### 过滤步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `hasLabel` | 1,485 | 按标签过滤 | -| `has` | 209 | 按属性过滤 | -| `hasId` | 5 | 按ID过滤 | -| `hasKey` | 6 | 按键过滤 | -| `hasValue` | 2 | 按值过滤 | -| `where` | 147 | 条件过滤 | -| 
`filter` | 1 | 自定义过滤 | -| `is` | 1 | 值比较过滤 | -| `dedup` | 302 | 去重 | -| `simplePath` | 20 | 简单路径过滤 | -| `cyclicPath` | 1 | 循环路径过滤 | - -**小计**: 2,179 次 (29.63%) - -### 图遍历步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `out` | 1,202 | 出边遍历 | -| `in` | 475 | 入边遍历 | -| `both` | 2 | 双向遍历 | -| `outE` | 20 | 出边 | -| `inE` | 23 | 入边 | -| `bothE` | 1 | 双向边 | -| `outV` | 1 | 出顶点 | -| `inV` | 1 | 入顶点 | -| `otherV` | 1 | 另一端顶点 | - -**小计**: 1,726 次 (23.47%) - -### 聚合统计步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `groupCount` | 182 | 分组计数 | -| `count` | 22 | 计数 | -| `sum` | 21 | 求和 | -| `mean` | 6 | 平均值 | -| `max` | 2 | 最大值 | -| `min` | 1 | 最小值 | -| `fold` | 29 | 折叠为列表 | -| `unfold` | 1 | 展开列表 | - -**小计**: 264 次 (3.59%) - -### 排序和限制步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `order` | 126 | 排序 | -| `limit` | 116 | 限制数量 | -| `range` | 38 | 范围选择 | -| `skip` | 1 | 跳过 | -| `tail` | 1 | 取尾部 | -| `sample` | 30 | 随机采样 | -| `coin` | 1 | 随机过滤 | - -**小计**: 313 次 (4.26%) - -### 投影和转换步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `values` | 109 | 获取属性值 | -| `valueMap` | 47 | 获取属性映射 | -| `elementMap` | 2 | 获取元素映射 | -| `properties` | 19 | 获取属性对象 | -| `project` | 19 | 投影 | -| `select` | 44 | 选择 | -| `label` | 2 | 获取标签 | -| `id` | 1 | 获取ID | -| `constant` | 1 | 常量值 | -| `identity` | 1 | 恒等变换 | - -**小计**: 245 次 (3.33%) - -### 分支和条件步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `union` | 115 | 联合多个遍历 | -| `coalesce` | 51 | 合并(返回第一个非空) | -| `choose` | 10 | 条件分支 | -| `optional` | 8 | 可选遍历 | - -**小计**: 184 次 (2.50%) - -### 循环步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `repeat` | 76 | 重复遍历 | -| `times` | 39 | 重复次数 | -| `until` | 22 | 直到条件满足 | -| `emit` | 32 | 发射中间结果 | - -**小计**: 169 次 (2.30%) - -### 路径步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `path` | 54 | 获取路径 | -| `tree` | 18 | 树形结构 | - -**小计**: 72 次 (0.98%) - -### 副作用步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `aggregate` | 78 | 聚合到集合 | -| `store` | 2 | 存储 | -| `sideEffect` | 19 | 副作用 | -| `group` | 7 | 分组 | -| `cap` | 2 | 
获取副作用值 | - -**小计**: 108 次 (1.47%) - -### 修改步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `addV` | 10 | 添加顶点 | -| `property` | 33 | 设置属性 | -| `drop` | 19 | 删除元素 | - -**小计**: 62 次 (0.84%) - -### 逻辑步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `and` | 5 | 逻辑与 | -| `or` | 2 | 逻辑或 | -| `not` | 3 | 逻辑非 | - -**小计**: 10 次 (0.14%) - -### 辅助步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `as` | 254 | 标记步骤 | -| `by` | 259 | 修饰符(用于排序、分组等) | -| `map` | 6 | 映射变换 | -| `flatMap` | 15 | 扁平映射 | -| `barrier` | 1 | 屏障 | - -**小计**: 535 次 (7.28%) - -### 终端步骤 -| 步骤 | 次数 | 说明 | -|------|------|------| -| `iterate` | 1 | 迭代执行 | -| `explain` | 1 | 解释查询计划 | -| `profile` | 1 | 性能分析 | - -**小计**: 3 次 (0.04%) - ---- - -## 🎯 谓词分布 - -| 排名 | 谓词 | 出现次数 | 占比 | 说明 | -|------|------|----------|------|------| -| 1 | `neq` | 107 | 69.48% | 不等于 | -| 2 | `within` | 34 | 22.08% | 在集合内 | -| 3 | `gt` | 13 | 8.44% | 大于 | - -**总计**: 154 次 - -**分析**: -- `neq` (不等于) 是最常用的谓词,主要用于排除自身或特定值 -- `within` 用于集合成员判断 -- `gt` 用于数值比较 - ---- - -## 📝 关键发现 - -### 1. 查询模式特征 -- **几乎所有查询都从 `V()` 开始** (99.26%),说明主要是顶点遍历查询 -- **标签过滤是标配** (`hasLabel` 出现 1,485 次),说明图数据有明确的类型划分 -- **出边遍历远多于入边遍历** (`out`: 1,202 vs `in`: 475),比例约 2.5:1 - -### 2. 数据质量控制 -- **去重操作频繁** (`dedup` 302 次),说明查询结果中存在重复数据 -- **路径过滤较少** (`simplePath` 20 次),大多数查询不关心路径唯一性 - -### 3. 分析需求 -- **分组统计需求强** (`groupCount` 182 次) -- **标记和引用常见** (`as` 254 次, `where` 147 次),说明有复杂的关联查询 -- **聚合操作** (`aggregate` 78 次) 用于收集中间结果 - -### 4. 查询复杂度 -- **循环遍历** (`repeat` 76 次) 用于多跳查询 -- **分支逻辑** (`union` 115 次, `coalesce` 51 次) 用于多路径探索 -- **条件过滤** (`where` 147 次) 用于复杂条件判断 - -### 5. 覆盖度分析 -- **高频步骤** (前 20 个) 占 92.65%,说明核心功能集中 -- **长尾步骤** (后 56 个) 仅占 7.35%,但提供了丰富的功能扩展 -- **76 种不同步骤类型** 说明 Gremlin 语法覆盖全面 - ---- - -## 🎨 使用建议 - -### 对于测试用例设计 -1. **优先覆盖高频步骤**: 重点测试前 20 个步骤的各种组合 -2. **关注遍历模式**: `V().hasLabel().out()` 是最常见的模式 -3. **测试去重场景**: 确保 `dedup` 在各种位置都能正常工作 -4. **验证谓词**: 重点测试 `neq`, `within`, `gt` 三个谓词 - -### 对于性能优化 -1. **优化 `hasLabel` 和 `has`**: 这两个过滤步骤使用最频繁 -2. 
**优化出边遍历**: `out` 步骤占比最高 -3. **优化 `groupCount`**: 聚合操作需要特别关注性能 - -### 对于文档编写 -1. **重点讲解高频步骤**: 前 20 个步骤应该有详细文档 -2. **提供组合示例**: 展示常见的步骤组合模式 -3. **补充长尾功能**: 虽然使用少,但要确保文档完整 - ---- - -## 📅 生成信息 - -- **语料库文件**: `output/generated_corpus_20251029_190729.json` -- **统计时间**: 2025-10-29 -- **分析方法**: 基于 ANTLR 解析器的语法树分析 -- **统计脚本**: `analyze_syntax_distribution.py` diff --git a/text2gremlin/AST_Text2Gremlin/output/syntax_distribution_stats.json b/text2gremlin/AST_Text2Gremlin/output/syntax_distribution_stats.json deleted file mode 100644 index 621d9617..00000000 --- a/text2gremlin/AST_Text2Gremlin/output/syntax_distribution_stats.json +++ /dev/null @@ -1,95 +0,0 @@ -{ - "metadata": { - "total_queries": 1493, - "total_steps": 7353, - "unique_step_types": 76, - "total_predicates": 154, - "unique_predicate_types": 3, - "total_text_predicates": 0, - "unique_text_predicate_types": 0 - }, - "steps": { - "hasLabel": 1485, - "V": 1482, - "out": 1202, - "in": 475, - "dedup": 302, - "by": 259, - "as": 254, - "has": 209, - "groupCount": 182, - "where": 147, - "order": 126, - "limit": 116, - "union": 115, - "values": 109, - "aggregate": 78, - "repeat": 76, - "path": 54, - "coalesce": 51, - "valueMap": 47, - "select": 44, - "times": 39, - "range": 38, - "property": 33, - "emit": 32, - "sample": 30, - "fold": 29, - "inE": 23, - "count": 22, - "until": 22, - "sum": 21, - "outE": 20, - "simplePath": 20, - "properties": 19, - "project": 19, - "sideEffect": 19, - "drop": 19, - "tree": 18, - "flatMap": 15, - "choose": 10, - "addV": 10, - "optional": 8, - "group": 7, - "hasKey": 6, - "mean": 6, - "map": 6, - "hasId": 5, - "and": 5, - "not": 3, - "hasValue": 2, - "both": 2, - "elementMap": 2, - "label": 2, - "max": 2, - "or": 2, - "store": 2, - "cap": 2, - "E": 1, - "bothE": 1, - "inV": 1, - "outV": 1, - "otherV": 1, - "skip": 1, - "tail": 1, - "coin": 1, - "id": 1, - "cyclicPath": 1, - "min": 1, - "filter": 1, - "is": 1, - "constant": 1, - "identity": 1, - "barrier": 1, - "unfold": 1, - "iterate": 
1, - "explain": 1, - "profile": 1 - }, - "predicates": { - "neq": 107, - "within": 34, - "gt": 13 - }, - "text_predicates": {} -} \ No newline at end of file diff --git a/text2gremlin/AST_Text2Gremlin/requirements.txt b/text2gremlin/AST_Text2Gremlin/requirements.txt index 204098c6..396b4e32 100644 --- a/text2gremlin/AST_Text2Gremlin/requirements.txt +++ b/text2gremlin/AST_Text2Gremlin/requirements.txt @@ -26,7 +26,6 @@ openai==1.96.0 packaging==25.0 pandas==2.3.1 pillow==11.3.0 -pip==25.1.1 propcache==0.3.2 psutil==7.0.0 pydantic==2.11.7 @@ -35,7 +34,6 @@ pyparsing==3.2.3 python-dateutil==2.9.0.post0 python-dotenv==1.1.1 pytz==2025.2 -setuptools==80.9.0 six==1.17.0 sniffio==1.3.1 tenacity==9.1.2 @@ -43,5 +41,4 @@ tqdm==4.67.1 typing_extensions==4.14.1 typing-inspection==0.4.1 tzdata==2025.2 -wheel==0.45.1 -yarl==1.20.1 \ No newline at end of file +yarl==1.20.1
