imbajin commented on code in PR #301:
URL:
https://github.com/apache/incubator-hugegraph-ai/pull/301#discussion_r2447070061
##########
hugegraph-llm/src/hugegraph_llm/models/embeddings/init_embedding.py:
##########
@@ -17,10 +17,40 @@
Review Comment:
⚠️ **Important: Duplicate function definition**
The `get_embedding` function is defined twice in this file - once as a
standalone function (lines 18-51) and once as a method in the `Embeddings`
class (lines 60-78). This creates maintenance issues and confusion.
**Recommendation:**
1. Remove the duplicate standalone function if it's not being used
2. Or deprecate the `Embeddings` class if migrating to the new pattern
3. Update all callers to use the consistent API
##########
hugegraph-llm/src/hugegraph_llm/operators/common_op/check_schema.py:
##########
@@ -59,64 +63,270 @@ def _validate_schema(self, schema: Dict[str, Any]) -> None:
check_type(schema, dict, "Input data is not a dictionary.")
if "vertexlabels" not in schema or "edgelabels" not in schema:
log_and_raise("Input data does not contain 'vertexlabels' or
'edgelabels'.")
- check_type(schema["vertexlabels"], list, "'vertexlabels' in input data
is not a list.")
- check_type(schema["edgelabels"], list, "'edgelabels' in input data is
not a list.")
+ check_type(
+ schema["vertexlabels"], list, "'vertexlabels' in input data is not
a list."
+ )
+ check_type(
+ schema["edgelabels"], list, "'edgelabels' in input data is not a
list."
+ )
+
+ def _process_property_labels(self, schema: Dict[str, Any]) -> (list, set):
+ property_labels = schema.get("propertykeys", [])
+ check_type(
+ property_labels,
+ list,
+ "'propertykeys' in input data is not of correct type.",
+ )
+ property_label_set = {label["name"] for label in property_labels}
+ return property_labels, property_label_set
+
+ def _process_vertex_labels(
+ self, schema: Dict[str, Any], property_labels: list,
property_label_set: set
+ ) -> None:
+ for vertex_label in schema["vertexlabels"]:
+ self._validate_vertex_label(vertex_label)
+ properties = vertex_label["properties"]
+ primary_keys = self._process_keys(
+ vertex_label, "primary_keys", properties[:1]
+ )
+ if len(primary_keys) == 0:
+ log_and_raise(f"'primary_keys' of {vertex_label['name']} is
empty.")
+ vertex_label["primary_keys"] = primary_keys
+ nullable_keys = self._process_keys(
+ vertex_label, "nullable_keys", properties[1:]
+ )
+ vertex_label["nullable_keys"] = nullable_keys
+ self._add_missing_properties(
+ properties, property_labels, property_label_set
+ )
+
+ def _process_edge_labels(
+ self, schema: Dict[str, Any], property_labels: list,
property_label_set: set
+ ) -> None:
+ for edge_label in schema["edgelabels"]:
+ self._validate_edge_label(edge_label)
+ properties = edge_label.get("properties", [])
+ self._add_missing_properties(
+ properties, property_labels, property_label_set
+ )
+
+ def _validate_vertex_label(self, vertex_label: Dict[str, Any]) -> None:
+ check_type(vertex_label, dict, "VertexLabel in input data is not a
dictionary.")
+ if "name" not in vertex_label:
+ log_and_raise("VertexLabel in input data does not contain 'name'.")
+ check_type(
+ vertex_label["name"], str, "'name' in vertex_label is not of
correct type."
+ )
+ if "properties" not in vertex_label:
+ log_and_raise("VertexLabel in input data does not contain
'properties'.")
+ check_type(
+ vertex_label["properties"],
+ list,
+ "'properties' in vertex_label is not of correct type.",
+ )
+ if len(vertex_label["properties"]) == 0:
+ log_and_raise("'properties' in vertex_label is empty.")
+
+ def _validate_edge_label(self, edge_label: Dict[str, Any]) -> None:
+ check_type(edge_label, dict, "EdgeLabel in input data is not a
dictionary.")
+ if (
+ "name" not in edge_label
+ or "source_label" not in edge_label
+ or "target_label" not in edge_label
+ ):
+ log_and_raise(
+ "EdgeLabel in input data does not contain 'name',
'source_label', 'target_label'."
+ )
+ check_type(
+ edge_label["name"], str, "'name' in edge_label is not of correct
type."
+ )
+ check_type(
+ edge_label["source_label"],
+ str,
+ "'source_label' in edge_label is not of correct type.",
+ )
+ check_type(
+ edge_label["target_label"],
+ str,
+ "'target_label' in edge_label is not of correct type.",
+ )
Review Comment:
⚠️ **Important: Silent failure in key processing**
The `_process_keys` method silently filters out keys that don't exist in the
properties list (`new_keys = [key for key in keys if key in
label['properties']]`). This could hide configuration errors where users
specify invalid keys.
**Issues:**
1. Typos in key names go undetected
2. No feedback when configured keys are ignored
3. Could lead to unexpected schema behavior
**Recommendation:**
Add validation to warn or error on invalid keys:
```python
def _process_keys(self, label: Dict[str, Any], key_type: str, default_keys:
list) -> list:
keys = label.get(key_type, default_keys)
check_type(keys, list, f"'{key_type}' in {label['name']} is not of
correct type.")
invalid_keys = [key for key in keys if key not in label['properties']]
if invalid_keys:
log.warning(f"Keys {invalid_keys} in {key_type} are not present in
properties for {label['name']}")
new_keys = [key for key in keys if key in label['properties']]
return new_keys
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]