Appointat commented on code in PR #737:
URL: https://github.com/apache/geaflow/pull/737#discussion_r2797189124


##########
geaflow-ai/src/operator/casts/casts/data/graph_generator.py:
##########
@@ -0,0 +1,370 @@
+"""Graph data utilities for CASTS simulations.
+
+This module supports two data sources:
+
+1. Synthetic graph data with Zipf-like distribution (default).
+2. Real transaction/relationship data loaded from CSV files under 
``real_graph_data/``.
+
+Use :class:`GraphGenerator` as the unified in-memory representation. The 
simulation
+engine and other components should treat it as read-only.
+"""
+
+import csv
+from dataclasses import dataclass
+from pathlib import Path
+import random
+from typing import Any
+
+import networkx as nx
+
+
+@dataclass
+class GraphGeneratorConfig:
+    """Configuration for building graph data.
+
+    Attributes:
+        use_real_data: Whether to build from real CSV files instead of 
synthetic data.
+        real_data_dir: Directory containing the ``*.csv`` relationship tables.
+        real_subgraph_size: Maximum number of nodes to keep when sampling a
+            connected subgraph from real data. If ``None``, use the full graph.
+    """
+
+    use_real_data: bool = False
+    real_data_dir: str | None = None
+    real_subgraph_size: int | None = None
+
+
+class GraphGenerator:
+    """Unified graph container used by the simulation.
+
+    - By default, it generates synthetic graph data with realistic business
+      entity relationships.
+    - When ``config.use_real_data`` is True, it instead loads nodes/edges from
+      ``real_graph_data`` CSV files and optionally samples a connected subgraph
+      to control size while preserving edge integrity.
+    """
+
+    def __init__(self, size: int = 30, config: GraphGeneratorConfig | None = 
None):
+        self.nodes: dict[str, dict[str, Any]] = {}
+        self.edges: dict[str, list[dict[str, str]]] = {}
+
+        self.config = config or GraphGeneratorConfig()
+        self.source_label = "synthetic"
+
+        if self.config.use_real_data:
+            self._load_real_graph()
+            self.source_label = "real"
+        else:
+            self._generate_zipf_data(size)
+
+    def to_networkx(self) -> nx.DiGraph:
+        """Convert to NetworkX graph for visualization and analysis."""
+        G: nx.DiGraph = nx.DiGraph()
+        for node_id, node in self.nodes.items():
+            G.add_node(node_id, **node)
+        for node_id, edge_list in self.edges.items():
+            for edge in edge_list:
+                G.add_edge(node_id, edge['target'], label=edge['label'])
+        return G
+
+    # ------------------------------------------------------------------
+    # Synthetic data (existing behavior)
+    # ------------------------------------------------------------------
+
+    def _generate_zipf_data(self, size: int) -> None:
+        """Generate graph data following Zipf distribution for realistic 
entity distributions."""
+        # Use concrete, realistic business roles instead of abstract types
+        # Approximate Zipf: "Retail SME" is most common, "FinTech Startup" is 
rarest
+        business_types = [
+            "Retail SME",  # Most common - small retail businesses
+            "Logistics Partner",  # Medium frequency - logistics providers
+            "Enterprise Vendor",  # Medium frequency - large vendors
+            "Regional Distributor",  # Less common - regional distributors
+            "FinTech Startup",  # Rarest - fintech companies
+        ]
+        # Weights approximating 1/k distribution
+        type_weights = [100, 50, 25, 12, 6]
+        
+        business_categories = ["retail", "wholesale", "finance", 
"manufacturing"]
+        regions = ["NA", "EU", "APAC", "LATAM"]
+        risk_levels = ["low", "medium", "high"]
+
+        # Generate nodes
+        for i in range(size):
+            node_type = random.choices(business_types, weights=type_weights, 
k=1)[0]
+            status = "active" if random.random() < 0.8 else "inactive"
+            age = random.randint(18, 60)
+            
+            node = {
+                "id": str(i),
+                "type": node_type,
+                "status": status,
+                "age": age,
+                "category": random.choice(business_categories),
+                "region": random.choice(regions),
+                "risk": random.choices(risk_levels, weights=[60, 30, 10])[0],
+            }
+            self.nodes[str(i)] = node
+            self.edges[str(i)] = []
+
+        # Generate edges with realistic relationship labels
+        edge_labels = ["related", "friend", "knows", "supplies", "manages"]
+        for i in range(size):
+            num_edges = random.randint(1, 4)
+            for _ in range(num_edges):
+                target = random.randint(0, size - 1)
+                if target != i:
+                    label = random.choice(edge_labels)
+                    # Ensure common "Retail SME" has more 'related' edges
+                    # and "Logistics Partner" has more 'friend' edges for 
interesting simulation
+                    if self.nodes[str(i)]["type"] == "Retail SME" and 
random.random() < 0.7:
+                        label = "related"
+                    elif (
+                        self.nodes[str(i)]["type"] == "Logistics Partner"
+                        and random.random() < 0.7
+                    ):
+                        label = "friend"
+
+                    self.edges[str(i)].append({"target": str(target), "label": 
label})
+
+    # ------------------------------------------------------------------
+    # Real data loading and subgraph sampling
+    # ------------------------------------------------------------------
+
+    def _load_real_graph(self) -> None:

Review Comment:
   Addressed. Moved the real-data loading implementation into
data/real_graph_loader.py and made it injectable via
GraphGeneratorConfig.real_data_loader. GraphGenerator now delegates to the
provided loader (or falls back to default_real_graph_loader) instead of
hardcoding the logic.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to