This is an automated email from the ASF dual-hosted git repository.
kezhenxu94 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/skywalking-mcp.git
The following commit(s) were added to refs/heads/main by this push:
new 3baaa62 feat: add support for MQE (#9)
3baaa62 is described below
commit 3baaa623aef8696624c22fb88a032d26816301db
Author: Zixin Zhou <[email protected]>
AuthorDate: Sat Jul 12 19:16:53 2025 +0800
feat: add support for MQE (#9)
---
.licenserc.yaml | 1 +
README.md | 3 +
internal/prompts/prompts.go | 685 ++++++++++++++++++++++++++++++
internal/resources/mqe_ai_prompt.md | 249 +++++++++++
internal/resources/mqe_detailed_syntax.md | 308 ++++++++++++++
internal/resources/mqe_docs.go | 126 ++++++
internal/resources/mqe_examples.json | 136 ++++++
internal/swmcp/server.go | 24 +-
internal/tools/common.go | 93 +++-
internal/tools/log.go | 5 +-
internal/tools/metric.go | 38 +-
internal/tools/mqe.go | 613 ++++++++++++++++++++++++++
12 files changed, 2244 insertions(+), 37 deletions(-)
diff --git a/.licenserc.yaml b/.licenserc.yaml
index 945b96a..51bc7b2 100644
--- a/.licenserc.yaml
+++ b/.licenserc.yaml
@@ -31,6 +31,7 @@ header:
- "LICENSE"
- "NOTICE"
- ".gitignore"
+ - '**/*.json'
comment: on-failure
diff --git a/README.md b/README.md
index 8a120b8..a07b904 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,9 @@ SkyWalking MCP provides the following tools to query and
analyze SkyWalking OAP
| **Metrics** | `query_single_metrics` | Query single metric values
| Get specific metric values (CPM, response time, SLA, Apdex); Multiple
entity scopes (Service, ServiceInstance, Endpoint, Process, Relations); Time
range and cold storage support
|
| **Metrics** | `query_top_n_metrics` | Query top N metric rankings
| Rank entities by metric values; Configurable top N count;
Ascending/descending order; Scope-based filtering; Performance analysis and
issue identification
|
| **Log** | `query_logs` | Query logs from SkyWalking OAP
| Filter by service, instance, endpoint, trace ID, tags; Time range queries;
Cold storage support; Pagination support
|
+| **MQE** | `execute_mqe_expression` | Execute MQE expressions for metrics
| Execute complex MQE (Metrics Query Expression) queries; Support
calculations, aggregations, comparisons, TopN, trend analysis; Multiple result
types (single value, time series, sorted list); Entity filtering and relation
metrics; Debug and tracing capabilities |
+| **MQE** | `list_mqe_metrics` | List available metrics for MQE
| Discover available metrics for MQE queries; Filter by regex patterns; Get
metric metadata (type, catalog); Support service, instance, endpoint, relation,
database, and infrastructure metrics
|
+| **MQE** | `get_mqe_metric_type` | Get metric type information
| Get detailed type information for specific metrics; Understand metric
structure (regular value, labeled value, sampled record); Help with correct MQE
expression syntax
|
## Contact Us
diff --git a/internal/prompts/prompts.go b/internal/prompts/prompts.go
new file mode 100644
index 0000000..11057b8
--- /dev/null
+++ b/internal/prompts/prompts.go
@@ -0,0 +1,685 @@
+// Licensed to Apache Software Foundation (ASF) under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Apache Software Foundation (ASF) licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package prompts
+
+import (
+ "context"
+ "fmt"
+
+ "github.com/mark3labs/mcp-go/mcp"
+ "github.com/mark3labs/mcp-go/server"
+)
+
+// Constants for common values
+const (
+ defaultDuration = "-1h"
+ allMetrics = "all"
+)
+
+// Tool capability mapping for different analysis types
+var toolCapabilities = map[string][]string{
+ "performance_analysis": {
+ "query_single_metrics",
+ "query_top_n_metrics",
+ "execute_mqe_expression",
+ },
+ "trace_investigation": {
+ "query_traces",
+ "get_trace_details",
+ "get_cold_trace_details",
+ },
+ "log_analysis": {
+ "query_logs",
+ },
+ "mqe_query_building": {
+ "execute_mqe_expression",
+ "list_mqe_metrics",
+ "get_mqe_metric_type",
+ },
+ "service_comparison": {
+ "query_single_metrics",
+ "query_top_n_metrics",
+ "execute_mqe_expression",
+ },
+ "metrics_exploration": {
+ "list_mqe_metrics",
+ "get_mqe_metric_type",
+ },
+}
+
+// AddSkyWalkingPrompts registers all SkyWalking-related prompts
+func AddSkyWalkingPrompts(s *server.MCPServer) {
+ addCoreAnalysisPrompts(s)
+ addTraceAnalysisPrompts(s)
+ addUtilityPrompts(s)
+}
+
+func addCoreAnalysisPrompts(s *server.MCPServer) {
+ // Performance Analysis Prompt
+ s.AddPrompt(mcp.Prompt{
+ Name: "analyze-performance",
+ Description: "Analyze service performance using metrics tools",
+ Arguments: []mcp.PromptArgument{
+ {Name: "service_name", Description: "The name of the
service to analyze", Required: true},
+ {Name: "duration", Description: "Time duration for
analysis. Examples: -1h (past hour), -30m (past 30 minutes), " +
+ "-7d (past 7 days), 1h (next hour), 24h (next
24 hours)", Required: false},
+ },
+ }, performanceAnalysisHandler)
+
+ // Service Comparison Prompt
+ s.AddPrompt(mcp.Prompt{
+ Name: "compare-services",
+ Description: "Compare performance metrics between multiple
services",
+ Arguments: []mcp.PromptArgument{
+ {Name: "services", Description: "Comma-separated list
of service names to compare", Required: true},
+ {Name: "metrics", Description: "Metrics to compare
(response_time, sla, cpm, all)", Required: false},
+ {Name: "time_range", Description: "Time range for
comparison. Examples: -1h (last hour), -2h (last 2 hours), -1d (last day)",
Required: false},
+ },
+ }, compareServicesHandler)
+
+ // Top N Metrics Analysis
+ s.AddPrompt(mcp.Prompt{
+ Name: "top-services",
+ Description: "Find top N services by various metrics",
+ Arguments: []mcp.PromptArgument{
+ {Name: "metric_name", Description: "Metric to rank by
(service_cpm, service_resp_time, service_sla)", Required: true},
+ {Name: "top_n", Description: "Number of top services to
return (default: 10)", Required: false},
+ {Name: "order", Description: "Order direction (ASC,
DES)", Required: false},
+ },
+ }, topServicesHandler)
+}
+
+func addTraceAnalysisPrompts(s *server.MCPServer) {
+ // Trace Investigation Prompt
+ s.AddPrompt(mcp.Prompt{
+ Name: "investigate-traces",
+ Description: "Investigate traces for errors and performance
issues",
+ Arguments: []mcp.PromptArgument{
+ {Name: "service_id", Description: "The service to
investigate", Required: false},
+ {Name: "trace_state", Description: "Filter by trace
state (success, error, all)", Required: false},
+ {Name: "duration", Description: "Time range to search.
Examples: -1h (last hour), -30m (last 30 minutes). Default: -1h", Required:
false},
+ },
+ }, traceInvestigationHandler)
+
+ // Trace Deep Dive
+ s.AddPrompt(mcp.Prompt{
+ Name: "trace-deep-dive",
+ Description: "Deep dive analysis of a specific trace",
+ Arguments: []mcp.PromptArgument{
+ {Name: "trace_id", Description: "The trace ID to
analyze", Required: true},
+ {Name: "view", Description: "Analysis view (full,
summary, errors_only)", Required: false},
+ {Name: "check_cold_storage", Description: "Check cold
storage if not found (true/false)", Required: false},
+ },
+ }, traceDeepDiveHandler)
+
+ // Log Analysis Prompt
+ s.AddPrompt(mcp.Prompt{
+ Name: "analyze-logs",
+ Description: "Analyze service logs for errors and patterns",
+ Arguments: []mcp.PromptArgument{
+ {Name: "service_id", Description: "Service to analyze
logs", Required: false},
+ {Name: "log_level", Description: "Log level to filter
(ERROR, WARN, INFO)", Required: false},
+ {Name: "duration", Description: "Time range to analyze.
Examples: -1h (last hour), -6h (last 6 hours). Default: -1h", Required: false},
+ },
+ }, logAnalysisHandler)
+}
+
+func addUtilityPrompts(s *server.MCPServer) {
+ // MQE Query Builder Prompt
+ s.AddPrompt(mcp.Prompt{
+ Name: "build-mqe-query",
+ Description: "Help build MQE (Metrics Query Expression) for
complex queries",
+ Arguments: []mcp.PromptArgument{
+ {Name: "query_type", Description: "Type of query
(performance, comparison, trend, alert)", Required: true},
+ {Name: "metrics", Description: "Comma-separated list of
metrics to query", Required: true},
+ {Name: "conditions", Description: "Additional
conditions or filters", Required: false},
+ },
+ }, mqeQueryBuilderHandler)
+
+ // MQE Metrics Explorer
+ s.AddPrompt(mcp.Prompt{
+ Name: "explore-metrics",
+ Description: "Explore available metrics and their types",
+ Arguments: []mcp.PromptArgument{
+ {Name: "pattern", Description: "Regex pattern to filter
metrics", Required: false},
+ {Name: "show_examples", Description: "Show usage
examples for each metric (true/false)", Required: false},
+ },
+ }, exploreMetricsHandler)
+}
+
+// Analysis execution chains for different types of analysis
+var analysisChains = map[string][]struct {
+ Tool string
+ Purpose string
+}{
+ "performance_analysis": {
+ {Tool: "query_single_metrics", Purpose: "Get basic metrics like
CPM, SLA, response time"},
+ {Tool: "execute_mqe_expression", Purpose: "Calculate
derivatives like SLA percentage, percentiles"},
+ {Tool: "query_top_n_metrics", Purpose: "Identify top endpoints
by response time or traffic"},
+ {Tool: "query_traces", Purpose: "Find error traces for deeper
investigation"},
+ },
+ "trace_investigation": {
+ {Tool: "query_traces", Purpose: "Search for traces with
specific filters"},
+ {Tool: "get_trace_details", Purpose: "Analyze individual traces
in detail"},
+ {Tool: "get_cold_trace_details", Purpose: "Check historical
traces if not found in hot storage"},
+ },
+ "log_analysis": {
+ {Tool: "query_logs", Purpose: "Search and analyze log entries
with filters"},
+ },
+ "mqe_query_building": {
+ {Tool: "list_mqe_metrics", Purpose: "Discover available
metrics"},
+ {Tool: "get_mqe_metric_type", Purpose: "Understand metric types
and usage"},
+ {Tool: "execute_mqe_expression", Purpose: "Test and execute the
built expression"},
+ },
+}
+
+// Helper function to generate tool usage instructions
+func generateToolInstructions(analysisType string) string {
+ tools := toolCapabilities[analysisType]
+ chain := analysisChains[analysisType]
+
+ if len(tools) == 0 {
+ return "No specific tools defined for this analysis type."
+ }
+
+ instructions := "**Available Tools:**\n"
+ for _, tool := range tools {
+ instructions += fmt.Sprintf("- %s\n", tool)
+ }
+
+ if len(chain) > 0 {
+ instructions += "\n**Recommended Analysis Workflow:**\n"
+ for i, step := range chain {
+ instructions += fmt.Sprintf("%d. %s: %s\n", i+1,
step.Tool, step.Purpose)
+ }
+ }
+
+ return instructions
+}
+
+// Handler implementations
+
+func performanceAnalysisHandler(_ context.Context, request
mcp.GetPromptRequest) (*mcp.GetPromptResult, error) {
+ args := request.Params.Arguments
+ serviceName := args["service_name"]
+ duration := args["duration"]
+
+ if duration == "" {
+ duration = defaultDuration
+ }
+
+ // Use the dynamic tool instructions
+ toolInstructions := generateToolInstructions("performance_analysis")
+
+ prompt := fmt.Sprintf(`Please analyze the performance of service '%s'
over the last %s.
+
+%s
+
+**Analysis Required:**
+
+**Response Time Analysis**
+- Use query_single_metrics with metrics_name="service_resp_time" to get
average response time
+- Use execute_mqe_expression with
expression="service_percentile{p='50,75,90,95,99'}" to get percentiles
+- Identify trends and anomalies
+
+**Success Rate and SLA**
+- Use execute_mqe_expression with expression="service_sla * 100" to get
success rate percentage
+- Use query_single_metrics with metrics_name="service_apdex" for user
satisfaction score
+- Track SLA compliance over time
+
+**Traffic Analysis**
+- Use query_single_metrics with metrics_name="service_cpm" to get calls per
minute
+- Identify traffic patterns and peak periods
+
+**Error Analysis**
+- Use query_traces with trace_state="error" to find error traces
+- Identify most common error types and affected endpoints
+
+**Performance Bottlenecks**
+- Use query_top_n_metrics with metrics_name="endpoint_resp_time" and
order="DES" to find slowest endpoints
+- Use query_top_n_metrics with metrics_name="endpoint_cpm" to find
high-traffic endpoints
+
+Please provide actionable insights and specific recommendations based on the
data.`, serviceName, duration, toolInstructions)
+
+ return &mcp.GetPromptResult{
+ Description: "Performance analysis using SkyWalking tools",
+ Messages: []mcp.PromptMessage{
+ {
+ Role: mcp.RoleUser,
+ Content: mcp.TextContent{
+ Type: "text",
+ Text: prompt,
+ },
+ },
+ },
+ }, nil
+}
+
+func mqeQueryBuilderHandler(_ context.Context, request mcp.GetPromptRequest)
(*mcp.GetPromptResult, error) {
+ args := request.Params.Arguments
+ queryType := args["query_type"]
+ metrics := args["metrics"]
+ conditions := args["conditions"]
+
+ // Use the dynamic tool instructions
+ toolInstructions := generateToolInstructions("mqe_query_building")
+
+ prompt := fmt.Sprintf(`Help me build an MQE (Metrics Query Expression)
for the following requirement:
+
+Query Type: %s
+Metrics: %s
+Additional Conditions: %s
+
+%s
+
+**MQE Building Process:**
+
+**Step-by-step approach:**
+- Explain the MQE syntax for this use case
+- Provide the complete MQE expression
+- Show example usage with different parameters
+- Explain what each part of the expression does
+- Suggest variations for different scenarios
+
+If there are multiple ways to achieve this, please show alternatives with pros
and cons.`,
+ queryType, metrics, conditions, toolInstructions)
+
+ return &mcp.GetPromptResult{
+ Description: "MQE query building assistance",
+ Messages: []mcp.PromptMessage{
+ {
+ Role: mcp.RoleUser,
+ Content: mcp.TextContent{
+ Type: "text",
+ Text: prompt,
+ },
+ },
+ },
+ }, nil
+}
+
+func compareServicesHandler(_ context.Context, request mcp.GetPromptRequest)
(*mcp.GetPromptResult, error) {
+ args := request.Params.Arguments
+ services := args["services"]
+ metrics := args["metrics"]
+ timeRange := args["time_range"]
+
+ if metrics == "" {
+ metrics = allMetrics
+ }
+ if timeRange == "" {
+ timeRange = defaultDuration
+ }
+
+ prompt := fmt.Sprintf(`Please compare the following services: %s
+
+Time Range: %s
+Metrics to Compare: %s
+
+Comparison should include:
+
+1. **Performance Comparison**
+ - Response time comparison (average and percentiles)
+ - Throughput (CPM) comparison
+ - Success rate (SLA) comparison
+
+2. **Resource Utilization**
+ - CPU and memory usage if available
+ - Connection pool usage
+
+3. **Error Patterns**
+ - Error rate comparison
+ - Types of errors by service
+
+4. **Dependency Impact**
+ - How each service affects others
+ - Cascade failure risks
+
+5. **Relative Performance**
+ - Which service is the bottleneck
+ - Performance ratios
+ - Efficiency metrics
+
+Please present the comparison in a clear, tabular format where possible, and
highlight significant differences.`,
+ services, timeRange, metrics)
+
+ return &mcp.GetPromptResult{
+ Description: "Service comparison analysis",
+ Messages: []mcp.PromptMessage{
+ {
+ Role: mcp.RoleUser,
+ Content: mcp.TextContent{
+ Type: "text",
+ Text: prompt,
+ },
+ },
+ },
+ }, nil
+}
+
+func traceInvestigationHandler(_ context.Context, request
mcp.GetPromptRequest) (*mcp.GetPromptResult, error) {
+ args := request.Params.Arguments
+ serviceID := args["service_id"]
+ traceState := args["trace_state"]
+ duration := args["duration"]
+
+ if duration == "" {
+ duration = defaultDuration
+ }
+ if traceState == "" {
+ traceState = "all"
+ }
+
+ // Use the dynamic tool instructions
+ toolInstructions := generateToolInstructions("trace_investigation")
+
+ prompt := fmt.Sprintf(`Investigate traces with filters:
service_id="%s", trace_state="%s", duration="%s".
+
+%s
+
+**Analysis Steps:**
+
+**Find Problematic Traces**
+- First use query_traces with view="summary" to get overview
+- Look for patterns in error traces, slow traces, or anomalies
+- Note trace IDs that need deeper investigation
+
+**Deep Dive on Specific Traces**
+- Use get_trace_details with identified trace_id
+- Start with view="summary" for quick insights
+- Use view="full" for complete span analysis
+- Use view="errors_only" if focusing on errors
+
+**Performance Analysis**
+- Look for traces with high duration using min_trace_duration filter
+- Identify bottlenecks in span timings
+- Check for cascading delays
+
+**Error Pattern Analysis**
+- Use query_traces with trace_state="error"
+- Group errors by type and service
+- Identify error propagation paths
+
+**Historical Investigation**
+- If recent data shows no issues, use cold storage tools
+- Use get_cold_trace_details for older trace data
+
+Provide specific findings and actionable recommendations.`, serviceID,
traceState, duration, toolInstructions)
+
+ return &mcp.GetPromptResult{
+ Description: "Trace investigation using query tools",
+ Messages: []mcp.PromptMessage{
+ {
+ Role: mcp.RoleUser,
+ Content: mcp.TextContent{
+ Type: "text",
+ Text: prompt,
+ },
+ },
+ },
+ }, nil
+}
+
+func logAnalysisHandler(_ context.Context, request mcp.GetPromptRequest)
(*mcp.GetPromptResult, error) {
+ args := request.Params.Arguments
+ serviceID := args["service_id"]
+ logLevel := args["log_level"]
+ duration := args["duration"]
+
+ if duration == "" {
+ duration = defaultDuration
+ }
+ if logLevel == "" {
+ logLevel = "ERROR"
+ }
+
+ prompt := fmt.Sprintf(`Analyze service logs using the query_logs tool:
+
+**Tool Configuration:**
+- query_logs with following parameters:
+ - service_id: "%s" (if specified)
+ - tags: [{"key": "level", "value": "%s"}] for log level filtering
+ - duration: "%s" for time range
+ - cold: true if historical data needed
+
+**Analysis Steps:**
+
+**Log Pattern Analysis**
+- Use query_logs to get recent logs for the service
+- Filter by log level (ERROR, WARN, INFO)
+- Look for recurring error patterns
+- Identify frequency of different log types
+
+**Error Investigation**
+- Focus on ERROR level logs first
+- Group similar error messages
+- Check for correlation with trace IDs
+- Look for timestamp patterns
+
+**Performance Correlation**
+- Compare log timestamps with performance issues
+- Look for resource exhaustion indicators
+- Check for timeout or connection errors
+
+**Troubleshooting Workflow**
+- Start with ERROR logs in the specified time range
+- Use trace_id from logs to get detailed trace analysis
+- Cross-reference with metrics for full picture
+
+Provide specific log analysis findings and recommendations.`, serviceID,
logLevel, duration)
+
+ return &mcp.GetPromptResult{
+ Description: "Log analysis using query_logs tool",
+ Messages: []mcp.PromptMessage{
+ {
+ Role: mcp.RoleUser,
+ Content: mcp.TextContent{
+ Type: "text",
+ Text: prompt,
+ },
+ },
+ },
+ }, nil
+}
+
+func topServicesHandler(_ context.Context, request mcp.GetPromptRequest)
(*mcp.GetPromptResult, error) {
+ args := request.Params.Arguments
+ metricName := args["metric_name"]
+ topN := args["top_n"]
+ order := args["order"]
+
+ if topN == "" {
+ topN = "10"
+ }
+ if order == "" {
+ order = "DES"
+ }
+
+ prompt := fmt.Sprintf(`Find top services using query_top_n_metrics tool:
+
+**Tool Configuration:**
+- query_top_n_metrics with parameters:
+ - metrics_name: "%s"
+ - top_n: %s
+ - order: "%s" (DES for highest, ASC for lowest)
+ - duration: "-1h" (or specify custom range)
+
+**Analysis Focus:**
+
+**Service Ranking**
+- Get top %s services by %s
+- Compare values against baseline
+- Identify outliers or anomalies
+
+**Performance Insights**
+- For CPM metrics: Find busiest services
+- For response time: Find slowest services
+- For SLA: Find services with issues
+
+**Actionable Recommendations**
+- Services needing immediate attention
+- Capacity planning insights
+- Performance optimization targets
+
+**Follow-up Analysis**
+- Use query_single_metrics for detailed service analysis
+- Use query_traces for error investigation
+- Use execute_mqe_expression for complex calculations
+
+Provide ranked results with specific recommendations.`, metricName, topN,
order, topN, metricName)
+
+ return &mcp.GetPromptResult{
+ Description: "Top services analysis",
+ Messages: []mcp.PromptMessage{
+ {
+ Role: mcp.RoleUser,
+ Content: mcp.TextContent{
+ Type: "text",
+ Text: prompt,
+ },
+ },
+ },
+ }, nil
+}
+
+func traceDeepDiveHandler(_ context.Context, request mcp.GetPromptRequest)
(*mcp.GetPromptResult, error) {
+ args := request.Params.Arguments
+ traceID := args["trace_id"]
+ view := args["view"]
+ checkColdStorage := args["check_cold_storage"]
+
+ if view == "" {
+ view = "summary"
+ }
+
+ prompt := fmt.Sprintf(`Perform deep dive analysis of trace %s:
+
+**Primary Analysis:**
+- get_trace_details with trace_id: "%s" and view: "%s"
+- Start with summary view for quick insights
+- Use full view for complete span analysis
+- Use errors_only view if trace has errors
+
+**Cold Storage Check:**
+- If trace not found in hot storage and check_cold_storage is "%s"
+- Use get_cold_trace_details with same trace_id
+- Check historical data for older traces
+
+**Analysis Depth:**
+
+**Trace Structure Analysis**
+- Service call flow and dependencies
+- Span duration breakdown
+- Critical path identification
+- Parallel vs sequential operations
+
+**Performance Investigation**
+- Identify bottleneck spans
+- Database query performance
+- External API call latency
+- Resource wait times
+
+**Error Analysis** (if applicable)
+- Error location and propagation
+- Root cause identification
+- Impact assessment
+
+**Optimization Opportunities**
+- Redundant operations
+- Caching possibilities
+- Parallel processing potential
+- Database query optimization
+
+Provide detailed trace analysis with specific optimization recommendations.`,
traceID, traceID, view, checkColdStorage)
+
+ return &mcp.GetPromptResult{
+ Description: "Deep dive trace analysis",
+ Messages: []mcp.PromptMessage{
+ {
+ Role: mcp.RoleUser,
+ Content: mcp.TextContent{
+ Type: "text",
+ Text: prompt,
+ },
+ },
+ },
+ }, nil
+}
+
+func exploreMetricsHandler(_ context.Context, request mcp.GetPromptRequest)
(*mcp.GetPromptResult, error) {
+ args := request.Params.Arguments
+ pattern := args["pattern"]
+ showExamples := args["show_examples"]
+
+ if pattern == "" {
+ pattern = ".*" // match all metrics
+ }
+
+ // Use the dynamic tool instructions
+ toolInstructions := generateToolInstructions("metrics_exploration")
+
+ prompt := fmt.Sprintf(`Explore available metrics with pattern: "%s".
+
+%s
+
+**Exploration Workflow:**
+
+**Discover Metrics**
+- Use list_mqe_metrics to get all available metrics
+- Filter by pattern if specified
+- Review metric names and types
+
+**Understand Metric Types**
+- For each interesting metric, use get_mqe_metric_type
+- REGULAR_VALUE: Direct arithmetic operations
+- LABELED_VALUE: Requires label selectors
+- SAMPLED_RECORD: Complex record-based metrics
+
+**Usage Examples** (if show_examples is "%s"):
+- REGULAR_VALUE: service_cpm, service_sla * 100
+- LABELED_VALUE: service_percentile{p='50,75,90,95,99'}
+- Complex: avg(service_cpm), top_n(service_resp_time, 10, des)
+
+**Metric Categories:**
+- Service metrics: service_sla, service_cpm, service_resp_time
+- Instance metrics: service_instance_*
+- Endpoint metrics: endpoint_*
+- Relation metrics: service_relation_*
+- Infrastructure metrics: service_cpu, service_memory
+
+**Best Practices:**
+- Check metric type before using in expressions
+- Use appropriate label selectors for LABELED_VALUE
+- Combine metrics for comprehensive analysis
+- Use aggregation functions for trend analysis
+
+Provide a comprehensive guide to available metrics and their usage.`, pattern,
toolInstructions, showExamples)
+
+ return &mcp.GetPromptResult{
+ Description: "Metrics exploration guide",
+ Messages: []mcp.PromptMessage{
+ {
+ Role: mcp.RoleUser,
+ Content: mcp.TextContent{
+ Type: "text",
+ Text: prompt,
+ },
+ },
+ },
+ }, nil
+}
diff --git a/internal/resources/mqe_ai_prompt.md
b/internal/resources/mqe_ai_prompt.md
new file mode 100644
index 0000000..6c6bd1e
--- /dev/null
+++ b/internal/resources/mqe_ai_prompt.md
@@ -0,0 +1,249 @@
+# MQE AI Model Understanding Guide
+
+## Core Understanding Principles
+
+When users describe monitoring requirements in natural language, you need to:
+
+1. **Understand user intent**, not just mechanically match keywords
+2. **Infer implicit information**, such as time ranges, aggregation methods,
etc.
+3. **Handle errors gracefully**, understand spelling mistakes or non-standard
expressions
+4. **Provide explanations**, explain the meaning of generated MQE expressions
+
+## Natural Language Understanding Strategies
+
+### 1. Intent Recognition Patterns
+
+#### 1.1 Query Type Intents
+- "view/show/get/display..." → Simple query
+- "monitor/observe/track/watch..." → Continuous query
+- "analyze/compare/contrast..." → Complex query
+- "alert/alarm/notify..." → Conditional judgment
+
+#### 1.2 Metric Recognition
+Users may express the same metric in various ways:
+
+**Response Time**:
+- response time, latency, delay, time spent, RT
+- → `service_resp_time` or `endpoint_resp_time`
+
+**Success Rate**:
+- success rate, availability, SLA, uptime, health
+- → `service_sla`
+
+**Call Volume**:
+- call volume, request count, traffic, QPS, TPS, throughput
+- → `service_cpm` (needs context-based conversion)
+
+**Error Rate**:
+- error rate, failure rate, exception rate
+- → `100 - (service_sla * 100)` or use status_code metrics
+
+### 2. Time Range Understanding
+
+#### 2.1 Relative Time
+**Past Time (negative duration)**:
+- "recently/past/just now" + number + unit
+ - "recent 5 minutes" → `duration: "-5m"`
+ - "past one hour" → `duration: "-1h"`
+ - "recent one day" → `duration: "-24h"` or `duration: "-1d"`
+
+**Future Time (positive duration)**:
+- "next/future/upcoming/coming" + number + unit
+ - "next 5 minutes" → `duration: "5m"`
+ - "upcoming one hour" → `duration: "1h"`
+ - "next day" → `duration: "24h"` or `duration: "1d"`
+
+#### 2.2 Fuzzy Time
+**Past-oriented (negative)**:
+- "just now" → `duration: "-5m"` (defaults to 5 minutes)
+- "recently" → `duration: "-1h"` (defaults to 1 hour)
+
+**Future-oriented (positive)**:
+- "soon" → `duration: "5m"` (defaults to next 5 minutes)
+- "later" → `duration: "1h"` (defaults to next 1 hour)
+
+#### 2.3 Absolute Time
+- "from...to..." → use start and end
+- "January 1, 2025, 10 am to 11 am" → convert to standard format
+
+### 3. Inference of Aggregation Methods
+
+#### 3.1 Default Aggregation
+- Query response time → usually use `avg()`
+- Query error count → usually use `sum()`
+- Query peak value → use `max()`
+- Query minimum value → use `min()`
+
+#### 3.2 Explicit Aggregation
+- "average response time" → `avg(service_resp_time)`
+- "maximum latency" → `max(service_resp_time)`
+- "total call volume" → `sum(service_cpm)`
+- "median response time" → `service_percentile{p='50'}`
+
+### 4. Condition Understanding
+
+#### 4.1 Threshold Judgment
+- "exceeds/greater than/higher than" → `>`
+- "less than/lower than" → `<`
+- "reaches/equal to" → `>=` or `==`
+- "between...and..." → combine `>` and `<`
+
+#### 4.2 Unit Conversion
+- "exceeds 1 second" → `> 1000` (convert to milliseconds)
+- "less than 90%" → `< 90` (after converting SLA: `service_sla * 100 < 90`)
+- "100 times per second" → convert to CPM: `> 6000`
+
+### 5. Complex Query Patterns
+
+#### 5.1 Multi-Condition Combination
+User: "response time exceeds 3 seconds and low call volume"
+- Identify two conditions
+- "low" requires inferred threshold, such as `< 100`
+- Generate: `service_resp_time > 3000 && service_cpm < 100`
+
+#### 5.2 Percentile Query
+User: "check P95 and P99 response time"
+- Identify percentile requirement
+- Generate: `service_percentile{p='95,99'}`
+
+#### 5.3 Sorting Requirement
+User: "slowest 10 services"
+- Identify sorting and quantity
+- Generate: `top_n(service_resp_time, 10, des)`
+
+### 6. Error Tolerance Handling
+
+#### 6.1 Spelling Errors
+- "sevice" → "service"
+- "responce" → "response"
+- "avrage" → "average"
+
+#### 6.2 Abbreviation Understanding
+- "RT" → Response Time
+- "QPS" → Queries Per Second (convert to CPM)
+- "P95" → 95th percentile
+
+#### 6.3 Colloquial Expressions
+- "this service is slow" → query response time
+- "traffic suddenly increased" → use increase() function
+- "higher than usual" → use baseline() function
+
+### 7. Context Understanding
+
+#### 7.1 Service Recognition
+- If the user mentions a specific service name, extract and use it
+- If not specified, may need to:
+ - Query aggregate data for all services
+ - Use top_n to find related services
+
+#### 7.2 Level Understanding
+- "database" → `layer: "DATABASE"`
+- "gateway" → possibly a specific service name
+- "microservice" → `layer: "GENERAL"`
+
+### 8. Response Generation Strategy
+
+#### 8.1 Explaining Generated Expressions
+```
+User: "check the recent average response time for service A"
+Response: "I will query the average response time for service A in the last
hour:"
+Expression: avg(service_resp_time)
+Parameters: service_name="A", duration="-1h"
+```
+
+#### 8.2 Providing Alternatives
+```
+User: "service is slow"
+Response: "I understand you want to check service performance issues. Here are
several query options:
+1. Check average response time: avg(service_resp_time)
+2. Check response time percentiles: service_percentile{p='50,75,90,95,99'}
+3. Check slowest services: top_n(service_resp_time, 10, des)"
+```
+
+#### 8.3 Suggesting Optimizations
+```
+User: "requests per second"
+Response: "service_cpm returns calls per minute, I'll help you convert to per
second:"
+Expression: service_cpm / 60
+```
+
+### 9. Common Scenario Templates
+
+#### 9.1 Performance Analysis
+- "service performance analysis" → combine multiple metrics
+ ```
+ Response time: avg(service_resp_time)
+ Call volume: service_cpm
+ Success rate: service_sla * 100
+ ```
+
+#### 9.2 Alert Rules
+- "response time alert" → generate comparison expression
+ ```
+ avg(service_resp_time) > 1000
+ ```
+
+#### 9.3 Trend Analysis
+- "traffic trend" → use trend functions
+ ```
+ rate(service_cpm, 5) # rate of change over 5 minutes
+ ```
+
+### 10. Error Handling Tips
+
+When unable to understand user intent:
+1. Ask for specifics: "Do you want to check response time or call volume?"
+2. Provide examples: "For example: check average response time for service A"
+3. List available options: "Available metrics include: response time, call
volume, success rate, etc."
+
+## Practical Examples
+
+### Example 1: Fuzzy Query
+User: "service seems to have issues"
+Analysis:
+- Intent: diagnose problems
+- Suggest querying multiple key metrics
+
+Response:
+```
+Let me check the key service metrics:
+1. Success rate: service_sla * 100
+2. Response time: avg(service_resp_time)
+3. Call volume: service_cpm
+These metrics can help identify the issue.
+```
+
+### Example 2: Complex Conditions
+User: "find services with slow response but high traffic"
+Analysis:
+- Needs to satisfy two conditions
+- "slow" and "high" need threshold definitions
+
+Response:
+```
+expression: "service_resp_time > 1000 && service_cpm > 1000"
+Explanation: Find services with response time over 1 second and over 1000
calls per minute
+You can also use top_n to find such services.
+```
+
+### Example 3: Time Comparison
+User: "compare today's traffic with yesterday"
+Analysis:
+- Needs two time period queries
+- May need to calculate difference or ratio
+
+Response:
+```
+Need to execute two queries:
+1. Today: start="today 00:00", end="now", expression="avg(service_cpm)"
+2. Yesterday: start="yesterday 00:00", end="yesterday 24:00",
expression="avg(service_cpm)"
+Then compare the results.
+```
+
+## Best Practices
+
+1. **Proactive Inference**: Users may not know specific metric names, infer
from descriptions
+2. **Provide Context**: Explain why choosing a certain expression
+3. **Flexible Handling**: Same requirement may have multiple implementations
+4. **Educate Users**: Include MQE knowledge in responses to help users learn
+5. **Safe Defaults**: When uncertain, choose more conservative query approaches
diff --git a/internal/resources/mqe_detailed_syntax.md
b/internal/resources/mqe_detailed_syntax.md
new file mode 100644
index 0000000..ccb38df
--- /dev/null
+++ b/internal/resources/mqe_detailed_syntax.md
@@ -0,0 +1,308 @@
+# MQE (Metrics Query Expression) Detailed Syntax Rules
+
+## 1. Syntax Structure Overview
+
+The basic structure of MQE expressions follows these BNF grammar rules:
+
+```bnf
+expression ::= metric_expression | calculation_expression |
comparison_expression | function_expression
+
+metric_expression ::= metric_name | metric_name '{' label_selector '}'
+
+label_selector ::= label '=' '"' value_list '"' (',' label '=' '"' value_list
'"')*
+
+value_list ::= value (',' value)*
+
+calculation_expression ::= expression operator expression | '(' expression ')'
+
+operator ::= '+' | '-' | '*' | '/' | '%'
+
+comparison_expression ::= expression comparison_operator expression
+
+comparison_operator ::= '>' | '>=' | '<' | '<=' | '==' | '!='
+
+logical_expression ::= expression logical_operator expression
+
+logical_operator ::= '&&' | '||'
+
+function_expression ::= function_name '(' argument_list ')'
+
+argument_list ::= expression (',' expression)*
+```
+
+## 2. Core Components Details
+
+### 2.1 Metric Names
+- **Rules**: Must be valid metric names, typically lowercase letters separated
by underscores
+- **Pattern**: `[a-z][a-z0-9_]*`
+- **Examples**: `service_sla`, `service_cpm`, `endpoint_resp_time`
+
+### 2.2 Label Selectors
+- **Syntax**: `{label='value1,value2,...'}`
+- **Multiple labels**: `{label1='value1', label2='value2'}`
+- **Special case**: Percentile labels `{p='50,75,90,95,99'}`
+
+### 2.3 Operator Precedence (highest to lowest)
+1. Parentheses `()`
+2. Function calls
+3. Multiplication, division, modulo `*`, `/`, `%`
+4. Addition, subtraction `+`, `-`
+5. Comparison operators `>`, `>=`, `<`, `<=`, `==`, `!=`
+6. Logical AND `&&`
+7. Logical OR `||`
+
+## 3. Function Categories and Syntax
+
+### 3.1 Aggregation Functions
+```
+avg(expression) -> single_value
+sum(expression) -> single_value
+max(expression) -> single_value
+min(expression) -> single_value
+count(expression) -> single_value
+latest(expression) -> single_value
+```
+
+### 3.2 Mathematical Functions
+```
+abs(expression) -> same_type_as_input
+ceil(expression) -> same_type_as_input
+floor(expression) -> same_type_as_input
+round(expression, decimal_places) -> same_type_as_input
+```
+
+### 3.3 Sorting and Selection Functions
+```
+top_n(metric_name, top_number, order, attrs) -> sorted_list/record_list
+ - top_number: positive integer
+ - order: 'asc' | 'des'
+ - attrs: optional, e.g., attr0='value', attr1='value'
+
+top_n_of(top_n_expr1, top_n_expr2, ..., top_number, order) -> merged_top_n
+ - top_n_expr: top_n expressions
+ - top_number: positive integer
+ - order: 'asc' | 'des'
+
+sort_values(expression, limit, order) -> follows_input_type
+ - limit: optional positive integer
+ - order: 'asc' | 'des'
+
+sort_label_values(expression, order, label_names...) -> follows_input_type
+ - order: 'asc' | 'des'
+ - label_names: at least one label name
+```
+
+### 3.4 Trend Analysis Functions
+```
+increase(expression, time_range) -> time_series_values
+rate(expression, time_range) -> time_series_values
+ - time_range: positive integer, unit aligns with query Step
+```
+
+### 3.5 Label Operations
+```
+relabel(expression, target_label='origin_values', new_label='new_values') ->
follows_input
+aggregate_labels(expression, aggregation_method(label_names...)) ->
time_series_values
+ - aggregation_method: sum | avg | max | min
+ - label_names: optional, if not specified, all labels will be aggregated
+```
+
+### 3.6 Logical Functions
+```
+view_as_seq([expression1, expression2, ...]) -> follows_selected_expression
+is_present([expression1, expression2, ...]) -> single_value (0 or 1)
+```
+
+### 3.7 Baseline Functions
+```
+baseline(expression, baseline_type) -> time_series_values
+ - baseline_type: 'value' | 'upper' | 'lower'
+```
+
+## 4. Expression Types and Return Values
+
+### 4.1 Return Value Types
+- **SINGLE_VALUE**: Single value, like `avg(service_cpm)`
+- **TIME_SERIES_VALUES**: Time series data with timestamps
+- **SORTED_LIST**: Sorted list of values, like `top_n()`
+- **RECORD_LIST**: List of records
+- **LABELED_VALUE**: Values with labels, such as percentiles
+
+### 4.2 Type Conversion Rules
+- Numeric values can be used directly in arithmetic operations
+- Boolean values (comparison results) convert to 0 or 1
+- Label values must be accessed through label selectors
+
+## 5. Entity Filtering Rules
+
+### 5.1 Service-Level Filtering
+```
+expression + entity{serviceName='name', layer='GENERAL', normal=true}
+```
+
+### 5.2 Instance-Level Filtering
+```
+expression + entity{serviceName='name', serviceInstanceName='instance'}
+```
+
+### 5.3 Endpoint-Level Filtering
+```
+expression + entity{serviceName='name', endpointName='endpoint'}
+```
+
+### 5.4 Relation Query Filtering
+```
+expression + entity{
+ serviceName='source',
+ destServiceName='destination',
+ layer='GENERAL',
+ destLayer='DATABASE'
+}
+```
+
+## 6. Common Patterns and Best Practices
+
+### 6.1 Percentage Conversion
+```
+# Convert SLA from decimal to percentage
+service_sla * 100
+
+# Convert CPM to RPS
+service_cpm / 60
+```
+
+### 6.2 Condition Combinations
+```
+# High latency with low traffic
+service_resp_time > 3000 && service_cpm < 100
+
+# Multiple percentiles exceed threshold
+sum(service_percentile{p='50,75,90,95,99'} > 1000) >= 3
+```
+
+### 6.3 Trend Monitoring
+```
+# Response time growth rate
+rate(service_resp_time, 5)
+
+# Increase over the past 2 minutes
+increase(service_cpm, 2)
+```
+
+### 6.4 Aggregation Calculations
+```
+# Average response time (convert milliseconds to seconds)
+avg(service_resp_time) / 1000
+
+# Error rate statistics
+sum(aggregate_labels(meter_status_code{status='4xx,5xx'}, sum))
+```
+
+## 7. Syntax Validation Rules
+
+### 7.1 Required Conditions
+1. Metric names must exist in the system
+2. Label names must match the metric type
+3. Function parameter count must be correct
+4. Types on both sides of operators must be compatible
+
+### 7.2 Common Error Patterns
+```
+# Error: Missing label selector
+service_percentile # Should be service_percentile{p='50'}
+
+# Error: Invalid aggregation
+avg(service_percentile{p='50,75,90'}) # Cannot average multiple values
directly
+
+# Error: Type mismatch
+"string" + 123 # Cannot add string and number
+```
+
+## 8. Parsing Precedence Example
+
+```
+# Expression: avg(service_cpm) * 60 > 1000 && service_sla < 0.95
+# Parsing order:
+1. avg(service_cpm) # Function call
+2. ... * 60 # Multiplication
+3. ... > 1000 # Comparison
+4. service_sla < 0.95 # Comparison
+5. ... && ... # Logical AND
+```
+
+## 9. Advanced Usage
+
+### 9.1 Nested Functions
+```
+round(avg(service_resp_time) / 1000, 2) # Average response time in seconds,
rounded to 2 decimal places
+```
+
+### 9.2 Conditional Aggregation
+```
+sum((service_sla * 100) < 95) # Count time buckets where SLA is below 95%
+```
+
+### 9.3 Dynamic Labels
+```
+relabel(
+ service_percentile{p='50,75,90,95,99'},
+ p='50,75,90,95,99',
+ percentile='P50,P75,P90,P95,P99'
+)
+```
+
+## 10. Natural Language to MQE Mapping Rules
+
+### 10.1 Keyword Mappings
+- "average" → `avg()`
+- "maximum" → `max()`
+- "minimum" → `min()`
+- "total", "sum" → `sum()`
+- "count" → `count()`
+- "latest" → `latest()`
+- "top N" → `top_n(..., N, des)`
+- "bottom N" → `top_n(..., N, asc)`
+- "percentage" → `* 100`
+- "per second" → `/ 60` (for CPM)
+- "increase" → `increase()`
+- "rate" → `rate()`
+
+### 10.2 Condition Mappings
+- "greater than", "more than" → `>`
+- "less than", "below" → `<`
+- "equals", "is" → `==`
+- "not equal", "is not" → `!=`
+- "and" → `&&`
+- "or" → `||`
+
+### 10.3 Time Range Mappings
+- "last hour", "past hour" (past) → `duration: "-1h"`
+- "last 30 minutes" (past) → `duration: "-30m"`
+- "next 30 minutes" (future) → `duration: "30m"`
+
+## 11. Model Understanding Guidelines
+
+When processing natural language queries from users, the model should:
+
+1. **Identify Intent**: Is it querying a single value, time series, sorted
list, or comparison?
+2. **Extract Entities**: Service names, instance names, endpoint names, etc.
+3. **Identify Metrics**: Response time, success rate, call volume, etc.
+4. **Determine Operations**: Aggregation, calculation, comparison, sorting,
etc.
+5. **Build Expression**: Combine components according to syntax rules
+
+### Example Conversion Process
+User input: "Show the average response time for service A in the last hour,
alert if it exceeds 1 second"
+
+Analysis steps:
+1. Entity: Service A → `service_name: "A"`
+2. Metric: Response time → `service_resp_time`
+3. Operation: Average → `avg()`
+4. Time: Last hour → `duration: "-1h"`
+5. Condition: Exceeds 1 second → `> 1000` (convert to milliseconds)
+
+Final expression:
+```
+expression: "avg(service_resp_time) > 1000"
+service_name: "A"
+duration: "-1h"
+```
diff --git a/internal/resources/mqe_docs.go b/internal/resources/mqe_docs.go
new file mode 100644
index 0000000..3d9d45b
--- /dev/null
+++ b/internal/resources/mqe_docs.go
@@ -0,0 +1,126 @@
+// Licensed to Apache Software Foundation (ASF) under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Apache Software Foundation (ASF) licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package resources
+
+import (
+ "context"
+ _ "embed"
+ "encoding/json"
+ "fmt"
+
+ "github.com/mark3labs/mcp-go/mcp"
+ "github.com/mark3labs/mcp-go/server"
+
+ "github.com/apache/skywalking-mcp/internal/tools"
+)
+
+// Embed MQE documentation and examples
+
+//go:embed mqe_examples.json
+var mqeExamples string
+
+//go:embed mqe_detailed_syntax.md
+var mqeDetailedSyntaxDoc string
+
+//go:embed mqe_ai_prompt.md
+var mqeAIPromptDoc string
+
+// AddMQEResources registers MQE-related resources with the MCP server
+func AddMQEResources(s *server.MCPServer) {
+ // Add detailed MQE syntax documentation as a resource
+ s.AddResource(mcp.Resource{
+ URI: "mqe://docs/syntax",
+ Name: "MQE Detailed Syntax Rules",
+ Description: "Comprehensive syntax rules and grammar for MQE
expressions",
+ MIMEType: "text/markdown",
+ }, func(_ context.Context, _ mcp.ReadResourceRequest)
([]mcp.ResourceContents, error) {
+ return []mcp.ResourceContents{
+ mcp.TextResourceContents{
+ URI: "mqe://docs/syntax",
+ MIMEType: "text/markdown",
+ Text: mqeDetailedSyntaxDoc,
+ },
+ }, nil
+ })
+
+ // Add MQE examples as a resource
+ s.AddResource(mcp.Resource{
+ URI: "mqe://docs/examples",
+ Name: "MQE Examples",
+ Description: "Common MQE expression examples with natural
language descriptions",
+ MIMEType: "application/json",
+ }, func(_ context.Context, _ mcp.ReadResourceRequest)
([]mcp.ResourceContents, error) {
+ return []mcp.ResourceContents{
+ mcp.TextResourceContents{
+ URI: "mqe://docs/examples",
+ MIMEType: "application/json",
+ Text: mqeExamples,
+ },
+ }, nil
+ })
+
+ // Add a dynamic resource that lists available metrics
+ s.AddResource(mcp.Resource{
+ URI: "mqe://metrics/available",
+ Name: "Available Metrics",
+ Description: "List of all available metrics in the current
SkyWalking instance",
+ MIMEType: "application/json",
+ }, func(ctx context.Context, _ mcp.ReadResourceRequest)
([]mcp.ResourceContents, error) {
+ // Call list_mqe_metrics to get real-time data
+ resp, err := tools.ListMQEMetricsInternal(ctx, nil)
+ if err != nil {
+ return nil, fmt.Errorf("failed to list metrics: %w",
err)
+ }
+
+ // Format the response
+ var formattedData interface{}
+ if unmarshalErr := json.Unmarshal([]byte(resp),
&formattedData); unmarshalErr != nil {
+ return nil, fmt.Errorf("failed to parse metrics data:
%w", unmarshalErr)
+ }
+
+ // Convert back to JSON with proper formatting
+ formattedJSON, err := json.MarshalIndent(formattedData, "", "
")
+ if err != nil {
+ return nil, fmt.Errorf("failed to format metrics data:
%w", err)
+ }
+
+ return []mcp.ResourceContents{
+ mcp.TextResourceContents{
+ URI: "mqe://metrics/available",
+ MIMEType: "application/json",
+ Text: string(formattedJSON),
+ },
+ }, nil
+ })
+
+ // Add AI understanding guide as a resource
+ s.AddResource(mcp.Resource{
+ URI: "mqe://docs/ai_prompt",
+ Name: "MQE AI Understanding Guide",
+ Description: "Guide for AI models to understand natural
language queries and convert to MQE",
+ MIMEType: "text/markdown",
+ }, func(_ context.Context, _ mcp.ReadResourceRequest)
([]mcp.ResourceContents, error) {
+ return []mcp.ResourceContents{
+ mcp.TextResourceContents{
+ URI: "mqe://docs/ai_prompt",
+ MIMEType: "text/markdown",
+ Text: mqeAIPromptDoc,
+ },
+ }, nil
+ })
+}
diff --git a/internal/resources/mqe_examples.json
b/internal/resources/mqe_examples.json
new file mode 100644
index 0000000..80e4018
--- /dev/null
+++ b/internal/resources/mqe_examples.json
@@ -0,0 +1,136 @@
+{
+ "examples": [
+ {
+ "description": "Query service response time",
+ "natural_language": "What is the average response time for service
'frontend'?",
+ "mqe_expression": "avg(service_resp_time)",
+ "notes": "Returns average response time in milliseconds"
+ },
+ {
+ "description": "Service SLA percentage",
+ "natural_language": "Show me the success rate for service 'backend' as a
percentage",
+ "mqe_expression": "service_sla * 100",
+ "notes": "Converts SLA from decimal (0-1) to percentage (0-100)"
+ },
+ {
+ "description": "Response time percentiles",
+ "natural_language": "Get the P50, P75, P90, P95, and P99 response times",
+ "mqe_expression": "service_percentile{p='50,75,90,95,99'}",
+ "notes": "Returns multiple percentile values with labels"
+ },
+ {
+ "description": "Rename percentile labels",
+ "natural_language": "Get response time percentiles with custom labels",
+ "mqe_expression": "relabel(service_percentile{p='50,75,90,95,99'},
p='50,75,90,95,99', percentile='P50,P75,P90,P95,P99')",
+ "notes": "Relabels the percentile values for better readability"
+ },
+ {
+ "description": "Calls per minute above threshold",
+ "natural_language": "Is the service receiving more than 1000 calls per
minute?",
+ "mqe_expression": "service_cpm > 1000",
+ "notes": "Returns 1 if true, 0 if false"
+ },
+ {
+ "description": "Service response time alert condition",
+ "natural_language": "Alert when average response time is over 1 second",
+ "mqe_expression": "avg(service_resp_time) > 1000",
+ "notes": "Used in alarm rules, checks if average response time exceeds
1000ms"
+ },
+ {
+ "description": "Service SLA below threshold alert",
+ "natural_language": "Alert when service success rate drops below 85%",
+ "mqe_expression": "sum((service_sla * 100) < 85) >= 4",
+ "notes": "Triggers when SLA is below 85% for 4 or more time buckets"
+ },
+ {
+ "description": "Multiple percentiles above threshold",
+ "natural_language": "Alert when multiple percentiles exceed 1 second",
+ "mqe_expression": "sum(service_percentile{p='50,75,90,95,99'} > 1000) >=
3",
+ "notes": "Triggers when 3 or more percentiles exceed 1000ms"
+ },
+ {
+ "description": "Combined SLA and response time condition",
+ "natural_language": "Alert when both SLA is low AND response time is
high",
+ "mqe_expression": "(avg(service_sla * 100) < 80) &&
(avg(service_percentile{p='50'}) > 1000)",
+ "notes": "Complex condition using boolean AND operator"
+ },
+ {
+ "description": "Top 10 services by CPM",
+ "natural_language": "Which are the top 10 busiest services by calls per
minute?",
+ "mqe_expression": "top_n(service_cpm, 10, des)",
+ "notes": "Returns sorted list of top 10 services"
+ },
+ {
+ "description": "Top 10 instances of a service",
+ "natural_language": "Show the top 10 instances by CPM for a specific
service",
+ "mqe_expression": "top_n(service_instance_cpm, 10, des)",
+ "notes": "Use with entity.serviceName to get instances of that service"
+ },
+ {
+ "description": "Filter top services by layer",
+ "natural_language": "Get top 10 general services by CPM",
+ "mqe_expression": "top_n(service_cpm, 10, des, attr0='GENERAL')",
+ "notes": "Filters results by service layer attribute"
+ },
+ {
+ "description": "HTTP error rate monitoring",
+ "natural_language": "Monitor 4xx and 5xx error rates",
+ "mqe_expression":
"sum(aggregate_labels(meter_status_code{status='4xx,5xx'},sum) > 10) > 3",
+ "notes": "Aggregates error status codes and alerts on high error count"
+ },
+ {
+ "description": "Instance response time with regex filter",
+ "natural_language": "Monitor instances matching a pattern",
+ "mqe_expression": "avg(service_instance_resp_time)",
+ "notes": "Use with include-names-regex in alarm rules to filter
instances"
+ },
+ {
+ "description": "Endpoint success rate monitoring",
+ "natural_language": "Alert when endpoint success rate is below 75%",
+ "mqe_expression": "sum((endpoint_sla * 100) < 75) >= 3",
+ "notes": "Monitors specific endpoint performance"
+ },
+ {
+ "description": "Service CPM in requests per second",
+ "natural_language": "Convert calls per minute to requests per second",
+ "mqe_expression": "round(service_cpm / 60, 2)",
+ "notes": "Converts CPM to RPS with 2 decimal places"
+ },
+ {
+ "description": "Service response time trend",
+ "natural_language": "Show the rate of change in response time",
+ "mqe_expression": "rate(service_resp_time, 5)",
+ "notes": "Calculates per-second rate of change over 5-minute window"
+ },
+ {
+ "description": "Response time increase over period",
+ "natural_language": "How much has response time increased in the last 2
minutes?",
+ "mqe_expression": "increase(service_resp_time, 2)",
+ "notes": "Shows absolute increase over time range"
+ },
+ {
+ "description": "Sort service response times",
+ "natural_language": "Get top 10 slowest response times",
+ "mqe_expression": "sort_values(service_resp_time, 10, des)",
+ "notes": "Sorts values in descending order, returns top 10"
+ },
+ {
+ "description": "First available metric fallback",
+ "natural_language": "Use service CPM or fallback to instance CPM",
+ "mqe_expression": "view_as_seq([service_cpm, service_instance_cpm])",
+ "notes": "Returns first non-null metric in sequence"
+ },
+ {
+ "description": "Check if metrics exist",
+ "natural_language": "Check if any monitoring data is available",
+ "mqe_expression": "is_present([service_cpm, service_resp_time,
service_sla])",
+ "notes": "Returns 1 if any metric has values, 0 otherwise"
+ },
+ {
+ "description": "Baseline comparison",
+ "natural_language": "Compare response time to predicted baseline",
+ "mqe_expression": "sum(service_resp_time > baseline(service_resp_time,
upper)) > 3",
+ "notes": "Alert when response time exceeds upper baseline prediction"
+ }
+ ]
+}
diff --git a/internal/swmcp/server.go b/internal/swmcp/server.go
index 57e2694..da406fa 100644
--- a/internal/swmcp/server.go
+++ b/internal/swmcp/server.go
@@ -22,7 +22,6 @@ import (
"fmt"
"net/http"
"os"
- "strings"
"github.com/mark3labs/mcp-go/server"
"github.com/sirupsen/logrus"
@@ -30,6 +29,8 @@ import (
"github.com/apache/skywalking-cli/pkg/contextkey"
"github.com/apache/skywalking-mcp/internal/config"
+ "github.com/apache/skywalking-mcp/internal/prompts"
+ "github.com/apache/skywalking-mcp/internal/resources"
"github.com/apache/skywalking-mcp/internal/tools"
)
@@ -40,11 +41,20 @@ func newMcpServer() *server.MCPServer {
"skywalking-mcp",
"0.1.0",
server.WithResourceCapabilities(true, true),
+ server.WithPromptCapabilities(true),
server.WithLogging())
+ // add tools and capabilities to the MCP server
tools.AddTraceTools(mcpServer)
tools.AddMetricsTools(mcpServer)
tools.AddLogTools(mcpServer)
+ tools.AddMQETools(mcpServer)
+
+ // add MQE documentation resources
+ resources.AddMQEResources(mcpServer)
+
+ // add prompts for guided interactions
+ prompts.AddSkyWalkingPrompts(mcpServer)
return mcpServer
}
@@ -79,21 +89,13 @@ const (
skywalkingURLEnvVar = "SW_URL"
)
-// finalizeURL ensures the URL ends with "/graphql".
-func finalizeURL(urlStr string) string {
- if !strings.HasSuffix(urlStr, "/graphql") {
- urlStr = strings.TrimRight(urlStr, "/") + "/graphql"
- }
- return urlStr
-}
-
// urlAndInsecureFromEnv extracts URL and insecure flag purely from
environment variables.
func urlAndInsecureFromEnv() (string, bool) {
urlStr := os.Getenv(skywalkingURLEnvVar)
if urlStr == "" {
urlStr = config.DefaultSWURL
}
- return finalizeURL(urlStr), false
+ return tools.FinalizeURL(urlStr), false
}
// urlAndInsecureFromHeaders extracts URL and insecure flag for a request.
@@ -108,7 +110,7 @@ func urlAndInsecureFromHeaders(req *http.Request) (string,
bool) {
}
}
- return finalizeURL(urlStr), false
+ return tools.FinalizeURL(urlStr), false
}
// WithSkyWalkingContextFromEnv injects the SkyWalking URL and insecure
diff --git a/internal/tools/common.go b/internal/tools/common.go
index 08475f9..970a27d 100644
--- a/internal/tools/common.go
+++ b/internal/tools/common.go
@@ -19,6 +19,7 @@ package tools
import (
"fmt"
+ "strings"
"time"
api "skywalking.apache.org/repo/goapi/query"
@@ -28,8 +29,8 @@ import (
const (
DefaultPageSize = 15
DefaultPageNum = 1
- DefaultStep = "MINUTE"
DefaultDuration = 30 // minutes
+ nowKeyword = "now"
)
// Error messages
@@ -38,6 +39,14 @@ const (
ErrMarshalFailed = "failed to marshal result: %v"
)
// FinalizeURL ensures the URL ends with "/graphql".
//
// Trailing slashes are stripped before the suffix check so that inputs
// such as "http://host", "http://host/", "http://host/graphql" and
// "http://host/graphql/" all normalize to "http://host/graphql".
// (Checking the suffix before trimming, as the previous version did,
// appended a second "/graphql" segment when the input already ended
// with "/graphql/".)
func FinalizeURL(urlStr string) string {
	urlStr = strings.TrimRight(urlStr, "/")
	if !strings.HasSuffix(urlStr, "/graphql") {
		urlStr += "/graphql"
	}
	return urlStr
}
+
// FormatTimeByStep formats time according to step granularity
func FormatTimeByStep(t time.Time, step api.Step) string {
switch step {
@@ -69,7 +78,8 @@ func ParseDuration(durationStr string, coldStage bool)
api.Duration {
startTime = now
endTime = now.Add(duration)
}
- step = determineStep(duration)
+ // Use adaptive step based on time range
+ step = determineAdaptiveStep(startTime, endTime)
} else {
startTime, endTime, step = parseLegacyDuration(durationStr)
}
@@ -104,12 +114,17 @@ func BuildPagination(pageNum, pageSize int)
*api.Pagination {
func BuildDuration(start, end, step string, cold bool, defaultDurationMinutes
int) api.Duration {
if start != "" || end != "" {
stepEnum := api.Step(step)
+ // Parse and format start and end times
+ startTime, endTime := parseStartEndTimes(start, end)
+
+ // If step is not provided or invalid, determine it adaptively
based on time range
if step == "" || !stepEnum.IsValid() {
- stepEnum = DefaultStep
+ stepEnum = determineAdaptiveStep(startTime, endTime)
}
+
return api.Duration{
- Start: start,
- End: end,
+ Start: FormatTimeByStep(startTime, stepEnum),
+ End: FormatTimeByStep(endTime, stepEnum),
Step: stepEnum,
ColdStage: &cold,
}
@@ -122,15 +137,17 @@ func BuildDuration(start, end, step string, cold bool,
defaultDurationMinutes in
return ParseDuration(defaultDurationStr, cold)
}
-// determineStep determines the step based on the duration
-func determineStep(duration time.Duration) api.Step {
- if duration >= 24*time.Hour {
+// determineAdaptiveStep determines the adaptive step based on the time range
+func determineAdaptiveStep(startTime, endTime time.Time) api.Step {
+ duration := endTime.Sub(startTime)
+ if duration >= 7*24*time.Hour {
return api.StepDay
- } else if duration >= time.Hour {
+ } else if duration >= 24*time.Hour {
return api.StepHour
- } else if duration >= time.Minute {
+ } else if duration >= time.Hour {
return api.StepMinute
}
+
return api.StepSecond
}
@@ -168,3 +185,59 @@ func parseLegacyDuration(durationStr string) (startTime,
endTime time.Time, step
step = api.StepDay
return startTime, endTime, step
}
+
// parseAbsoluteTime attempts to interpret timeStr as an absolute
// timestamp, trying the supported layouts in order from most to least
// specific. It returns the parsed time and true on success, or the
// zero time and false when no layout matches.
func parseAbsoluteTime(timeStr string) (time.Time, bool) {
	for _, layout := range []string{
		"2006-01-02 15:04:05",
		"2006-01-02 15:04",
		"2006-01-02 1504",
		"2006-01-02 15",
		"2006-01-02 150405",
		"2006-01-02",
	} {
		if parsed, err := time.Parse(layout, timeStr); err == nil {
			return parsed, true
		}
	}
	var zero time.Time
	return zero, false
}
+
+// parseTimeString parses a time string (start or end)
+func parseTimeString(timeStr string, defaultTime time.Time) time.Time {
+ now := time.Now()
+
+ if timeStr == "" {
+ return defaultTime
+ }
+
+ if strings.EqualFold(timeStr, nowKeyword) {
+ return now
+ }
+
+ // Try relative time like "-30m", "-1h"
+ if duration, err := time.ParseDuration(timeStr); err == nil {
+ return now.Add(duration)
+ }
+
+ // Try absolute time
+ if parsed, ok := parseAbsoluteTime(timeStr); ok {
+ return parsed
+ }
+
+ return defaultTime
+}
+
+// parseStartEndTimes parses start and end time strings
+func parseStartEndTimes(start, end string) (startTime, endTime time.Time) {
+ now := time.Now()
+ defaultStart := now.Add(-30 * time.Minute) // Default to 30 minutes ago
+
+ startTime = parseTimeString(start, defaultStart)
+ endTime = parseTimeString(end, now)
+
+ return startTime, endTime
+}
diff --git a/internal/tools/log.go b/internal/tools/log.go
index 9a027db..1d5e2a6 100644
--- a/internal/tools/log.go
+++ b/internal/tools/log.go
@@ -119,7 +119,10 @@ Examples:
mcp.WithArray("tags", mcp.Description("Array of log tags, each with key
and value.")),
mcp.WithString("start", mcp.Description("Start time for the query.")),
mcp.WithString("end", mcp.Description("End time for the query.")),
- mcp.WithString("step", mcp.Enum("SECOND", "MINUTE", "HOUR", "DAY"),
mcp.Description("Time step granularity.")),
+ mcp.WithString("step", mcp.Enum("SECOND", "MINUTE", "HOUR", "DAY"),
+ mcp.Description("Time step granularity: SECOND, MINUTE, HOUR,
DAY. "+
+ "If not specified, uses adaptive step sizing: "+
+ "SECOND (<1h), MINUTE (1h-24h), HOUR (1d-7d), DAY
(>7d)")),
mcp.WithBoolean("cold", mcp.Description("Whether to query from
cold-stage storage.")),
mcp.WithNumber("page_num", mcp.Description("Page number, default 1.")),
mcp.WithNumber("page_size", mcp.Description("Page size, default 15.")),
diff --git a/internal/tools/metric.go b/internal/tools/metric.go
index 27017f2..e5e725f 100644
--- a/internal/tools/metric.go
+++ b/internal/tools/metric.go
@@ -309,9 +309,9 @@ Time Format:
- Step: "SECOND", "MINUTE", "HOUR", "DAY"
Examples:
-- {"metrics_name": "service_cpm", "service_name": "business-zone::projectC",
"duration": "1h"}: Get calls per minute for a service in the last hour
+- {"metrics_name": "service_cpm", "service_name": "business-zone::projectC",
"duration": "-1h"}: Get calls per minute for a service in the past hour
- {"metrics_name": "endpoint_cpm", "service_name": "business-zone::projectC",
- "endpoint_name": "/projectC/{value}", "duration": "30m"}: Get calls per
minute for a specific endpoint in the last 30 minutes
+ "endpoint_name": "/projectC/{value}", "duration": "-30m"}: Get calls per
minute for a specific endpoint in the past 30 minutes
- {"metrics_name": "service_resp_time", "service_name": "web-service",
"start": "-1h", "end": "now", "step": "MINUTE"}: Get service response time
with custom time range
- {"metrics_name": "service_apdex", "service_name": "api-gateway", "cold":
true}: Get Apdex score from cold storage`,
@@ -359,7 +359,9 @@ service_instance_sla, service_cpm, service_resp_time,
service_apdex`),
mcp.Description("Destination process name for relationship
metrics. Use this for process relation scopes."),
),
mcp.WithString("duration",
- mcp.Description("Time duration for the query. Examples: \"1h\"
(last 1 hour), \"30m\" (last 30 minutes), \"7d\" (last 7 days)"),
+ mcp.Description("Time duration for the query relative to
current time. "+
+ "Negative values query the past: \"-1h\" (past 1 hour),
\"-30m\" (past 30 minutes), \"-7d\" (past 7 days). "+
+ "Positive values query the future: \"1h\" (next 1
hour), \"24h\" (next 24 hours)"),
),
mcp.WithString("start",
mcp.Description("Start time for the query. Examples:
\"2023-01-01 12:00:00\", \"-1h\" (1 hour ago), \"-30m\" (30 minutes ago)"),
@@ -370,10 +372,12 @@ service_instance_sla, service_cpm, service_resp_time,
service_apdex`),
mcp.WithString("step",
mcp.Enum("SECOND", "MINUTE", "HOUR", "DAY"),
mcp.Description(`Time step between start time and end time:
-- 'SECOND': Second-level granularity
-- 'MINUTE': Minute-level granularity (default)
-- 'HOUR': Hour-level granularity
-- 'DAY': Day-level granularity`),
+- 'SECOND': Second-level granularity
+- 'MINUTE': Minute-level granularity
+- 'HOUR': Hour-level granularity
+- 'DAY': Day-level granularity
+If not specified, uses adaptive step sizing:
+SECOND (<1h), MINUTE (1h-24h), HOUR (1d-7d), DAY (>7d)`),
),
mcp.WithBoolean("cold",
mcp.Description("Whether to query from cold-stage storage. Set
to true for historical data queries."),
@@ -415,10 +419,10 @@ Time Format:
- Step: "SECOND", "MINUTE", "HOUR", "DAY"
Examples:
-- {"metrics_name": "service_sla", "top_n": 5, "duration": "1h"}: Get top 5
services with highest SLA in the last hour
-- {"metrics_name": "endpoint_sla", "top_n": 10, "order": "ASC", "duration":
"30m"}: Get top 10 endpoints with lowest SLA in the last 30 minutes
+- {"metrics_name": "service_sla", "top_n": 5, "duration": "-1h"}: Get top 5
services with highest SLA in the past hour
+- {"metrics_name": "endpoint_sla", "top_n": 10, "order": "ASC", "duration":
"-30m"}: Get top 10 endpoints with lowest SLA in the past 30 minutes
- {"metrics_name": "service_instance_sla", "top_n": 3, "service_name":
"boutique::adservice",
- "duration": "1h"}: Get top 3 instances of a specific service with highest SLA
+ "duration": "-1h"}: Get top 3 instances of a specific service with highest
SLA in the past hour
- {"metrics_name": "service_cpm", "top_n": 5, "start": "-1h", "end": "now",
"step": "MINUTE"}: Get top 5 services with highest calls per minute with
custom time range`,
queryTopNMetrics,
@@ -448,7 +452,9 @@ service_instance_sla, service_cpm, service_resp_time,
service_apdex`),
mcp.Description("Parent service name to filter metrics. Use
this to get top N entities within a specific service."),
),
mcp.WithString("duration",
- mcp.Description("Time duration for the query. Examples: \"1h\"
(last 1 hour), \"30m\" (last 30 minutes), \"7d\" (last 7 days)"),
+ mcp.Description("Time duration for the query relative to
current time. "+
+ "Negative values query the past: \"-1h\" (past 1 hour),
\"-30m\" (past 30 minutes), \"-7d\" (past 7 days). "+
+ "Positive values query the future: \"1h\" (next 1
hour), \"24h\" (next 24 hours)"),
),
mcp.WithString("start",
mcp.Description("Start time for the query. Examples:
\"2023-01-01 12:00:00\", \"-1h\" (1 hour ago), \"-30m\" (30 minutes ago)"),
@@ -459,10 +465,12 @@ service_instance_sla, service_cpm, service_resp_time,
service_apdex`),
mcp.WithString("step",
mcp.Enum("SECOND", "MINUTE", "HOUR", "DAY"),
mcp.Description(`Time step between start time and end time:
-- 'SECOND': Second-level granularity
-- 'MINUTE': Minute-level granularity (default)
-- 'HOUR': Hour-level granularity
-- 'DAY': Day-level granularity`),
+- 'SECOND': Second-level granularity
+- 'MINUTE': Minute-level granularity
+- 'HOUR': Hour-level granularity
+- 'DAY': Day-level granularity
+If not specified, uses adaptive step sizing:
+SECOND (<1h), MINUTE (1h-24h), HOUR (1d-7d), DAY (>7d)`),
),
mcp.WithBoolean("cold",
mcp.Description("Whether to query from cold-stage storage. Set
to true for historical data queries."),
diff --git a/internal/tools/mqe.go b/internal/tools/mqe.go
new file mode 100644
index 0000000..68afd31
--- /dev/null
+++ b/internal/tools/mqe.go
@@ -0,0 +1,613 @@
+// Licensed to Apache Software Foundation (ASF) under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Apache Software Foundation (ASF) licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package tools
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "strings"
+ "time"
+
+ "github.com/mark3labs/mcp-go/mcp"
+ "github.com/mark3labs/mcp-go/server"
+ "github.com/spf13/viper"
+ api "skywalking.apache.org/repo/goapi/query"
+)
+
+// AddMQETools registers MQE-related tools with the MCP server
+func AddMQETools(mcp *server.MCPServer) {
+ MQEExpressionTool.Register(mcp)
+ MQEMetricsListTool.Register(mcp)
+ MQEMetricsTypeTool.Register(mcp)
+}
+
// GraphQLRequest represents a GraphQL request payload sent to the OAP server.
type GraphQLRequest struct {
	Query     string                 `json:"query"`               // GraphQL document to execute
	Variables map[string]interface{} `json:"variables,omitempty"` // optional query variables
}
+
// GraphQLResponse represents a GraphQL response returned by the OAP server.
// Data holds the query result; Errors carries GraphQL-level error messages,
// which may be present even with an HTTP 200 status.
type GraphQLResponse struct {
	Data   interface{} `json:"data"`
	Errors []struct {
		Message string `json:"message"`
	} `json:"errors,omitempty"`
}
+
+// executeGraphQL executes a GraphQL query against SkyWalking OAP
+func executeGraphQL(ctx context.Context, url, query string, variables
map[string]interface{}) (*GraphQLResponse, error) {
+ url = FinalizeURL(url)
+
+ reqBody := GraphQLRequest{
+ Query: query,
+ Variables: variables,
+ }
+
+ jsonData, err := json.Marshal(reqBody)
+ if err != nil {
+ return nil, fmt.Errorf("failed to marshal GraphQL request: %w",
err)
+ }
+
+ req, err := http.NewRequestWithContext(ctx, "POST", url,
bytes.NewBuffer(jsonData))
+ if err != nil {
+ return nil, fmt.Errorf("failed to create HTTP request: %w", err)
+ }
+
+ req.Header.Set("Content-Type", "application/json")
+
+ client := &http.Client{Timeout: 30 * time.Second}
+ resp, err := client.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("failed to execute HTTP request: %w",
err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ bodyBytes, _ := io.ReadAll(resp.Body)
+ return nil, fmt.Errorf("HTTP request failed with status: %d,
body: %s", resp.StatusCode, string(bodyBytes))
+ }
+
+ var graphqlResp GraphQLResponse
+ if err := json.NewDecoder(resp.Body).Decode(&graphqlResp); err != nil {
+ return nil, fmt.Errorf("failed to decode GraphQL response: %w",
err)
+ }
+
+ if len(graphqlResp.Errors) > 0 {
+ var errorMsgs []string
+ for _, err := range graphqlResp.Errors {
+ errorMsgs = append(errorMsgs, err.Message)
+ }
+ return nil, fmt.Errorf("GraphQL errors: %s",
strings.Join(errorMsgs, ", "))
+ }
+
+ return &graphqlResp, nil
+}
+
// MQEExpressionRequest represents a request to execute an MQE expression.
type MQEExpressionRequest struct {
	// Expression is the MQE expression to execute (required).
	Expression string `json:"expression"`
	// Source entity selectors (all optional).
	ServiceName         string `json:"service_name,omitempty"`
	Layer               string `json:"layer,omitempty"` // defaults to GENERAL when a service name is given
	ServiceInstanceName string `json:"service_instance_name,omitempty"`
	EndpointName        string `json:"endpoint_name,omitempty"`
	ProcessName         string `json:"process_name,omitempty"`
	// Normal marks whether the service has an agent installed; when nil it
	// is auto-detected via a service lookup (see buildMQEEntity).
	Normal *bool `json:"normal,omitempty"`
	// Destination entity selectors, used for relation metrics.
	DestServiceName         string `json:"dest_service_name,omitempty"`
	DestLayer               string `json:"dest_layer,omitempty"`
	DestServiceInstanceName string `json:"dest_service_instance_name,omitempty"`
	DestEndpointName        string `json:"dest_endpoint_name,omitempty"`
	DestProcessName         string `json:"dest_process_name,omitempty"`
	DestNormal              *bool  `json:"dest_normal,omitempty"`
	// Time range: either a relative Duration, or explicit Start/End/Step.
	Duration string `json:"duration,omitempty"`
	Start    string `json:"start,omitempty"`
	End      string `json:"end,omitempty"`
	Step     string `json:"step,omitempty"`
	// Cold selects cold-stage storage; Debug and DumpDBRsp enable query tracing.
	Cold      bool `json:"cold,omitempty"`
	Debug     bool `json:"debug,omitempty"`
	DumpDBRsp bool `json:"dump_db_rsp,omitempty"`
}
+
// MQEMetricsListRequest represents a request to list available metrics.
type MQEMetricsListRequest struct {
	// Regex optionally filters the returned metrics by name.
	Regex string `json:"regex,omitempty"`
}
+
// MQEMetricsTypeRequest represents a request to get a metric's type.
type MQEMetricsTypeRequest struct {
	// MetricName is the name of the metric to inspect (required).
	MetricName string `json:"metric_name"`
}
+
// ListServicesRequest represents a request to list services.
// NOTE(review): this type is not referenced anywhere in this file —
// confirm it is used elsewhere before relying on (or removing) it.
type ListServicesRequest struct {
	// Layer is the service layer to list services for (e.g. GENERAL, MESH).
	Layer string `json:"layer"`
}
+
+// getServiceInfo queries service information using the specified layer
+func getServiceInfo(ctx context.Context, serviceName, layer string) bool {
+ if serviceName == "" {
+ return false
+ }
+
+ if layer == "" {
+ layer = "GENERAL"
+ }
+
+ normal, err := getServiceByName(ctx, serviceName, layer)
+ if err != nil {
+ return true
+ }
+ if normal != nil {
+ return *normal
+ }
+
+ return true
+}
+
+// getServiceByName tries to get service info directly by name in specified
layer
+func getServiceByName(ctx context.Context, serviceName, layer string) (*bool,
error) {
+ serviceID, err := findServiceID(ctx, serviceName, layer)
+ if err != nil {
+ return nil, fmt.Errorf("service not found in layer %s: %s",
layer, serviceName)
+ }
+ if serviceID == "" {
+ return nil, fmt.Errorf("service not found in layer %s: %s",
layer, serviceName)
+ }
+
+ query := `
+ query getService($serviceId: String!) {
+ service: getService(serviceId: $serviceId) {
+ id
+ name
+ normal
+ layers
+ }
+ }
+ `
+
+ variables := map[string]interface{}{
+ "serviceId": serviceID,
+ }
+
+ result, err := executeGraphQL(ctx, viper.GetString("url"), query,
variables)
+ if err != nil {
+ return nil, fmt.Errorf("failed to get service details: %w", err)
+ }
+
+ if data, ok := result.Data.(map[string]interface{}); ok {
+ if service, ok := data["service"].(map[string]interface{}); ok {
+ if normal, ok := service["normal"].(bool); ok {
+ return &normal, nil
+ }
+ }
+ }
+
+ return nil, fmt.Errorf("invalid service data returned for: %s",
serviceName)
+}
+
+// findServiceID finds service ID by name in a specific layer
+func findServiceID(ctx context.Context, serviceName, layer string) (string,
error) {
+ query := `
+ query getServices($layer: String!) {
+ services: listServices(layer: $layer) {
+ id
+ name
+ }
+ }
+ `
+
+ variables := map[string]interface{}{
+ "layer": layer,
+ }
+
+ result, err := executeGraphQL(ctx, viper.GetString("url"), query,
variables)
+ if err != nil {
+ return "", err
+ }
+
+ if data, ok := result.Data.(map[string]interface{}); ok {
+ if services, ok := data["services"].([]interface{}); ok {
+ for _, s := range services {
+ svc, ok := s.(map[string]interface{})
+ if !ok {
+ continue
+ }
+ if svc["name"] == serviceName {
+ if id, ok := svc["id"].(string); ok {
+ return id, nil
+ }
+ }
+ }
+ }
+ }
+
+ return "", nil
+}
+
+// buildMQEEntity builds the entity from request parameters
+func buildMQEEntity(ctx context.Context, req *MQEExpressionRequest)
map[string]interface{} {
+ entity := make(map[string]interface{})
+
+ // Define a mapping of field names to their corresponding values
+ fields := map[string]interface{}{
+ "serviceName": req.ServiceName,
+ "serviceInstanceName": req.ServiceInstanceName,
+ "endpointName": req.EndpointName,
+ "processName": req.ProcessName,
+ "destServiceName": req.DestServiceName,
+ "destServiceInstanceName": req.DestServiceInstanceName,
+ "destEndpointName": req.DestEndpointName,
+ "destProcessName": req.DestProcessName,
+ }
+
+ // Populate the entity map based on the mapping
+ for key, value := range fields {
+ if strValue, ok := value.(string); ok && strValue != "" {
+ entity[key] = strValue
+ }
+ }
+
+ // Handle special cases
+ if req.ServiceName != "" {
+ if req.Normal == nil {
+ normal := getServiceInfo(ctx, req.ServiceName,
req.Layer)
+ entity["normal"] = normal
+ } else {
+ entity["normal"] = *req.Normal
+ }
+ } else if req.Normal != nil {
+ entity["normal"] = *req.Normal
+ }
+
+ if req.DestNormal != nil {
+ entity["destNormal"] = *req.DestNormal
+ }
+
+ return entity
+}
+
// executeMQEExpression executes MQE expression query.
//
// It validates that an expression is present, builds the target entity from
// the request, resolves the query time range (a relative duration takes
// precedence over explicit start/end), runs the execExpression GraphQL
// query, and returns the raw result data as JSON text. Transport and
// GraphQL failures are reported as tool errors (non-nil CallToolResult),
// not as Go errors.
func executeMQEExpression(ctx context.Context, req *MQEExpressionRequest) (*mcp.CallToolResult, error) {
	if req.Expression == "" {
		return mcp.NewToolResultError("expression is required"), nil
	}

	entity := buildMQEEntity(ctx, req)

	// Prefer the relative duration when given; otherwise build the range
	// from explicit start/end/step, falling back to DefaultDuration.
	var duration api.Duration
	if req.Duration != "" {
		duration = ParseDuration(req.Duration, req.Cold)
	} else {
		duration = BuildDuration(req.Start, req.End, req.Step, req.Cold, DefaultDuration)
	}

	// GraphQL query for MQE expression
	query := `
	query execExpression($expression: String!, $entity: Entity!, $duration: Duration!, $debug: Boolean, $dumpDBRsp: Boolean) {
		execExpression(expression: $expression, entity: $entity, duration: $duration, debug: $debug, dumpDBRsp: $dumpDBRsp) {
			type
			error
			results {
				metric {
					labels {
						key
						value
					}
				}
				values {
					id
					value
					traceID
					owner {
						scope
						serviceID
						serviceName
						normal
						serviceInstanceID
						serviceInstanceName
						endpointID
						endpointName
					}
				}
			}
			debuggingTrace {
				traceId
				condition
				duration
				spans {
					spanId
					operation
					msg
					startTime
					endTime
					duration
				}
			}
		}
	}
	`

	variables := map[string]interface{}{
		"expression": req.Expression,
		"entity":     entity, // Always include entity, even if empty
		"duration": map[string]interface{}{
			"start": duration.Start,
			"end":   duration.End,
			"step":  string(duration.Step),
		},
		// Always provide debug parameters with explicit values
		"debug":     req.Debug,
		"dumpDBRsp": req.DumpDBRsp,
	}

	result, err := executeGraphQL(ctx, viper.GetString("url"), query, variables)
	if err != nil {
		return mcp.NewToolResultError(fmt.Sprintf("failed to execute MQE expression: %v", err)), nil
	}

	jsonBytes, err := json.Marshal(result.Data)
	if err != nil {
		return mcp.NewToolResultError(fmt.Sprintf("failed to marshal result: %v", err)), nil
	}
	return mcp.NewToolResultText(string(jsonBytes)), nil
}
+
+// listMQEMetrics lists available metrics
+func listMQEMetrics(ctx context.Context, req *MQEMetricsListRequest)
(*mcp.CallToolResult, error) {
+ // GraphQL query for listing metrics
+ query := `
+ query listMetrics($regex: String) {
+ listMetrics(regex: $regex) {
+ name
+ type
+ catalog
+ }
+ }
+ `
+
+ variables := map[string]interface{}{}
+ if req != nil && req.Regex != "" {
+ variables["regex"] = req.Regex
+ }
+
+ result, err := executeGraphQL(ctx, viper.GetString("url"), query,
variables)
+ if err != nil {
+ return mcp.NewToolResultError(fmt.Sprintf("failed to list
metrics: %v", err)), nil
+ }
+
+ jsonBytes, err := json.Marshal(result.Data)
+ if err != nil {
+ return mcp.NewToolResultError(fmt.Sprintf("failed to marshal
result: %v", err)), nil
+ }
+ return mcp.NewToolResultText(string(jsonBytes)), nil
+}
+
+// ListMQEMetricsInternal is an exported function for internal use by
resources package
+func ListMQEMetricsInternal(ctx context.Context, regex *string) (string,
error) {
+ var req *MQEMetricsListRequest
+ if regex != nil {
+ req = &MQEMetricsListRequest{Regex: *regex}
+ }
+ result, err := listMQEMetrics(ctx, req)
+ if err != nil {
+ return "", err
+ }
+
+ // Extract the text content from the tool result
+ if textResult, ok := result.Content[0].(mcp.TextContent); ok {
+ return textResult.Text, nil
+ }
+
+ return "", fmt.Errorf("unexpected result format")
+}
+
+// getMQEMetricsType gets metric type information
+func getMQEMetricsType(ctx context.Context, req *MQEMetricsTypeRequest)
(*mcp.CallToolResult, error) {
+ if req.MetricName == "" {
+ return mcp.NewToolResultError("metric_name must be provided"),
nil
+ }
+
+ // GraphQL query for getting metric type
+ query := `
+ query typeOfMetrics($name: String!) {
+ typeOfMetrics(name: $name)
+ }
+ `
+
+ variables := map[string]interface{}{
+ "name": req.MetricName,
+ }
+
+ result, err := executeGraphQL(ctx, viper.GetString("url"), query,
variables)
+ if err != nil {
+ return mcp.NewToolResultError(fmt.Sprintf("failed to get metric
type: %v", err)), nil
+ }
+
+ jsonBytes, err := json.Marshal(result.Data)
+ if err != nil {
+ return mcp.NewToolResultError(fmt.Sprintf("failed to marshal
result: %v", err)), nil
+ }
+ return mcp.NewToolResultText(string(jsonBytes)), nil
+}
+
// MQEExpressionTool exposes the execute_mqe_expression MCP tool, which runs
// an MQE expression against the SkyWalking OAP GraphQL API via
// executeMQEExpression.
var MQEExpressionTool = NewTool[MQEExpressionRequest, *mcp.CallToolResult](
	"execute_mqe_expression",
	`Execute MQE (Metrics Query Expression) to query and calculate metrics data.

MQE is SkyWalking's powerful query language that allows you to:
- Query metrics with labels: service_percentile{p='50,75,90,95,99'}
- Perform calculations: service_sla * 100, service_cpm / 60
- Compare values: service_resp_time > 3000
- Use aggregations: avg(service_cpm), sum(service_cpm), max(service_resp_time)
- Mathematical functions: round(service_cpm / 60, 2), abs(service_resp_time - 1000)
- TopN queries: top_n(service_cpm, 10, des)
- Trend analysis: increase(service_cpm, 2), rate(service_cpm, 5)
- Sort operations: sort_values(service_resp_time, 10, des)
- Baseline comparison: baseline(service_resp_time, upper)
- Relabel operations: relabels(service_percentile{p='50,75,90,95,99'}, p='50,75,90,95,99', percentile='P50,P75,P90,P95,P99')
- Logical operations: view_as_seq([metric1, metric2]), is_present([metric1, metric2])
- Label aggregation: aggregate_labels(total_commands_rate, sum)

Result Types:
- SINGLE_VALUE: Single metric value (e.g., avg(), sum())
- TIME_SERIES_VALUES: Time series data with timestamps
- SORTED_LIST: Sorted metric values (e.g., top_n())
- RECORD_LIST: Record-based metrics
- LABELED_VALUE: Metrics with multiple labels

USAGE REQUIREMENTS:
- The 'expression' parameter is mandatory for all queries
- For service-specific queries, specify 'service_name' and optionally 'layer' (defaults to GENERAL)
- For relation metrics, provide both source and destination entity parameters
- Either specify 'duration' OR both 'start' and 'end' for time range
- Use 'debug: true' for query tracing and troubleshooting
- Use 'cold: true' to query from cold storage (BanyanDB only)

Entity Filtering (all optional):
- Service level: service_name + layer + normal
- Instance level: service_instance_name
- Endpoint level: endpoint_name
- Process level: process_name
- Relation queries: dest_service_name + dest_layer, dest_service_instance_name, etc.

Examples:
- {expression: "service_sla * 100", service_name: "Your_ApplicationName", layer: "GENERAL", duration: "-1h"}: Convert SLA to percentage for last hour
- {expression: "service_resp_time > 3000 && service_cpm < 1000", service_name: "Your_ApplicationName",
  duration: "-30m"}: Find high latency with low traffic in last 30 minutes
- {expression: "avg(service_cpm)", duration: "-2h"}: Calculate average CPM for last 2 hours
- {expression: "service_cpm", duration: "24h"}: Query CPM for next 24 hours (useful for capacity planning)
- {expression: "top_n(service_cpm, 10, des)", start: "2025-07-06 16:00:00", end: "2025-07-06 17:00:00",
  step: "MINUTE"}: Top 10 services by CPM with minute granularity`,
	executeMQEExpression,
	mcp.WithString("expression", mcp.Required(),
		mcp.Description("MQE expression to execute (required). "+
			"Examples: `service_sla`, `avg(service_cpm)`, `service_sla * 100`, `service_percentile{p='50,75,90,95,99'}`")),
	mcp.WithString("service_name", mcp.Description("Service name for entity filtering")),
	mcp.WithString("layer",
		mcp.Description("Service layer for entity filtering. "+
			"Examples: `GENERAL` (default), `MESH`, `K8S_SERVICE`, `DATABASE`, `VIRTUAL_DATABASE`. "+
			"Defaults to GENERAL if not specified")),
	mcp.WithString("service_instance_name", mcp.Description("Service instance name for entity filtering")),
	mcp.WithString("endpoint_name", mcp.Description("Endpoint name for entity filtering")),
	mcp.WithString("process_name", mcp.Description("Process name for entity filtering")),
	mcp.WithBoolean("normal",
		mcp.Description("Whether the service is normal (has agent installed). "+
			"If not specified, will be auto-detected based on service layer")),
	mcp.WithString("dest_service_name", mcp.Description("Destination service name for relation metrics")),
	mcp.WithString("dest_layer",
		mcp.Description("Destination service layer for relation metrics. "+
			"Examples: `GENERAL`, `MESH`, `K8S_SERVICE`, `DATABASE`")),
	mcp.WithString("dest_service_instance_name", mcp.Description("Destination service instance name for relation metrics")),
	mcp.WithString("dest_endpoint_name", mcp.Description("Destination endpoint name for relation metrics")),
	mcp.WithString("dest_process_name", mcp.Description("Destination process name for relation metrics")),
	mcp.WithBoolean("dest_normal", mcp.Description("Whether the destination service is normal")),
	mcp.WithString("duration",
		mcp.Description("Time duration for the query relative to current time. "+
			"Negative values query the past: `-1h` (past 1 hour), `-30m` (past 30 minutes), `-7d` (past 7 days). "+
			"Positive values query the future: `1h` (next 1 hour), `24h` (next 24 hours). "+
			"Use this OR specify both start+end")),
	mcp.WithString("start", mcp.Description("Start time for the query. Examples: `2025-07-06 12:00:00`, `-1h` (1 hour ago), `-30m` (30 minutes ago)")),
	mcp.WithString("end", mcp.Description("End time for the query. Examples: `2025-07-06 13:00:00`, `now`, `-10m` (10 minutes ago)")),
	mcp.WithString("step", mcp.Enum("SECOND", "MINUTE", "HOUR", "DAY", "MONTH"),
		mcp.Description("Time step between start time and end time: "+
			"SECOND (second-level), MINUTE (minute-level), HOUR (hour-level), "+
			"DAY (day-level), MONTH (month-level). "+
			"If not specified, uses adaptive step sizing: "+
			"SECOND (<1h), MINUTE (1h-24h), HOUR (1d-7d), DAY (>7d)")),
	mcp.WithBoolean("cold", mcp.Description("Whether to query from cold-stage storage")),
	mcp.WithBoolean("debug", mcp.Description("Enable query tracing and debugging")),
	mcp.WithBoolean("dump_db_rsp", mcp.Description("Dump database response for debugging")),
)
+
// MQEMetricsListTool exposes the list_mqe_metrics MCP tool, which lists the
// metrics available for use in MQE expressions via listMQEMetrics.
var MQEMetricsListTool = NewTool[MQEMetricsListRequest, *mcp.CallToolResult](
	"list_mqe_metrics",
	`List available metrics in SkyWalking that can be used in MQE expressions.

This tool helps you discover what metrics are available for querying and their metadata information
including metric type and catalog. You can optionally provide a regex pattern to filter the metrics by name.

Metric Categories:
- Service metrics: service_sla, service_cpm, service_resp_time, service_apdex, service_percentile
- Instance metrics: service_instance_sla, service_instance_cpm, service_instance_resp_time
- Endpoint metrics: endpoint_sla, endpoint_cpm, endpoint_resp_time, endpoint_percentile
- Relation metrics: service_relation_client_cpm, service_relation_server_cpm
- Database metrics: database_access_resp_time, database_access_cpm
- Infrastructure metrics: service_cpu, service_memory, service_thread_count

Metric Types:
- REGULAR_VALUE: Single value metrics (e.g., service_sla, service_cpm)
- LABELED_VALUE: Multi-label metrics (e.g., service_percentile, k8s_cluster_deployment_status)
- SAMPLED_RECORD: Record-based metrics

Usage Tips:
- Use regex patterns to filter specific metric categories
- Check metric type to understand how to use them in MQE expressions
- Regular value metrics can be used directly in calculations
- Labeled value metrics require label selectors: metric_name{label='value'}

Examples:
- {regex: "service_.*"}: List all service-related metrics
- {regex: ".*_cpm"}: List all CPM (calls per minute) metrics
- {regex: ".*percentile.*"}: List all percentile metrics
- {}: List all available metrics`,
	listMQEMetrics,
	mcp.WithString("regex", mcp.Description("Optional regex pattern to filter metrics by name. Examples: `service_.*`, `.*_cpm`, `endpoint_.*`")),
)
+
// MQEMetricsTypeTool exposes the get_mqe_metric_type MCP tool, which
// returns type information for a single metric via getMQEMetricsType.
var MQEMetricsTypeTool = NewTool[MQEMetricsTypeRequest, *mcp.CallToolResult](
	"get_mqe_metric_type",
	`Get type information for a specific metric.

This tool returns the type and catalog information for a given metric name, which helps understand
what kind of data the metric contains and how it should be used in MQE expressions.

Metric Types:
- REGULAR_VALUE: Single numeric value metrics
  - Can be used directly in arithmetic operations
  - Examples: service_sla, service_cpm, service_resp_time
  - Usage: service_sla, service_sla * 100, avg(service_cpm)

- LABELED_VALUE: Multi-dimensional metrics with labels
  - Require label selectors to specify which values to query
  - Examples: service_percentile, k8s_cluster_deployment_status
  - Usage: service_percentile{p='50,75,90,95,99'}

- SAMPLED_RECORD: Record-based metrics with sampling
  - Used for detailed record analysis
  - Examples: top_n_database_statement, traces
  - Usage: Complex aggregations and filtering

Understanding metric types is crucial for:
- Writing correct MQE expressions
- Knowing whether to use label selectors
- Understanding result data structure
- Choosing appropriate aggregation functions

Examples:
- {metric_name: "service_cpm"}: Get type info for service CPM metric
- {metric_name: "service_percentile"}: Get type info for service percentile metric
- {metric_name: "endpoint_sla"}: Get type info for endpoint SLA metric`,
	getMQEMetricsType,
	mcp.WithString("metric_name", mcp.Required(),
		mcp.Description("Name of the metric to get type information for (required). "+
			"Examples: `service_sla`, `service_percentile`, `endpoint_cpm`")),
)