This is an automated email from the ASF dual-hosted git repository. wu-sheng pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/skywalking-horizon-ui.git
commit 0caaa685677d00574794b7afc735cb89344965d5 Author: Wu Sheng <[email protected]> AuthorDate: Thu May 21 11:27:40 2026 +0800 traces: name native query by API, not version; banner the v1 segment-list path The native trace query is selected by storage backend, not OAP version: queryTraces (whole trace, spans inline) is BanyanDB-only, while every other backend (ES, …) uses queryBasicTraces (segment list + a second queryTrace on click). Detection already runs via hasQueryTracesV2Support. Drop the misleading 'v2'|'v3' labels from code (there is no trace v3 — that's metrics) and name by the OAP API instead: TraceQueryApi = 'queryTraces'|'queryBasicTraces', response field `protocol` -> `api`. Add a persistent banner on the native traces view stating the active Trace Query v1/v2 API; in the segment-list path it explains each row is a segment and the list/rail title reads "Segments". --- .claude/skills/migrate-layer/SKILL.md | 263 --------------------------- apps/bff/src/http/query/trace.ts | 35 ++-- apps/bff/src/logic/layers/loader.ts | 5 +- apps/bff/src/util/trace-protocol-cache.ts | 55 +++--- apps/ui/src/layer/traces/LayerTracesView.vue | 47 ++++- packages/api-client/src/index.ts | 1 + packages/api-client/src/trace.ts | 41 +++-- 7 files changed, 121 insertions(+), 326 deletions(-) diff --git a/.claude/skills/migrate-layer/SKILL.md b/.claude/skills/migrate-layer/SKILL.md deleted file mode 100644 index 3fdefdb..0000000 --- a/.claude/skills/migrate-layer/SKILL.md +++ /dev/null @@ -1,263 +0,0 @@ ---- -name: migrate-layer -description: Migrate one OAP layer's UI templates from booster-ui (apache/skywalking ui-initialized-templates) into the Horizon UI bundled config — preserves intent, adapts to Horizon's flatter widget grid + extracts metric semantics from upstream docs. -user-invocable: true ---- - -# Migrate one OAP layer into Horizon - -Source of truth for the upstream side: -- Templates: `/Users/wusheng/github/skywalking/oap-server/server-starter/src/main/resources/ui-initialized-templates/<layer>/` -- Docs index: `/Users/wusheng/github/skywalking/docs/menu.yml` (paths are relative to `/Users/wusheng/github/skywalking/docs/`) - -Output goes to: `apps/bff/src/bundled_templates/layers/<layer>.json` - -The Horizon JSON shape is defined by `apps/bff/src/layers/loader.ts` (`LayerTemplate` interface). Read it once before each new layer so any new fields are picked up. - ---- - -## When this skill applies - -You're staring at a booster-ui template folder (e.g. `k8s/`, `mesh/`, `virtual_database/`) with 1–7 nested `*.json` files and you need to produce **one** Horizon JSON that lights up the corresponding layer page (Services / Instances / Endpoints / Topology / Traces / Logs / Profiling). Some layers have only `*-root.json` + `*-service.json` (service-only); others add `*-instance.json`, `*-endpoint.json`, and `*-*-relation.json` (full mesh). - ---- - -## Booster → Horizon mapping reference - -### Top-level shape - -| Booster | Horizon | Notes | -|---|---|---| -| One file per scope, each wrapping a `Tab` tree | One JSON, scopes flattened into `dashboards.{service,instance,endpoint,dependency,topology}` arrays | The `Tab` containers in booster are dropped — Horizon's layer page already provides the tab strip via the route hierarchy. | -| `configuration.layer` | top-level `key` (also matches the file basename uppercased) | The loader asserts `file.json` ↔ `"key": "FILE"` (UPPER_SNAKE). | -| `configuration.entity` (`Service` / `ServiceInstance` / `Endpoint` / …) | implied by which `dashboards.*` bucket the widget lives in | Drop the entity field — placement decides. | -| `configuration.name` (e.g. `General-Service`) | not needed | Horizon derives titles from `alias` + route. | -| `Event` widget at the bottom | not yet supported | Skip; revisit when Horizon adds an events strip. | - -### Widget renderer mapping - -| Booster `graph.type` | Horizon `type` | Comment | -|---|---|---| -| `Card` | `card` | Single scalar, avg across window. Card MQE usually wraps with `avg(...)`. Horizon evaluates the average itself; **strip the `avg()` wrapper** unless it changes shape (e.g. `avg(x/100)`). | -| `Line` (single expression) | `line` | One labeled series. Carry `unit`, `expressions[0]`, `expressionLabels?`. | -| `Line` (multiple expressions) | `line` | Multi-series. Use `expressionLabels` (mandatory for legend). When the booster widget has a dual-axis trick (e.g. count + latency), set `expressionAxes: [0, 1]` and supply `expressionUnits`. | -| `TopList` | `top` | Sorted list — usually `top_n(...,N,des)`. When booster ships several `TopList`s on the same layer that are conceptually the same ranking with different sort dimensions (e.g. Top APIs by Traffic / by Slow / by SR), **fold them into a single `top` widget with multiple `expressions` + `expressionLabels` + `expressionUnits`** (the SPA renders these as in-widget tabs). | -| `TopList` with RECORD-typed MQE (`top_n_service_database_statement`, etc.) | `record` | Use `record`, not `top`. Pair with `relatedTrace.refIdType: traceId` upstream — Horizon's record widget surfaces trace links automatically. | -| `Topology` | **not a dashboard widget** — pull node/link MQE up to top-level `topology` block | The `linkServerExpressions`, `linkClientExpressions`, `nodeExpressions` become `topology.{nodeMetrics, linkServerMetrics, linkClientMetrics}` arrays. Add `role` (`center`/`ring`/`secondary` for nodes, `lineServer`/`lineClient` for edges). | -| `InstanceList`, `EndpointList` | **dropped** — Horizon ships built-in pickers | The picker columns come from `header.columns` (or `overview.*`) instead. Don't reproduce these as dashboard widgets. | -| `Trace`, `Log` | **flip the component flag** in `components.{traces,logs}` | No widget needed — Horizon owns the Traces / Logs views. | -| `Profile` (trace profiling) | `components.traceProfiling: true` | Horizon's profiling views are component-level, not widget-level. | -| `Ebpf` | `components.ebpfProfiling: true` | Same. | -| `AsyncProfiling` | `components.asyncProfiling: true` | Same. | -| `Text` | dropped | Layer header in Horizon already documents the page; if you need an external link, set `documentLink` at the layer root. | -| `Event` | dropped (no Horizon equivalent yet) | Note in the PR if a layer leans on this heavily. | - -### Field-level mapping - -| Booster | Horizon | Notes | -|---|---|---| -| `widget.title` | `title` | Drop unit suffixes from the title — Horizon shows the unit separately. e.g. `"Service Avg Response Time (ms)"` → `"Avg Response Time"` + `"unit": "ms"`. | -| `widget.tips` | `tip` | Keep verbatim where useful; otherwise tighten to one sentence. | -| `expressions[0]` | `expressions[0]` | MQE stays. For cards, unwrap a single outer `avg(...)`. | -| `graph.fontSize`, `textAlign`, `showSymbol`, `step`, `smooth`, `color`, `showXAxis`/`Y` | dropped | Horizon's chart wrappers own the rendering style. | -| `metricConfig[*].unit` | `unit` (or `expressionUnits`) | Use widget-level `unit` for single-series widgets, `expressionUnits` per index for multi-series. | -| `metricConfig[*].label` | `expressionLabels[i]` | When booster sets a label to give the legend a friendly name, copy it; otherwise rely on the MQE relabel (e.g. `relabels(...,percentile='99')` is already the legend label). | -| `associate` (cross-widget cursor sync) | dropped | Horizon syncs crosshairs via `useCrosshair()` automatically. | -| `relatedTrace.enableRelate`, `latency`, `status`, `queryOrder` | dropped (for widget) | The dashboard's trace-list links are auto-derived. For `record` widgets, keep `relatedTrace.refIdType` semantics by ensuring the underlying MQE is RECORD-typed. | -| `valueRelatedDashboard` | dropped | Horizon routing — clicking a top item drills into the instance/endpoint page automatically. | -| `x/y/w/h` (24-col grid) | `span` / `rowSpan` (12-col flow) | **Halve the booster width** to convert to Horizon's 12-col grid (`w: 8` → `span: 4`). Use `rowSpan` only when the widget is taller than the default 2 rows (default ~14px × 8 = chart height). Top-lists and percentile charts typically `rowSpan: 3` or `4`. | -| `subExpressions` (per-row sparkline MQE under `InstanceList`/`EndpointList`) | dropped (UI computes the row-sparklines itself) | These are the picker's per-row spark sources. Horizon's pickers fetch them through `landing` routes. | -| `legendMQE` + `description.healthy/unhealthy` on Topology | Horizon doesn't yet have a legend-rule UI — capture the rule in a comment for now | Pending UI work. | - -### Picker column derivation - -The layer page header (services list) needs `header.columns`. Source order of preference: -1. The booster `*-service.json` Overview tab's top three single-card metrics — these are conceptually the same as the picker columns. -2. The picker's `subExpressions` array — first 1–3 entries map well. -3. Default to `cpm` / `sla/100` / `resp_time` if nothing is obvious. - -Always set `orderBy` to the column you want as the default sort (usually `cpm`). - -### Overview tile derivation - -`overview.groups` drives the tall hero card above the picker. Pick 2–4 metrics that tell the layer's health story at a glance — almost always: -- **one** latency percentile (**P95** — prefer P95 over P99; do not include both, they're redundant in a hero strip) -- an error rate (`100 - service_sla/100` or layer equivalent) -- a health score (apdex) where available **or** a throughput indicator — pick the one that's most informative for that layer - -Group as `{ "title": "Latency & errors", "size": "auto", "metrics": [...] }` and (optionally) `{ "title": "Health", "size": "square", "metrics": [<single>] }` or `{ "title": "Throughput", "size": "square", "metrics": [<RPM>] }` when the layer has no apdex equivalent. - ---- - -## 5-step procedure for migrating one layer - -Run these in order **for each layer**. The order matters: 1–4 build the mental model; 5 produces the JSON. Don't skip ahead to writing JSON. - -### Step 1 — Learn the old layout - -Read every file under `/Users/wusheng/github/skywalking/oap-server/server-starter/src/main/resources/ui-initialized-templates/<layer>/`. List: -- Each `*-root.json` Tab name + child widgets — defines the layer landing strip -- Each `*-service.json` / `*-instance.json` / `*-endpoint.json` Overview tab widget set — these become the `dashboards.<scope>` arrays -- The `*-relation.json` files — the relation widget sets feed into Horizon's edge/dependency drill panels but **most metrics belong to topology.linkServerMetrics / linkClientMetrics**, not as flat widgets. If the relation file has 4+ widgets, mention them in the migration notes; if it's just RPM/respTime/SLA/percentile, fold into topology. -- Note which widgets the layer uses (`InstanceList`, `EndpointList`, `Topology`, `Trace`, `Log`, `Profile`, etc.) so you know which `components.*` flags to flip. - -Optional `Bash` helper to summarize: -```bash -for f in /Users/wusheng/github/skywalking/oap-server/server-starter/src/main/resources/ui-initialized-templates/<LAYER>/*.json; do - echo "=== $(basename $f) ===" - python3 -c " -import json -d = json.load(open('$f')) -def walk(n, depth=0): - if isinstance(n, list): - for i in n: walk(i, depth); return - if not isinstance(n, dict): return - t = n.get('type'); name = n.get('name') - title = (n.get('widget') or {}).get('title') if isinstance(n.get('widget'), dict) else None - if name or (t and t != 'Tab') or title: - print(' '*depth + (t or 'Tab') + ((': ' + (name or title)) if (name or title) else '')) - for c in n.get('children') or []: walk(c, depth+1) - if n.get('configuration'): walk(n['configuration'], depth) -walk(d)" -done -``` - -### Step 2 — Learn the Horizon capabilities - -Reread `apps/bff/src/layers/loader.ts` (the `LayerTemplate` interface + comments) and `packages/api-client/src/dashboard.ts` (the `DashboardWidget` interface). Decide which Horizon features the migration will use: -- `dashboards.service` (always) -- `dashboards.instance` (if booster has instance template **and** the layer has per-instance metrics worth showing) -- `dashboards.endpoint` (same logic for endpoints) -- `dashboards.dependency` (only if the relation file has metrics distinct from topology edges) -- `topology` block (any layer with a non-trivial service map — almost all do) -- `endpointDependency` block (only if the layer has per-endpoint relations, e.g. general, mesh, k8s_service) -- `components.{traces, logs, traceProfiling, ebpfProfiling, asyncProfiling}` flags -- `visibleWhen` for any widget whose MQE only fires conditionally (`instance_jvm_cpu has value` etc.) — common on instance-scope widgets that span multiple runtimes. - -Also reread the existing `general.json` (and `mesh.json` if migrating a service-mesh-adjacent layer) as the reference template. Copy its shape verbatim and replace metric names — don't reinvent the structure. - -### Step 3 — Read metrics meaning from booster's `widget.tips` + metric names - -For each widget you're going to migrate, write a one-line summary of what the metric represents. Sources: -- `widget.tips` in the booster JSON (often the most direct) -- The metric name itself (e.g. `instance_jvm_gc_pause_duration` → JVM GC pause time per minute) -- The OAP query-protocol schemas under `apache/skywalking/oap-server/.../query-protocol/*.graphqls` if the metric is exposed there - -If you can't write a one-liner from these alone, move to Step 4. - -### Step 4 — Web search the target component + popular key metrics - -For domain-specific layers (Kafka, Redis, MongoDB, Envoy, Cilium, etc.), web search a small set: -- `"<component> top metrics for monitoring"` — usually finds the canonical 5–10 SREs care about -- `"<component> golden signals"` — RED / USE method articles -- Vendor docs (Confluent for Kafka, Redis Labs for Redis, etc.) for the metric definitions - -The goal is to **prioritize** the booster widgets — if booster ships 30 metrics but only 8 of them are commonly-watched, surface those 8 in the layer landing and stash the rest in a "More" group (or just keep them in the JSON with smaller `span` so they live below the fold). - -### Step 5 — Read upstream docs from `docs/menu.yml` - -`/Users/wusheng/github/skywalking/docs/menu.yml` has a `catalog:` tree. Find the entry whose name matches the layer (e.g. "MySQL/MariaDB monitoring", "Kafka monitoring", "Redis monitoring") and follow its `path:` to `/Users/wusheng/github/skywalking/docs/<path-without-leading-slash>.md`. These docs: -- List every metric the OAP module emits, with definitions -- Often categorize metrics ("RED metrics", "Capacity metrics", "Reliability metrics") -- Sometimes include the suggested dashboard layout - -This is the most authoritative source for metric semantics. **If a booster widget tip is vague but the upstream doc explains the metric, use the doc's wording for the Horizon `tip`.** - -### Step 6 — Write the Horizon JSON - -Now produce `apps/bff/src/bundled_templates/layers/<layer>.json`. Skeleton: - -```jsonc -{ - "key": "<UPPER_SNAKE>", - "alias": "<Display Name>", - "color": "var(--sw-accent)", // most layers use the orange accent - "documentLink": "https://skywalking.apache.org/docs/main/next/en/<path-from-menu.yml>", - "aliases": { // optional — only override slot names when the layer uses non-standard entity terms - "services": "Brokers", - "instances": "Nodes", - "endpoints": "Topics" - }, - "components": { - "service": true, - "instances": <has *-instance.json>, - "endpoints": <has *-endpoint.json>, - "endpointDependency": <has endpoint-relation.json>, - "topology": <has Topology widget anywhere>, - "traces": <has Trace widget>, - "logs": <has Log widget>, - "traceProfiling": <has Profile widget>, - "ebpfProfiling": <has Ebpf widget>, - "asyncProfiling": <has AsyncProfiling widget> - }, - "layer-header": { - "orderBy": "<column metric key>", - "columns": [ /* 2-4 picker columns */ ] - }, - "overview": { - "groups": [ - { "title": "Latency & errors", "size": "auto", "metrics": [/* p95 + err (no p99 — redundant with p95 in a hero strip) */] }, - { "title": "Health", "size": "square", "metrics": [/* single apdex-like */] } - ] - }, - "dashboards": { - "service": [ /* the *-service.json Overview tab widgets, mapped 1:1 */ ], - "instance": [ /* *-instance.json Overview + JVM/CLR/etc tabs flattened */ ], - "endpoint": [ /* *-endpoint.json Overview tab */ ] - }, - "topology": { - "nodeMetrics": [ /* center, ring, secondary */ ], - "linkServerMetrics": [ /* RPM, respTime, p95, SLA — order matters for fallback */ ], - "linkClientMetrics": [ /* same shape; omit on layers w/o client-side relations */ ] - }, - "endpointDependency": { // only when components.endpointDependency - "nodeMetrics": [ /* center, ring, secondary */ ], - "linkMetrics": [ /* server-only — endpoint relations have no client side in OAP */ ] - } -} -``` - -### Step 7 — Validate - -From `apps/bff`: -```bash -pnpm exec tsc --noEmit # schema still typechecks -node --import tsx -e "import('./src/layers/loader.ts').then(m => { const t = m.getLayerTemplate('<KEY>'); console.log(JSON.stringify({key:t.key, components:t.components, headerCols:t.header.columns.length, dashScopes:Object.keys(t.dashboards||{}), topo:!!t.topology, epDep:!!t.endpointDependency}, null, 2)); })" -``` - -Sanity-check the JSON loads and surfaces all expected fields. If the BFF is running, hit `/api/menu` and `/api/layer/<key>/landing` against the demo OAP to confirm the picker columns resolve. - ---- - -## Common pitfalls - -1. **Scope mismatch — SERVICE_INSTANCE metrics in service dashboards.** A bare per-instance metric (e.g. `meter_oap_instance_cpu_percentage`, catalog `SERVICE_INSTANCE`) **does not work** in a service-scope picker / overview / line widget. You have three options: - - **Picker column / overview / card** — wrap with `avg(...)` or `sum(...)` to coerce to a single scalar (e.g. `avg(meter_oap_instance_cpu_percentage)`). - - **Service-dashboard line widget showing per-instance trends** — must be a **`top_n(...)` `top` widget**, never a bare `line`. The line widget at service scope can only render SERVICE-scope metrics. - - **Per-instance metric on the instance scope** — put it under `dashboards.instance` and reference the bare metric; it works fine at instance entity. - The same rule applies one level deeper for ENDPOINT-scope metrics in service dashboards — use `top_n` or aggregate. -2. **Labeled vs regular OAL/MAL output.** `aggregate_labels(metric, sum)` is only valid for LABELED_VALUE metrics. For REGULAR_VALUE metrics use plain `sum(metric)` / `avg(metric)`. Mixing them produces "result is not a labeled result" errors. Check `listMetrics(regex:^<name>$)` on the demo OAP to confirm catalog + type before composing. -3. **Card MQE includes `avg(...)`** — booster wraps card values in `avg(...)` because their card renderer expects a scalar. Horizon's card path *also* averages, so leaving the `avg(...)` works but is redundant. **For consistency, unwrap a single outer `avg()` from card expressions** unless removing it changes the shape (e.g. `avg(x)/100` becomes `x/100`, but `avg(x{p='99'})/100` stays as-is because the inner `{p='99'}` is the dimension we're averaging across). -2. **24-col → 12-col grid** — `w: 8` (booster) becomes `span: 4` (Horizon). Don't paste 24-col widths or widgets overflow the grid. -3. **InstanceList / EndpointList** — these are *not* widgets in Horizon. They're the layer page's built-in pickers. The booster widget's `subExpressions` array (per-row MQE) is the closest analogue; drop it (Horizon's landing route picks per-row metrics from `header.columns`). -4. **Relation files** — most of a `*-service-relation.json` collapses into the top-level `topology` block (server + client edge metrics). Don't replicate the relation widgets as flat `dashboards.dependency` entries unless they're metrics the topology renderer can't surface (e.g. relation-specific record widgets). -5. **`visibleWhen`** — booster instance dashboards bundle JVM / CLR / Golang / Python / Ruby / Spring widgets in the same scope, and the SPA hides ones with no data. Carry this over with `"visibleWhen": "<first_metric> has value"` so the Horizon Instance view stays clean for a single-runtime instance. -6. **No `Co-Authored-By` footers on commits** — per project `CLAUDE.md` and stored memory. - ---- - -## Reference: General layer migration - -The General layer (already migrated) is the canonical reference. Compare: - -- Upstream files: `general-root.json`, `general-service.json`, `general-instance.json`, `general-endpoint.json`, `general-service-relation.json`, `general-instance-relation.json`, `general-endpoint-relation.json` (4565 lines total across 7 files) -- Horizon output: `apps/bff/src/bundled_templates/layers/general.json` (989 lines, one file) - -Key transformations applied (use as a recipe): -- 7 nested `Tab` files → flat `dashboards.{service,instance,endpoint}` arrays (dependency + topology folded into top-level blocks) -- Card MQE unwrapped (`avg(service_apdex)/10000` → in card context Horizon does the avg, but kept as-is when the math is inside) -- Picker columns derived from the booster Overview tab's three card metrics: cpm / apdex / error rate -- Topology block merged `general-service-relation.json`'s `linkServerExpressions` / `linkClientExpressions` into a single block with four-metric server + client families (RPM / respTime / p95 / SLA) -- Slow-statements widget converted from booster's `TopList` with RECORD-typed MQE to Horizon's `record` type -- Multi-runtime instance widgets (JVM / CLR / Sleuth / Go / Python / Ruby) all carry `visibleWhen: "<first_metric> has value"` so instances only render the relevant rows -- `components.{traceProfiling, ebpfProfiling, asyncProfiling}` set from the three booster profiling tabs diff --git a/apps/bff/src/http/query/trace.ts b/apps/bff/src/http/query/trace.ts index ab42c75..21da88d 100644 --- a/apps/bff/src/http/query/trace.ts +++ b/apps/bff/src/http/query/trace.ts @@ -28,8 +28,9 @@ * slot. The UI renders two tables side-by-side; there's no field * mapping between the two — zipkin spans keep their zipkin shape. * - * Native v2 vs v3 is auto-detected via {@link detectTraceProtocol} - * — the caller doesn't need to know which OAP version is answering. + * The native query (`queryTraces` vs `queryBasicTraces`) is + * auto-detected via {@link detectTraceQueryApi} — the caller doesn't + * need to know which one the OAP backend answers with. */ import type { FastifyInstance, FastifyReply, FastifyRequest } from 'fastify'; @@ -52,7 +53,7 @@ import type { SessionStore } from '../../user/sessions.js'; import { requireAuth } from '../../user/middleware.js'; import { graphqlPost, buildOapOpts, type GraphqlOptions } from '../../client/graphql.js'; import { getLayerTemplate, tracesConfigFor } from '../../logic/layers/loader.js'; -import { detectTraceProtocol } from '../../util/trace-protocol-cache.js'; +import { detectTraceQueryApi } from '../../util/trace-protocol-cache.js'; import { zipkinFetchTraces, zipkinFetchTraceById, summariseZipkinTrace } from '../../client/zipkin.js'; export interface TraceRouteDeps { @@ -125,7 +126,7 @@ const LIST_SERVICES_FOR_RESOLVE = /* GraphQL */ ` } `; -const QUERY_BASIC_TRACES_V3 = /* GraphQL */ ` +const QUERY_BASIC_TRACES = /* GraphQL */ ` query QueryBasicTraces($condition: TraceQueryCondition) { data: queryBasicTraces(condition: $condition) { traces { @@ -140,8 +141,8 @@ const QUERY_BASIC_TRACES_V3 = /* GraphQL */ ` } `; -const QUERY_TRACES_V2 = /* GraphQL */ ` - query QueryTracesV2($condition: TraceQueryCondition) { +const QUERY_TRACES = /* GraphQL */ ` + query QueryTraces($condition: TraceQueryCondition) { data: queryTraces(condition: $condition) { traces { spans { @@ -256,7 +257,7 @@ async function fetchNativeList( body: TraceListBody, layerKey: string, ): Promise<NativeTraceListResponse> { - const protocol = await detectTraceProtocol(opts); + const api = await detectTraceQueryApi(opts); // Explicit start+end takes precedence over windowMinutes; falling // back to the rolling default when the explicit range is invalid. const explicit = body.start && body.end ? explicitWindow(body.start, body.end) : null; @@ -271,7 +272,7 @@ async function fetchNativeList( } catch (err) { return { source: 'native', - protocol, + api, traces: [], reachable: false, error: err instanceof Error ? err.message : String(err), @@ -279,10 +280,10 @@ async function fetchNativeList( } const condition = buildTraceCondition(body, serviceId, window); try { - if (protocol === 'v2') { + if (api === 'queryTraces') { const env = await graphqlPost<{ data: { traces: Array<{ spans: NativeSpan[] }> }; - }>(opts, QUERY_TRACES_V2, { condition }); + }>(opts, QUERY_TRACES, { condition }); const traces = (env.data?.traces ?? []).map((t) => { const root = t.spans.find((s) => s.parentSpanId === -1) ?? t.spans[0]; const ids = Array.from(new Set(t.spans.map((s) => s.traceId))); @@ -297,7 +298,7 @@ async function fetchNativeList( spans: t.spans, }; }); - return { source: 'native', protocol, traces, reachable: true }; + return { source: 'native', api, traces, reachable: true }; } const env = await graphqlPost<{ data: { @@ -310,7 +311,7 @@ async function fetchNativeList( traceIds: string[]; }>; }; - }>(opts, QUERY_BASIC_TRACES_V3, { condition }); + }>(opts, QUERY_BASIC_TRACES, { condition }); const traces = (env.data?.traces ?? []).map((t) => ({ key: t.key, segmentId: t.key, @@ -320,11 +321,11 @@ async function fetchNativeList( isError: t.isError, traceIds: t.traceIds, })); - return { source: 'native', protocol, traces, reachable: true }; + return { source: 'native', api, traces, reachable: true }; } catch (err) { return { source: 'native', - protocol, + api, traces: [], reachable: false, error: err instanceof Error ? err.message : String(err), @@ -403,7 +404,7 @@ export function registerTraceRoutes(app: FastifyInstance, deps: TraceRouteDeps): const opts = buildOapOpts(deps.config.current, deps.fetch); if (source === 'native') { - const protocol = await detectTraceProtocol(opts); + const api = await detectTraceQueryApi(opts); try { const env = await graphqlPost<{ trace: { spans: NativeSpan[] } }>( opts, @@ -412,7 +413,7 @@ export function registerTraceRoutes(app: FastifyInstance, deps: TraceRouteDeps): ); const detail: NativeTraceDetailResponse = { source: 'native', - protocol, + api, traceId: params.traceId, spans: env.trace?.spans ?? [], reachable: true, @@ -425,7 +426,7 @@ export function registerTraceRoutes(app: FastifyInstance, deps: TraceRouteDeps): } catch (err) { const detail: NativeTraceDetailResponse = { source: 'native', - protocol, + api, traceId: params.traceId, spans: [], reachable: false, diff --git a/apps/bff/src/logic/layers/loader.ts b/apps/bff/src/logic/layers/loader.ts index 153d823..c82d99f 100644 --- a/apps/bff/src/logic/layers/loader.ts +++ b/apps/bff/src/logic/layers/loader.ts @@ -213,8 +213,9 @@ export interface LayerTemplate { /** Traces tab config. The `source` field picks which trace backend * the UI's filter selector defaults to (`both` shows two parallel * tables; `native` / `zipkin` pin to one). Default `both` when - * absent. The v2-vs-v3 split for native traces is decided at - * runtime by probing `hasQueryTracesV2Support`, not in this config. */ + * absent. The native query choice (`queryTraces` vs + * `queryBasicTraces`) is decided at runtime by probing + * `hasQueryTracesV2Support`, not in this config. */ traces?: TracesConfig; /** Logs tab config. Some layers carry per-instance logs (Istio Data * Plane / sidecar access logs, eBPF profiling targets) — they need diff --git a/apps/bff/src/util/trace-protocol-cache.ts b/apps/bff/src/util/trace-protocol-cache.ts index 0887eb0..79eaeb9 100644 --- a/apps/bff/src/util/trace-protocol-cache.ts +++ b/apps/bff/src/util/trace-protocol-cache.ts @@ -16,16 +16,21 @@ */ /** - * Tiny per-process cache for the trace-protocol probe. + * Tiny per-process cache for the native trace-query-API probe. * - * OAP 9.6+ exposes `queryTraces` (v2) which returns the list AND - * the spans inline in a single roundtrip. Older OAP only exposes - * `queryBasicTraces` (v3) which returns trace summaries; the SPA - * fetches each detail via `queryTrace(traceId)`. + * OAP exposes two native trace queries. The Trace Query v2 API + * (`queryTraces`) returns the list AND the spans inline in a single + * roundtrip — but OAP only supports it on the BanyanDB storage + * backend. On every other backend (Elasticsearch, …) it is + * unavailable, so the BFF uses the Trace Query v1 API + * (`queryBasicTraces`), which returns trace/segment summaries; the SPA + * then fetches each detail via `queryTrace(traceId)`. * - * The v2 query type carries a sibling boolean `hasQueryTracesV2Support` - * we can probe once and cache. Cached for 5 minutes so an OAP rollover - * or a redeploy is picked up without an explicit reload. + * OAP advertises v2 support through the sibling boolean + * `hasQueryTracesV2Support` (true iff the storage backend is + * BanyanDB), which we probe once and cache for 5 minutes so an OAP + * rollover or a backend switch is picked up without an explicit + * reload. * * The decision is per OAP target (keyed by `queryUrl`) — when the * operator points horizon at a different cluster, the probe runs @@ -33,12 +38,10 @@ */ import { graphqlPost } from '../client/graphql.js'; -import type { FetchLike } from '@skywalking-horizon-ui/api-client'; - -export type TraceProtocol = 'v2' | 'v3'; +import type { FetchLike, TraceQueryApi } from '@skywalking-horizon-ui/api-client'; interface CacheEntry { - protocol: TraceProtocol; + api: TraceQueryApi; expiresAt: number; } @@ -55,35 +58,35 @@ interface ProbeOpts { } const PROBE_QUERY = /* GraphQL */ ` - query ProbeQueryTracesV2 { + query ProbeQueryTracesSupport { hasQueryTracesV2Support } `; /** - * Returns the protocol the BFF should use for trace queries against - * this OAP target. Probes once and caches; falls back to v3 on - * probe failure since v3's `queryBasicTraces` is supported by every - * shipped OAP version. + * Returns the native trace query the BFF should use against this OAP + * target. Probes once and caches; falls back to `queryBasicTraces` on + * probe failure since it is supported on every storage backend. */ -export async function detectTraceProtocol(opts: ProbeOpts): Promise<TraceProtocol> { +export async function detectTraceQueryApi(opts: ProbeOpts): Promise<TraceQueryApi> { const now = Date.now(); const cached = cache.get(opts.queryUrl); - if (cached && cached.expiresAt > now) return cached.protocol; - let protocol: TraceProtocol = 'v3'; + if (cached && cached.expiresAt > now) return cached.api; + let api: TraceQueryApi = 'queryBasicTraces'; try { const data = await graphqlPost<{ hasQueryTracesV2Support?: boolean }>(opts, PROBE_QUERY); - if (data.hasQueryTracesV2Support === true) protocol = 'v2'; + if (data.hasQueryTracesV2Support === true) api = 'queryTraces'; } catch { - // Probe failed — older OAP doesn't have the field, GraphQL errors - // out. Cache v3 so we don't re-probe on every list call. + // Probe failed — an OAP without the field errors the GraphQL out. + // Cache `queryBasicTraces` (works on every backend) so we don't + // re-probe on every list call. } - cache.set(opts.queryUrl, { protocol, expiresAt: now + CACHE_TTL_MS }); - return protocol; + cache.set(opts.queryUrl, { api, expiresAt: now + CACHE_TTL_MS }); + return api; } /** Force-invalidate the cache. Wired into the future "Refresh" admin * affordance; not used by the runtime today. */ -export function invalidateTraceProtocolCache(): void { +export function invalidateTraceQueryApiCache(): void { cache.clear(); } diff --git a/apps/ui/src/layer/traces/LayerTracesView.vue b/apps/ui/src/layer/traces/LayerTracesView.vue index d3ee54a..d0f52d0 100644 --- a/apps/ui/src/layer/traces/LayerTracesView.vue +++ b/apps/ui/src/layer/traces/LayerTracesView.vue @@ -203,6 +203,16 @@ const { native, isFetching, refetch } = useLayerTraces(layerKey, { enabled: queryEnabled, }); +// Which OAP query answered. `queryBasicTraces` (Trace Query v1 API) +// returns trace SEGMENTS — each row is one segment and the full trace +// is fetched on click via queryTrace. `queryTraces` (v2, BanyanDB +// only) returns whole traces with spans inline, rendered immediately +// on selection. The banner states the API and persists across the +// browse + detail views so operators always know what a row is. +const isSegmentList = computed(() => native.value?.api === 'queryBasicTraces'); +const traceApiLabel = computed(() => (native.value?.api === 'queryTraces' ? 'v2' : 'v1')); +const showApiBanner = computed(() => hasQueried.value && !!native.value?.reachable); + /** * Commit live filter values to the committed refs, then fire the * query. This is the only path that fetches — filter inputs don't @@ -1153,13 +1163,26 @@ onBeforeUnmount(() => window.removeEventListener('keydown', onPageKeyDown, true) </section> </div> + <!-- Persists across browse + detail so the active trace-query API + (and what a row represents) stays visible after a click. --> + <div v-if="showApiBanner" class="tr-api-banner"> + This OAP serves traces via <b>Trace Query {{ traceApiLabel }} API</b> + (<code>{{ native?.api }}</code>). + <template v-if="isSegmentList"> + Each row is a trace <b>segment</b> — click one to fetch its full trace. + </template> + <template v-else> + Full traces are returned inline. + </template> + </div> + <!-- Browsing mode: full-width list when no trace is selected --> <template v-if="!selectedTraceId"> <article class="tr-list-card sw-card"> <header class="tr-list-head"> - <h4>Traces</h4> + <h4>{{ isSegmentList ? 'Segments' : 'Traces' }}</h4> <span v-if="native?.error" class="err-chip" :title="native.error">unreachable</span> - <span v-if="native" class="hint">{{ native.traces.length }} traces</span> + <span v-if="native" class="hint">{{ native.traces.length }} {{ isSegmentList ? 'segments' : 'traces' }}</span> </header> <div v-if="!hasQueried" class="tr-empty"> Pick your conditions, then click <b>Run query</b>. @@ -1215,7 +1238,7 @@ onBeforeUnmount(() => window.removeEventListener('keydown', onPageKeyDown, true) <button class="rail-handle" type="button" :title="railOpen ? 'Collapse list' : 'Expand list'" @click="railOpen = !railOpen"> <span v-if="railOpen">«</span><span v-else>»</span> </button> - <h4 v-if="railOpen">Traces</h4> + <h4 v-if="railOpen">{{ isSegmentList ? 'Segments' : 'Traces' }}</h4> <span v-if="railOpen && native" class="hint">{{ native.traces.length }}</span> </header> <ul v-if="railOpen && visibleTraces.length" class="tr-rowlist rail-list"> @@ -1917,6 +1940,24 @@ onBeforeUnmount(() => window.removeEventListener('keydown', onPageKeyDown, true) background: rgba(239, 68, 68, 0.18); color: var(--sw-err); } +.tr-api-banner { + padding: 7px 12px; + border: 1px solid var(--sw-line); + border-radius: 6px; + background: var(--sw-bg-2); + color: var(--sw-fg-2); + font-size: 11px; + line-height: 1.5; +} +.tr-api-banner code { + font-family: var(--sw-mono); + font-size: 10.5px; + padding: 0 3px; + border-radius: 3px; + background: var(--sw-bg-3); + color: var(--sw-accent); +} +.tr-api-banner b { color: var(--sw-fg-0); } .tr-empty { padding: 24px; text-align: center; color: var(--sw-fg-3); font-size: 11.5px; } .tr-rowlist { list-style: none; diff --git a/packages/api-client/src/index.ts b/packages/api-client/src/index.ts index 9f372e1..2167da4 100644 --- a/packages/api-client/src/index.ts +++ b/packages/api-client/src/index.ts @@ -68,6 +68,7 @@ export type { export type { TraceSource, TracesConfig, + TraceQueryApi, TraceKeyValue, TraceLogEntry, TraceAttachedEvent, diff --git a/packages/api-client/src/trace.ts b/packages/api-client/src/trace.ts index 33947ed..bdba07e 100644 --- a/packages/api-client/src/trace.ts +++ b/packages/api-client/src/trace.ts @@ -25,10 +25,11 @@ * we do NOT normalise into a "common" trace; each backend's fields * are surfaced verbatim, with its own waterfall renderer. * - * Native traces themselves split into v2 (one call, spans inline) - * and v3 (two calls, segment list + span fetch by traceId). The BFF - * picks the right one based on `hasQueryTracesV2Support` and the - * caller doesn't need to know which. + * Native traces are served by one of two OAP queries: `queryTraces` + * (one call, spans inline) or `queryBasicTraces` (segment list, then + * a per-trace `queryTrace` fetch by id). The BFF picks the right one + * based on `hasQueryTracesV2Support` and the caller doesn't need to + * know which. */ // ── Trace source selector ────────────────────────────────────────── @@ -41,7 +42,16 @@ export interface TracesConfig { source: TraceSource; } -// ── Native trace types (SkyWalking v2 + v3 share the span shape) ─── +// ── Native trace types (both OAP queries share the span shape) ───── + +/** Which OAP query served the native trace list/detail. Driven by the + * OAP storage backend, not its version: + * - `queryTraces` — Trace Query v2 API; returns the whole trace + * (spans inline). Only available on the BanyanDB backend. + * - `queryBasicTraces` — Trace Query v1 API; returns segment + * summaries, the full trace is fetched on demand via + * `queryTrace(traceId)`. Available on every backend (ES, …). */ +export type TraceQueryApi = 'queryTraces' | 'queryBasicTraces'; export interface TraceKeyValue { key: string; @@ -89,10 +99,10 @@ export interface NativeSpan { attachedEvents: TraceAttachedEvent[]; } -/** One row in the trace list. v2 + v3 share this shape. The +/** One row in the trace list. Both queries share this shape. The * segmentId / traceIds pair lets the operator open the full trace - * through the v3 fetch path; v2 already embeds spans so the list - * endpoint may surface them directly. */ + * through the `queryTrace`-by-id fetch path; `queryTraces` already + * embeds spans so the list endpoint may surface them directly. */ export interface NativeTraceListRow { key: string; segmentId: string; @@ -101,9 +111,9 @@ export interface NativeTraceListRow { start: string; isError: boolean; traceIds: string[]; - /** Only populated when the BFF served the list via v2 (spans - * inline). v3 returns these undefined; the SPA fetches detail on - * demand via the trace-by-id endpoint. */ + /** Only populated when the BFF served the list via `queryTraces` + * (spans inline). `queryBasicTraces` returns these undefined; the + * SPA fetches detail on demand via the trace-by-id endpoint. */ spans?: NativeSpan[]; } @@ -112,9 +122,10 @@ export type TraceQueryState = 'ALL' | 'SUCCESS' | 'ERROR'; export interface NativeTraceListResponse { source: 'native'; - /** Which OAP family answered — informational, surfaced as a small - * chip in the UI so the operator knows what they're looking at. */ - protocol: 'v2' | 'v3'; + /** Which OAP query answered — informational, lets the SPA decide + * whether list rows already carry spans (`queryTraces`) or need a + * follow-up `queryTrace` fetch (`queryBasicTraces`). */ + api: TraceQueryApi; traces: NativeTraceListRow[]; reachable: boolean; error?: string; @@ -122,7 +133,7 @@ export interface NativeTraceListResponse { export interface NativeTraceDetailResponse { source: 'native'; - protocol: 'v2' | 'v3'; + api: TraceQueryApi; traceId: string; spans: NativeSpan[]; reachable: boolean;
