laskoviymishka commented on code in PR #1213: URL: https://github.com/apache/iceberg-go/pull/1213#discussion_r3417977775
########## catalog/rest/scan_planning.go: ########## @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is a PROPOSED public API surface for REST server-side scan +// planning (apache/iceberg-go#1178). The bodies are intentionally +// unimplemented; the file exists so the REST surface can be reviewed as Go. +// Endpoint capability discovery (Endpoint, SupportsEndpoint) lands separately +// in the Phase 0 PR and is intentionally not redeclared here. + +package rest + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/apache/iceberg-go/table" +) + +// Compile-time proof that the REST catalog satisfies the table planner seam. +var _ table.ScanPlanner = (*Catalog)(nil) + +// ErrPlanExpired is returned when polling a plan-id the server no longer knows +// about (HTTP 404 while polling), distinct from a table-not-found 404. 
+var ErrPlanExpired = fmt.Errorf("%w: scan plan expired", ErrRESTError) + +// --- Capability gating (Open Question 2) ------------------------------------ +// +// A single capability check is too coarse: requiring all four endpoints falls +// back to local against sync-only servers, while requiring only the plan +// endpoint false-positives, because planTableScan can return `submitted` or +// `plan-tasks` that need the poll/fetch endpoints. The split below lets `auto` +// use a sync-only server while reserving the async/fanout path for servers +// that advertise everything. + +// SupportsPlanTableScan reports whether the server advertised the synchronous +// plan endpoint. +func (c *Catalog) SupportsPlanTableScan() bool { + panic("unimplemented: proposed API for #1178") +} + +// SupportsFullRemoteScanPlanning reports whether the server advertised all four +// scan-planning endpoints (plan, fetch-result, cancel, fetch-tasks). +func (c *Catalog) SupportsFullRemoteScanPlanning() bool { + panic("unimplemented: proposed API for #1178") +} + +// --- table.ScanPlanner implementation --------------------------------------- + +// SupportsRemoteScanPlanning reports whether this catalog can complete a remote +// plan end-to-end; backed by the split capability checks above. +func (c *Catalog) SupportsRemoteScanPlanning() bool { + panic("unimplemented: proposed API for #1178") +} + +// PlanFiles plans a scan server-side and returns tasks (and, optionally, a +// plan-scoped FileIO) for the table to read. +func (c *Catalog) PlanFiles(ctx context.Context, req table.ScanPlanningRequest) (table.ScanPlanningResult, error) { + panic("unimplemented: proposed API for #1178") +} + +// --- Low-level client methods ----------------------------------------------- + +// PlanTableScan submits a scan plan. The result is either completed inline, +// submitted (returns a plan-id to poll), or failed. 
+func (c *Catalog) PlanTableScan(ctx context.Context, ident table.Identifier, req PlanTableScanRequest) (PlanTableScanResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// FetchPlanningResult polls a previously submitted plan. +func (c *Catalog) FetchPlanningResult(ctx context.Context, ident table.Identifier, planID string) (FetchPlanningResultResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// CancelPlanning cancels a server-side plan. Callers should cancel on context +// cancellation using a detached context with a short timeout. +func (c *Catalog) CancelPlanning(ctx context.Context, ident table.Identifier, planID string) error { + panic("unimplemented: proposed API for #1178") +} + +// FetchScanTasks fetches the scan tasks for a plan-task handle returned by a +// completed plan. +func (c *Catalog) FetchScanTasks(ctx context.Context, ident table.Identifier, req FetchScanTasksRequest) (FetchScanTasksResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// WaitForPlan submits and polls a plan to completion using jittered backoff, +// cancelling the server-side plan if the context is cancelled. +func (c *Catalog) WaitForPlan(ctx context.Context, ident table.Identifier, planID string, opts WaitForPlanOptions) (FetchPlanningResultResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// --- Wire types (sketch) ---------------------------------------------------- +// +// Field-complete request/response decoding (content-file JSON, residuals, +// storage credentials) lands with the scan-task decoder PR; these sketch the +// request/response envelopes so the client surface compiles and reads. + +// PlanStatus is the status of a server-side plan. 
+type PlanStatus string + +const ( + PlanStatusCompleted PlanStatus = "completed" + PlanStatusSubmitted PlanStatus = "submitted" + PlanStatusCancelled PlanStatus = "cancelled" + PlanStatusFailed PlanStatus = "failed" +) + +// PlanTableScanRequest is the POST .../plan request body. Filter is the +// ExpressionParser-format JSON produced by iceberg.MarshalExpressionJSON. +type PlanTableScanRequest struct { + SnapshotID *int64 `json:"snapshot-id,omitempty"` + StartSnapshotID *int64 `json:"start-snapshot-id,omitempty"` + EndSnapshotID *int64 `json:"end-snapshot-id,omitempty"` + Select []string `json:"select,omitempty"` + Filter json.RawMessage `json:"filter,omitempty"` + CaseSensitive *bool `json:"case-sensitive,omitempty"` + UseSnapshotSchema *bool `json:"use-snapshot-schema,omitempty"` +} + +// PlanTableScanResponse is the POST .../plan response envelope. +type PlanTableScanResponse struct { + PlanStatus PlanStatus `json:"plan-status"` + PlanID *string `json:"plan-id,omitempty"` + // file-scan-tasks, delete-files, plan-tasks, storage-credentials decoded + // by the scan-task decoder PR. +} + +// FetchPlanningResultResponse is the GET .../plan/{plan-id} response envelope. +type FetchPlanningResultResponse struct { + PlanStatus PlanStatus `json:"plan-status"` Review Comment: Same union collapse as `PlanTableScanResponse`, but this one is worse because it's `WaitForPlan`'s return type. `CompletedPlanningResult` carries `file-scan-tasks`, `delete-files`, `plan-tasks`, and `storage-credentials` — and `storage-credentials` is the only way a client gets vended credentials, so the `PlanFiles -> ReadTasks` path can't read data even after a successful remote plan if this stays status-only. I'd not reuse `FetchPlanningResultResponse` as `WaitForPlan`'s return; return a typed completed-plan result and at least sketch `StorageCredentials` and the task fields so the shape is stable for the decoder PR. What do you think? 
########## catalog/rest/scan_planning.go: ########## @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is a PROPOSED public API surface for REST server-side scan +// planning (apache/iceberg-go#1178). The bodies are intentionally +// unimplemented; the file exists so the REST surface can be reviewed as Go. +// Endpoint capability discovery (Endpoint, SupportsEndpoint) lands separately +// in the Phase 0 PR and is intentionally not redeclared here. + +package rest + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/apache/iceberg-go/table" +) + +// Compile-time proof that the REST catalog satisfies the table planner seam. +var _ table.ScanPlanner = (*Catalog)(nil) + +// ErrPlanExpired is returned when polling a plan-id the server no longer knows +// about (HTTP 404 while polling), distinct from a table-not-found 404. 
+var ErrPlanExpired = fmt.Errorf("%w: scan plan expired", ErrRESTError) + +// --- Capability gating (Open Question 2) ------------------------------------ +// +// A single capability check is too coarse: requiring all four endpoints falls +// back to local against sync-only servers, while requiring only the plan +// endpoint false-positives, because planTableScan can return `submitted` or +// `plan-tasks` that need the poll/fetch endpoints. The split below lets `auto` +// use a sync-only server while reserving the async/fanout path for servers +// that advertise everything. + +// SupportsPlanTableScan reports whether the server advertised the synchronous +// plan endpoint. +func (c *Catalog) SupportsPlanTableScan() bool { + panic("unimplemented: proposed API for #1178") +} + +// SupportsFullRemoteScanPlanning reports whether the server advertised all four +// scan-planning endpoints (plan, fetch-result, cancel, fetch-tasks). +func (c *Catalog) SupportsFullRemoteScanPlanning() bool { + panic("unimplemented: proposed API for #1178") +} + +// --- table.ScanPlanner implementation --------------------------------------- + +// SupportsRemoteScanPlanning reports whether this catalog can complete a remote +// plan end-to-end; backed by the split capability checks above. +func (c *Catalog) SupportsRemoteScanPlanning() bool { + panic("unimplemented: proposed API for #1178") +} + +// PlanFiles plans a scan server-side and returns tasks (and, optionally, a +// plan-scoped FileIO) for the table to read. +func (c *Catalog) PlanFiles(ctx context.Context, req table.ScanPlanningRequest) (table.ScanPlanningResult, error) { + panic("unimplemented: proposed API for #1178") +} + +// --- Low-level client methods ----------------------------------------------- + +// PlanTableScan submits a scan plan. The result is either completed inline, +// submitted (returns a plan-id to poll), or failed. 
+func (c *Catalog) PlanTableScan(ctx context.Context, ident table.Identifier, req PlanTableScanRequest) (PlanTableScanResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// FetchPlanningResult polls a previously submitted plan. +func (c *Catalog) FetchPlanningResult(ctx context.Context, ident table.Identifier, planID string) (FetchPlanningResultResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// CancelPlanning cancels a server-side plan. Callers should cancel on context +// cancellation using a detached context with a short timeout. +func (c *Catalog) CancelPlanning(ctx context.Context, ident table.Identifier, planID string) error { + panic("unimplemented: proposed API for #1178") +} + +// FetchScanTasks fetches the scan tasks for a plan-task handle returned by a +// completed plan. +func (c *Catalog) FetchScanTasks(ctx context.Context, ident table.Identifier, req FetchScanTasksRequest) (FetchScanTasksResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// WaitForPlan submits and polls a plan to completion using jittered backoff, +// cancelling the server-side plan if the context is cancelled. +func (c *Catalog) WaitForPlan(ctx context.Context, ident table.Identifier, planID string, opts WaitForPlanOptions) (FetchPlanningResultResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// --- Wire types (sketch) ---------------------------------------------------- +// +// Field-complete request/response decoding (content-file JSON, residuals, +// storage credentials) lands with the scan-task decoder PR; these sketch the +// request/response envelopes so the client surface compiles and reads. + +// PlanStatus is the status of a server-side plan. 
+type PlanStatus string + +const ( + PlanStatusCompleted PlanStatus = "completed" + PlanStatusSubmitted PlanStatus = "submitted" + PlanStatusCancelled PlanStatus = "cancelled" + PlanStatusFailed PlanStatus = "failed" +) + +// PlanTableScanRequest is the POST .../plan request body. Filter is the +// ExpressionParser-format JSON produced by iceberg.MarshalExpressionJSON. +type PlanTableScanRequest struct { + SnapshotID *int64 `json:"snapshot-id,omitempty"` + StartSnapshotID *int64 `json:"start-snapshot-id,omitempty"` + EndSnapshotID *int64 `json:"end-snapshot-id,omitempty"` + Select []string `json:"select,omitempty"` + Filter json.RawMessage `json:"filter,omitempty"` + CaseSensitive *bool `json:"case-sensitive,omitempty"` + UseSnapshotSchema *bool `json:"use-snapshot-schema,omitempty"` +} + +// PlanTableScanResponse is the POST .../plan response envelope. +type PlanTableScanResponse struct { + PlanStatus PlanStatus `json:"plan-status"` Review Comment: The json tag here is `plan-status`, but the spec field is `status` — the `PlanTableScanResult` oneOf discriminates on `propertyName: status`, and PyIceberg's `Plan*` models all use `status` with no alias. A conforming server returns `{"status": "completed"}`, so this client unmarshals to the zero-value `PlanStatus` ("") on every real response. I'd change the tag to `json:"status"` here and on `FetchPlanningResultResponse` below, and rename the field to `Status PlanStatus` while we're at it to drop the field/type-name collision. ########## catalog/rest/scan_planning.go: ########## @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is a PROPOSED public API surface for REST server-side scan +// planning (apache/iceberg-go#1178). The bodies are intentionally +// unimplemented; the file exists so the REST surface can be reviewed as Go. +// Endpoint capability discovery (Endpoint, SupportsEndpoint) lands separately +// in the Phase 0 PR and is intentionally not redeclared here. + +package rest + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/apache/iceberg-go/table" +) + +// Compile-time proof that the REST catalog satisfies the table planner seam. +var _ table.ScanPlanner = (*Catalog)(nil) + +// ErrPlanExpired is returned when polling a plan-id the server no longer knows +// about (HTTP 404 while polling), distinct from a table-not-found 404. +var ErrPlanExpired = fmt.Errorf("%w: scan plan expired", ErrRESTError) + +// --- Capability gating (Open Question 2) ------------------------------------ +// +// A single capability check is too coarse: requiring all four endpoints falls +// back to local against sync-only servers, while requiring only the plan +// endpoint false-positives, because planTableScan can return `submitted` or +// `plan-tasks` that need the poll/fetch endpoints. The split below lets `auto` +// use a sync-only server while reserving the async/fanout path for servers +// that advertise everything. + +// SupportsPlanTableScan reports whether the server advertised the synchronous +// plan endpoint. 
+func (c *Catalog) SupportsPlanTableScan() bool { + panic("unimplemented: proposed API for #1178") +} + +// SupportsFullRemoteScanPlanning reports whether the server advertised all four +// scan-planning endpoints (plan, fetch-result, cancel, fetch-tasks). +func (c *Catalog) SupportsFullRemoteScanPlanning() bool { + panic("unimplemented: proposed API for #1178") +} + +// --- table.ScanPlanner implementation --------------------------------------- + +// SupportsRemoteScanPlanning reports whether this catalog can complete a remote +// plan end-to-end; backed by the split capability checks above. +func (c *Catalog) SupportsRemoteScanPlanning() bool { + panic("unimplemented: proposed API for #1178") +} + +// PlanFiles plans a scan server-side and returns tasks (and, optionally, a +// plan-scoped FileIO) for the table to read. +func (c *Catalog) PlanFiles(ctx context.Context, req table.ScanPlanningRequest) (table.ScanPlanningResult, error) { + panic("unimplemented: proposed API for #1178") +} + +// --- Low-level client methods ----------------------------------------------- + +// PlanTableScan submits a scan plan. The result is either completed inline, +// submitted (returns a plan-id to poll), or failed. +func (c *Catalog) PlanTableScan(ctx context.Context, ident table.Identifier, req PlanTableScanRequest) (PlanTableScanResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// FetchPlanningResult polls a previously submitted plan. +func (c *Catalog) FetchPlanningResult(ctx context.Context, ident table.Identifier, planID string) (FetchPlanningResultResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// CancelPlanning cancels a server-side plan. Callers should cancel on context +// cancellation using a detached context with a short timeout. 
+func (c *Catalog) CancelPlanning(ctx context.Context, ident table.Identifier, planID string) error { + panic("unimplemented: proposed API for #1178") +} + +// FetchScanTasks fetches the scan tasks for a plan-task handle returned by a +// completed plan. +func (c *Catalog) FetchScanTasks(ctx context.Context, ident table.Identifier, req FetchScanTasksRequest) (FetchScanTasksResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// WaitForPlan submits and polls a plan to completion using jittered backoff, +// cancelling the server-side plan if the context is cancelled. +func (c *Catalog) WaitForPlan(ctx context.Context, ident table.Identifier, planID string, opts WaitForPlanOptions) (FetchPlanningResultResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// --- Wire types (sketch) ---------------------------------------------------- +// +// Field-complete request/response decoding (content-file JSON, residuals, +// storage credentials) lands with the scan-task decoder PR; these sketch the +// request/response envelopes so the client surface compiles and reads. + +// PlanStatus is the status of a server-side plan. +type PlanStatus string + +const ( + PlanStatusCompleted PlanStatus = "completed" + PlanStatusSubmitted PlanStatus = "submitted" + PlanStatusCancelled PlanStatus = "cancelled" + PlanStatusFailed PlanStatus = "failed" +) + +// PlanTableScanRequest is the POST .../plan request body. Filter is the +// ExpressionParser-format JSON produced by iceberg.MarshalExpressionJSON. 
+type PlanTableScanRequest struct { + SnapshotID *int64 `json:"snapshot-id,omitempty"` + StartSnapshotID *int64 `json:"start-snapshot-id,omitempty"` + EndSnapshotID *int64 `json:"end-snapshot-id,omitempty"` + Select []string `json:"select,omitempty"` + Filter json.RawMessage `json:"filter,omitempty"` + CaseSensitive *bool `json:"case-sensitive,omitempty"` + UseSnapshotSchema *bool `json:"use-snapshot-schema,omitempty"` +} + +// PlanTableScanResponse is the POST .../plan response envelope. +type PlanTableScanResponse struct { Review Comment: `PlanTableScanResult` is a oneOf discriminated on status, and `failed` maps to `FailedPlanningResult` (= `allOf[IcebergErrorResponse, {status: failed}]`), so a failed plan carries message/type/code. Collapsing to `{Status, PlanID}` discards all of that — a caller getting `failed` has nothing to propagate. I think this is the core decision the PR needs to make rather than defer: either a discriminated-union interface like PyIceberg's `PlanningResponse`, or a flat struct with an embedded `*IcebergError` populated only on `failed`. The "decoded by the scan-task decoder PR" note is fine for the task payload, but the error arm and the union shape are the contract, and the follow-up PR will be stuck with whatever we pick here. Which direction do you want to commit to? ########## table/scan_planning.go: ########## @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is a PROPOSED public API surface for REST server-side scan +// planning (apache/iceberg-go#1178). The bodies are intentionally +// unimplemented; the file exists so the seam can be reviewed as Go rather +// than prose. Nothing here changes existing behavior. + +package table + +import ( + "context" + + "github.com/apache/iceberg-go" +) + +// ScanPlanningMode selects how (*Scan).PlanFiles plans a scan. Local planning +// remains the default; remote planning is opt-in via WithScanPlanningMode. +type ScanPlanningMode string + +const ( + // ScanPlanningLocal always plans locally by reading manifests through the + // table's FileIO. This is the default and current behavior. + ScanPlanningLocal ScanPlanningMode = "local" + // ScanPlanningRemote requires a planner that advertises remote capability + // and fails loudly if remote planning is unavailable. + ScanPlanningRemote ScanPlanningMode = "remote" + // ScanPlanningAuto uses remote planning when available and allowed by the + // table config, otherwise falls back to local. + ScanPlanningAuto ScanPlanningMode = "auto" +) + +// WithScanPlanningMode sets the scan-planning mode for a scan. The default is +// ScanPlanningLocal unless the REST table config requires server planning. +func WithScanPlanningMode(mode ScanPlanningMode) ScanOption { + panic("unimplemented: proposed API for #1178") +} + +// ScanPlanningRequest is the input a Scan hands to a ScanPlanner. It carries +// the resolved scan state a planner needs without depending on catalog/rest. 
+// +// Open question (epic OQ4): when the table has evolved, UseSnapshotSchema must +// pin which schema binds a returned residual and the partition decode — the +// snapshot's schema (via schema-id), kept separate from each file's partition +// spec-id. Incremental scans (start/end snapshot) are deferred to a later +// phase; point-in-time SnapshotID lands first. +type ScanPlanningRequest struct { + Identifier Identifier + Metadata Metadata + MetadataLocation string + SnapshotID *int64 + SelectedFields []string + RowFilter iceberg.BooleanExpression + CaseSensitive bool Review Comment: `CaseSensitive` as a non-pointer bool can't distinguish "explicitly false" from "unset", and the spec default for case-sensitivity is `true` while Go's bool zero is `false`. The wire type `PlanTableScanRequest.CaseSensitive` is correctly `*bool`, so a converter reading this `false` and writing `*bool(false)` sends case-insensitive when the caller meant the default. If this is always populated from a `Scan` where `caseSensitive` is guaranteed initialized to `true`, a doc line saying so is enough; otherwise I'd make it `*bool` (nil = server default). ########## table/scan_planning.go: ########## @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is a PROPOSED public API surface for REST server-side scan +// planning (apache/iceberg-go#1178). The bodies are intentionally +// unimplemented; the file exists so the seam can be reviewed as Go rather +// than prose. Nothing here changes existing behavior. + +package table + +import ( + "context" + + "github.com/apache/iceberg-go" +) + +// ScanPlanningMode selects how (*Scan).PlanFiles plans a scan. Local planning +// remains the default; remote planning is opt-in via WithScanPlanningMode. +type ScanPlanningMode string + +const ( + // ScanPlanningLocal always plans locally by reading manifests through the + // table's FileIO. This is the default and current behavior. + ScanPlanningLocal ScanPlanningMode = "local" + // ScanPlanningRemote requires a planner that advertises remote capability + // and fails loudly if remote planning is unavailable. + ScanPlanningRemote ScanPlanningMode = "remote" + // ScanPlanningAuto uses remote planning when available and allowed by the + // table config, otherwise falls back to local. + ScanPlanningAuto ScanPlanningMode = "auto" Review Comment: The PR body's OQ4 describes a `server` mode — config-set by the catalog/admin, distinct from a client choosing `ScanPlanningRemote`, that makes a client's local attempt fail fast — but the enum only has local/remote/auto, so that's observable behavior with no code anchor and a 3-value enum carrying 4-value semantics. I'd add `ScanPlanningRequired ScanPlanningMode = "server"` (matching the table-property string) with a doc that it's config-set and makes `ScanPlanningLocal` fail, or at minimum a comment mapping the property value onto the three constants so Phase 5/6 doesn't have to reconcile it later. 
########## catalog/rest/scan_planning.go: ########## @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is a PROPOSED public API surface for REST server-side scan +// planning (apache/iceberg-go#1178). The bodies are intentionally +// unimplemented; the file exists so the REST surface can be reviewed as Go. +// Endpoint capability discovery (Endpoint, SupportsEndpoint) lands separately +// in the Phase 0 PR and is intentionally not redeclared here. + +package rest + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/apache/iceberg-go/table" +) + +// Compile-time proof that the REST catalog satisfies the table planner seam. +var _ table.ScanPlanner = (*Catalog)(nil) Review Comment: This assertion always passes — the methods exist, they just panic — so it proves interface satisfaction but not that the seam connects to anything. Neither `Scan` nor `Table` grows a planner or mode field, so the actual integration point (how `Scan.PlanFiles` decides to delegate vs. run local) isn't visible in the proposal. 
For a PR whose whole value is making the seam reviewable, I'd sketch the connection: an unexported `planner ScanPlanner` / `planningMode ScanPlanningMode` field on `Scan` (even commented out), so reviewers can see where the mode and the planner land. It'd also give the parity test a named anchor. Worth it here? ########## table/scan_planning.go: ########## @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is a PROPOSED public API surface for REST server-side scan +// planning (apache/iceberg-go#1178). The bodies are intentionally +// unimplemented; the file exists so the seam can be reviewed as Go rather +// than prose. Nothing here changes existing behavior. + +package table + +import ( + "context" + + "github.com/apache/iceberg-go" +) + +// ScanPlanningMode selects how (*Scan).PlanFiles plans a scan. Local planning +// remains the default; remote planning is opt-in via WithScanPlanningMode. +type ScanPlanningMode string + +const ( + // ScanPlanningLocal always plans locally by reading manifests through the + // table's FileIO. This is the default and current behavior. 
+ ScanPlanningLocal ScanPlanningMode = "local" + // ScanPlanningRemote requires a planner that advertises remote capability + // and fails loudly if remote planning is unavailable. + ScanPlanningRemote ScanPlanningMode = "remote" + // ScanPlanningAuto uses remote planning when available and allowed by the + // table config, otherwise falls back to local. + ScanPlanningAuto ScanPlanningMode = "auto" +) + +// WithScanPlanningMode sets the scan-planning mode for a scan. The default is +// ScanPlanningLocal unless the REST table config requires server planning. +func WithScanPlanningMode(mode ScanPlanningMode) ScanOption { + panic("unimplemented: proposed API for #1178") +} + +// ScanPlanningRequest is the input a Scan hands to a ScanPlanner. It carries +// the resolved scan state a planner needs without depending on catalog/rest. +// +// Open question (epic OQ4): when the table has evolved, UseSnapshotSchema must +// pin which schema binds a returned residual and the partition decode — the +// snapshot's schema (via schema-id), kept separate from each file's partition +// spec-id. Incremental scans (start/end snapshot) are deferred to a later +// phase; point-in-time SnapshotID lands first. +type ScanPlanningRequest struct { + Identifier Identifier + Metadata Metadata + MetadataLocation string + SnapshotID *int64 + SelectedFields []string + RowFilter iceberg.BooleanExpression + CaseSensitive bool + UseSnapshotSchema bool +} + +// ScanPlanningResult is what a ScanPlanner returns. +// +// Open question (OQ1): how plan-scoped FileIO reaches ReadTasks across the +// PlanFiles -> ReadTasks boundary is unsettled. IO here is one provisional +// carrier; a live FileIO should not live on FileScanTask (it has a transport +// codec). Alternatives: a richer planned-result object, an internal plan +// context on Scan, or a serializable credential handle on FileScanTask. 
+type ScanPlanningResult struct { + Tasks []FileScanTask + IO FSysF // PROVISIONAL carrier — see OQ1 Review Comment: This is the OQ1 question, and I don't think the PROVISIONAL comment buys us much — once the impl ships, `result.IO` becomes de-facto stable and downstream consumers (Arrow readers off `ReadTasks`, external CDC) will bind to it, so removing or renaming it later is a breaking change to a public type. `FSysF` is also a live closure, which is exactly the thing the comment argues shouldn't live on `FileScanTask`. The other half of this is that `(*Scan).ReadTasks` calls `scan.ioF(ctx)` directly today and never consults a `PlanFiles` result, so even if `IO` is populated here, nothing routes it into the read path without poking it back into `Scan` — the ugliness OQ1 is trying to avoid. The local-vs-remote parity test you're using as the acceptance anchor can't be written against this seam until that's resolved. Two shapes I'd consider: narrow to an opaque `PlanIO interface { Load(ctx) (icebergio.IO, error) }` (stable name, unstable body), or keep `IO` off the public result entirely for now and carry an opaque credential handle the `Scan` rebuilds a vending FileIO from. Either works, but I'd want OQ1 settled before this field is exported. How are you leaning? ########## table/scan_planning.go: ########## @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is a PROPOSED public API surface for REST server-side scan +// planning (apache/iceberg-go#1178). The bodies are intentionally +// unimplemented; the file exists so the seam can be reviewed as Go rather +// than prose. Nothing here changes existing behavior. + +package table + +import ( + "context" + + "github.com/apache/iceberg-go" +) + +// ScanPlanningMode selects how (*Scan).PlanFiles plans a scan. Local planning +// remains the default; remote planning is opt-in via WithScanPlanningMode. +type ScanPlanningMode string + +const ( + // ScanPlanningLocal always plans locally by reading manifests through the + // table's FileIO. This is the default and current behavior. + ScanPlanningLocal ScanPlanningMode = "local" + // ScanPlanningRemote requires a planner that advertises remote capability + // and fails loudly if remote planning is unavailable. + ScanPlanningRemote ScanPlanningMode = "remote" + // ScanPlanningAuto uses remote planning when available and allowed by the + // table config, otherwise falls back to local. + ScanPlanningAuto ScanPlanningMode = "auto" +) + +// WithScanPlanningMode sets the scan-planning mode for a scan. The default is +// ScanPlanningLocal unless the REST table config requires server planning. +func WithScanPlanningMode(mode ScanPlanningMode) ScanOption { + panic("unimplemented: proposed API for #1178") +} + +// ScanPlanningRequest is the input a Scan hands to a ScanPlanner. It carries +// the resolved scan state a planner needs without depending on catalog/rest. 
+// +// Open question (epic OQ4): when the table has evolved, UseSnapshotSchema must +// pin which schema binds a returned residual and the partition decode — the +// snapshot's schema (via schema-id), kept separate from each file's partition +// spec-id. Incremental scans (start/end snapshot) are deferred to a later +// phase; point-in-time SnapshotID lands first. +type ScanPlanningRequest struct { + Identifier Identifier + Metadata Metadata Review Comment: `Metadata` is a large interface (schema, snapshots, specs, sort orders, all files), and the planner really only needs snapshot id, schema, partition spec, and the metadata location. Since this struct is public, passing full `Metadata` over-specifies the contract, risks version drift between `PlanFiles` and `ReadTasks`, and forces tests to build a complete metadata stub. I'd narrow this to the fields `PlanFiles` actually consumes and drop `Metadata`. Non-blocking, but easier to widen a contract later than to shrink one. ########## table/scan_planning.go: ########## @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is a PROPOSED public API surface for REST server-side scan +// planning (apache/iceberg-go#1178). 
The bodies are intentionally +// unimplemented; the file exists so the seam can be reviewed as Go rather +// than prose. Nothing here changes existing behavior. + +package table + +import ( + "context" + + "github.com/apache/iceberg-go" +) + +// ScanPlanningMode selects how (*Scan).PlanFiles plans a scan. Local planning +// remains the default; remote planning is opt-in via WithScanPlanningMode. +type ScanPlanningMode string + +const ( + // ScanPlanningLocal always plans locally by reading manifests through the + // table's FileIO. This is the default and current behavior. + ScanPlanningLocal ScanPlanningMode = "local" + // ScanPlanningRemote requires a planner that advertises remote capability + // and fails loudly if remote planning is unavailable. + ScanPlanningRemote ScanPlanningMode = "remote" + // ScanPlanningAuto uses remote planning when available and allowed by the + // table config, otherwise falls back to local. + ScanPlanningAuto ScanPlanningMode = "auto" +) + +// WithScanPlanningMode sets the scan-planning mode for a scan. The default is +// ScanPlanningLocal unless the REST table config requires server planning. +func WithScanPlanningMode(mode ScanPlanningMode) ScanOption { Review Comment: This panics at construction, not when applied — every other `ScanOption` constructor in `table.go` is safe to call eagerly, and options usually get composed in a slice literal (`[]ScanOption{WithSnapshotID(n), WithScanPlanningMode(mode)}`), which would panic the moment it's evaluated, before `Scan()` ever runs. That breaks the "no behavior change" guarantee the rest of the PR holds. I'd return a closure that panics when applied instead: ```go func WithScanPlanningMode(mode ScanPlanningMode) ScanOption { return func(scan *Scan) { panic("unimplemented: proposed API for #1178") } } ``` ########## catalog/rest/scan_planning.go: ########## @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file is a PROPOSED public API surface for REST server-side scan +// planning (apache/iceberg-go#1178). The bodies are intentionally +// unimplemented; the file exists so the REST surface can be reviewed as Go. +// Endpoint capability discovery (Endpoint, SupportsEndpoint) lands separately +// in the Phase 0 PR and is intentionally not redeclared here. + +package rest + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/apache/iceberg-go/table" +) + +// Compile-time proof that the REST catalog satisfies the table planner seam. +var _ table.ScanPlanner = (*Catalog)(nil) + +// ErrPlanExpired is returned when polling a plan-id the server no longer knows +// about (HTTP 404 while polling), distinct from a table-not-found 404. +var ErrPlanExpired = fmt.Errorf("%w: scan plan expired", ErrRESTError) + +// --- Capability gating (Open Question 2) ------------------------------------ +// +// A single capability check is too coarse: requiring all four endpoints falls +// back to local against sync-only servers, while requiring only the plan +// endpoint false-positives, because planTableScan can return `submitted` or +// `plan-tasks` that need the poll/fetch endpoints. 
The split below lets `auto` +// use a sync-only server while reserving the async/fanout path for servers +// that advertise everything. + +// SupportsPlanTableScan reports whether the server advertised the synchronous +// plan endpoint. +func (c *Catalog) SupportsPlanTableScan() bool { + panic("unimplemented: proposed API for #1178") +} + +// SupportsFullRemoteScanPlanning reports whether the server advertised all four +// scan-planning endpoints (plan, fetch-result, cancel, fetch-tasks). +func (c *Catalog) SupportsFullRemoteScanPlanning() bool { + panic("unimplemented: proposed API for #1178") +} + +// --- table.ScanPlanner implementation --------------------------------------- + +// SupportsRemoteScanPlanning reports whether this catalog can complete a remote +// plan end-to-end; backed by the split capability checks above. +func (c *Catalog) SupportsRemoteScanPlanning() bool { + panic("unimplemented: proposed API for #1178") +} + +// PlanFiles plans a scan server-side and returns tasks (and, optionally, a +// plan-scoped FileIO) for the table to read. +func (c *Catalog) PlanFiles(ctx context.Context, req table.ScanPlanningRequest) (table.ScanPlanningResult, error) { + panic("unimplemented: proposed API for #1178") +} + +// --- Low-level client methods ----------------------------------------------- + +// PlanTableScan submits a scan plan. The result is either completed inline, +// submitted (returns a plan-id to poll), or failed. +func (c *Catalog) PlanTableScan(ctx context.Context, ident table.Identifier, req PlanTableScanRequest) (PlanTableScanResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// FetchPlanningResult polls a previously submitted plan. +func (c *Catalog) FetchPlanningResult(ctx context.Context, ident table.Identifier, planID string) (FetchPlanningResultResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// CancelPlanning cancels a server-side plan. 
Callers should cancel on context +// cancellation using a detached context with a short timeout. +func (c *Catalog) CancelPlanning(ctx context.Context, ident table.Identifier, planID string) error { + panic("unimplemented: proposed API for #1178") +} + +// FetchScanTasks fetches the scan tasks for a plan-task handle returned by a +// completed plan. +func (c *Catalog) FetchScanTasks(ctx context.Context, ident table.Identifier, req FetchScanTasksRequest) (FetchScanTasksResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// WaitForPlan submits and polls a plan to completion using jittered backoff, +// cancelling the server-side plan if the context is cancelled. +func (c *Catalog) WaitForPlan(ctx context.Context, ident table.Identifier, planID string, opts WaitForPlanOptions) (FetchPlanningResultResponse, error) { + panic("unimplemented: proposed API for #1178") +} + +// --- Wire types (sketch) ---------------------------------------------------- +// +// Field-complete request/response decoding (content-file JSON, residuals, +// storage credentials) lands with the scan-task decoder PR; these sketch the +// request/response envelopes so the client surface compiles and reads. + +// PlanStatus is the status of a server-side plan. +type PlanStatus string + +const ( + PlanStatusCompleted PlanStatus = "completed" + PlanStatusSubmitted PlanStatus = "submitted" + PlanStatusCancelled PlanStatus = "cancelled" + PlanStatusFailed PlanStatus = "failed" +) + +// PlanTableScanRequest is the POST .../plan request body. Filter is the +// ExpressionParser-format JSON produced by iceberg.MarshalExpressionJSON. 
+type PlanTableScanRequest struct { + SnapshotID *int64 `json:"snapshot-id,omitempty"` + StartSnapshotID *int64 `json:"start-snapshot-id,omitempty"` + EndSnapshotID *int64 `json:"end-snapshot-id,omitempty"` + Select []string `json:"select,omitempty"` + Filter json.RawMessage `json:"filter,omitempty"` + CaseSensitive *bool `json:"case-sensitive,omitempty"` + UseSnapshotSchema *bool `json:"use-snapshot-schema,omitempty"` +} + +// PlanTableScanResponse is the POST .../plan response envelope. +type PlanTableScanResponse struct { + PlanStatus PlanStatus `json:"plan-status"` + PlanID *string `json:"plan-id,omitempty"` + // file-scan-tasks, delete-files, plan-tasks, storage-credentials decoded + // by the scan-task decoder PR. +} + +// FetchPlanningResultResponse is the GET .../plan/{plan-id} response envelope. +type FetchPlanningResultResponse struct { + PlanStatus PlanStatus `json:"plan-status"` +} + +// FetchScanTasksRequest is the POST .../tasks request body. +type FetchScanTasksRequest struct { + PlanTask string `json:"plan-task"` +} + +// FetchScanTasksResponse is the POST .../tasks response envelope. +type FetchScanTasksResponse struct{} Review Comment: `FetchScanTasksResult` is `allOf[ScanTasks]`, so it must carry `file-scan-tasks`, `delete-files`, and `plan-tasks` — and `FetchScanTasks` is the only retrieval path for the async fanout, so an empty struct makes that path non-functional by construction. It also means `FetchScanTasksRequest.PlanTask` can't be populated from anywhere, since `plan-tasks` is missing from the responses above too. Leaving the decode to the scan-task decoder PR is fine, but the fields need to exist as part of the envelope — even as `json.RawMessage` placeholders — so the shape is reviewable now. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
