This is an automated email from the ASF dual-hosted git repository.

etudenhoefner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-docs.git


The following commit(s) were added to refs/heads/main by this push:
     new 02496a65 Common docs for 1.4.0 (#276)
02496a65 is described below

commit 02496a65c9a23ec862563d4f34e6b5b7390c5bbd
Author: Anton Okolnychyi <[email protected]>
AuthorDate: Thu Oct 5 23:19:08 2023 -0700

    Common docs for 1.4.0 (#276)
---
 landing-page/content/common/spec.md      | 35 ++++++++++++++++
 landing-page/content/common/view-spec.md | 72 +++++++++++++++-----------------
 2 files changed, 69 insertions(+), 38 deletions(-)

diff --git a/landing-page/content/common/spec.md b/landing-page/content/common/spec.md
index 58cfc229..60c0f99c 100644
--- a/landing-page/content/common/spec.md
+++ b/landing-page/content/common/spec.md
@@ -1128,6 +1128,41 @@ Example
      ] } ]
 ```
 
+### Content File (Data and Delete) Serialization
+
+A content file (data or delete) is serialized as a JSON object according to the following table.
+
+| Metadata field           |JSON representation|Example|
+|--------------------------|--- |--- |
+| **`spec-id`**            |`JSON int`|`1`|
+| **`content`**            |`JSON string`|`DATA`, `POSITION_DELETES`, `EQUALITY_DELETES`|
+| **`file-path`**          |`JSON string`|`"s3://b/wh/data.db/table"`|
+| **`file-format`**        |`JSON string`|`AVRO`, `ORC`, `PARQUET`|
+| **`partition`**          |`JSON object: Partition data tuple using partition field ids for the struct field ids`|`{"1000":1}`|
+| **`record-count`**       |`JSON long`|`1`|
+| **`file-size-in-bytes`** |`JSON long`|`1024`|
+| **`column-sizes`**       |`JSON object: Map from column id to the total size on disk of all regions that store the column.`|`{"keys":[3,4],"values":[100,200]}`|
+| **`value-counts`**       |`JSON object: Map from column id to number of values in the column (including null and NaN values)`|`{"keys":[3,4],"values":[90,180]}`|
+| **`null-value-counts`**  |`JSON object: Map from column id to number of null values in the column`|`{"keys":[3,4],"values":[10,20]}`|
+| **`nan-value-counts`**   |`JSON object: Map from column id to number of NaN values in the column`|`{"keys":[3,4],"values":[0,0]}`|
+| **`lower-bounds`**       |`JSON object: Map from column id to lower bound binary in the column serialized as hexadecimal string`|`{"keys":[3,4],"values":["01000000","02000000"]}`|
+| **`upper-bounds`**       |`JSON object: Map from column id to upper bound binary in the column serialized as hexadecimal string`|`{"keys":[3,4],"values":["05000000","0A000000"]}`|
+| **`key-metadata`**       |`JSON string: Encryption key metadata binary serialized as hexadecimal string`|`00000000000000000000000000000000`|
+| **`split-offsets`**      |`JSON list of long: Split offsets for the data file`|`[128,256]`|
+| **`equality-ids`**       |`JSON list of int: Field ids used to determine row equality in equality delete files`|`[1]`|
+| **`sort-order-id`**      |`JSON int`|`1`|
+
+### File Scan Task Serialization
+
+A file scan task is serialized as a JSON object according to the following table.
+
+| Metadata field       |JSON representation|Example|
+|--------------------------|--- |--- |
+| **`schema`**          |`JSON object`|`See above, read schemas instead`|
+| **`spec`**            |`JSON object`|`See above, read partition specs instead`|
+| **`data-file`**       |`JSON object`|`See above, read content file instead`|
+| **`delete-files`**    |`JSON list of objects`|`See above, read content file instead`|
+| **`residual-filter`** |`JSON object: residual filter expression`|`{"type":"eq","term":"id","value":1}`|
 
 ## Appendix D: Single-value serialization
 
diff --git a/landing-page/content/common/view-spec.md b/landing-page/content/common/view-spec.md
index a9826a32..26313193 100644
--- a/landing-page/content/common/view-spec.md
+++ b/landing-page/content/common/view-spec.md
@@ -58,9 +58,9 @@ The view version metadata file has the following fields:
 
 | Requirement | Field name           | Description |
 |-------------|----------------------|-------------|
+| _required_  | `view-uuid`          | A UUID that identifies the view, generated when the view is created. Implementations must throw an exception if a view's UUID does not match the expected UUID after refreshing metadata |
 | _required_  | `format-version`     | An integer version number for the view 
format; must be 1 |
 | _required_  | `location`           | The view's base location; used to 
create metadata file locations |
-| _required_  | `current-schema-id`  | ID of the current schema of the view, 
if known |
 | _required_  | `schemas`            | A list of known schemas |
 | _required_  | `current-version-id` | ID of the current version of the view 
(`version-id`) |
 | _required_  | `versions`           | A list of known [versions](#versions) 
of the view [1] |
@@ -75,13 +75,17 @@ Notes:
 
 Each version in `versions` is a struct with the following fields:
 
-| Requirement | Field name        | Description                                
                              |
-|-------------|-------------------|--------------------------------------------------------------------------|
-| _required_  | `version-id`      | ID for the version                         
                              |
-| _required_  | `schema-id`       | ID of the schema for the view version      
                              |
-| _required_  | `timestamp-ms`    | Timestamp when the version was created (ms 
from epoch)                   |
-| _required_  | `summary`         | A string to string map of [summary 
metadata](#summary) about the version |
-| _required_  | `representations` | A list of 
[representations](#representations) for the view definition    |
+| Requirement | Field name          | Description                              
                                     |
+|-------------|---------------------|-------------------------------------------------------------------------------|
+| _required_  | `version-id`        | ID for the version                       
                                     |
+| _required_  | `schema-id`         | ID of the schema for the view version    
                                     |
+| _required_  | `timestamp-ms`      | Timestamp when the version was created 
(ms from epoch)                        |
+| _required_  | `summary`           | A string to string map of [summary 
metadata](#summary) about the version      |
+| _required_  | `representations`   | A list of 
[representations](#representations) for the view definition         |
+| _optional_  | `default-catalog`   | Catalog name to use when a reference in 
the SELECT does not contain a catalog |
+| _required_  | `default-namespace` | Namespace to use when a reference in the 
SELECT is a single identifier        |
+
+When `default-catalog` is `null` or not set, the catalog in which the view is stored must be used as the default catalog.
 
 #### Summary
 
@@ -117,10 +121,6 @@ A view version can have multiple SQL representations of 
different dialects, but
 | _required_  | `type`              | `string`       | Must be `sql` |
 | _required_  | `sql`               | `string`       | A SQL SELECT statement |
 | _required_  | `dialect`           | `string`       | The dialect of the 
`sql` SELECT statement (e.g., "trino" or "spark") |
-| _optional_  | `default-catalog`   | `string`       | Catalog name to use 
when a reference in the SELECT does not contain a catalog |
-| _optional_  | `default-namespace` | `list<string>` | Namespace to use when a 
reference in the SELECT is a single identifier |
-| _optional_  | `field-aliases`     | `list<string>` | Column names optionally 
specified in the create statement |
-| _optional_  | `field-comments`    | `list<string>` | Column descriptions 
(COMMENT) optionally specified in the create statement |
 
 For example:
 
@@ -144,13 +144,11 @@ This create statement would produce the following `sql` 
representation metadata:
 | `type`              | `"sql"` |
 | `sql`               | `"SELECT\n    COUNT(1), CAST(event_ts AS DATE)\nFROM 
events\nGROUP BY 2"` |
 | `dialect`           | `"spark"` |
-| `default-catalog`   | `"prod"` |
-| `default-namespace` | `["default"]` |
-| `field-aliases`     | `["event_count", "event_date"]` |
-| `field-comments`    | `["Count of events", null]` |
 
 If a create statement does not include column names or comments before `AS`, 
the fields should be omitted.
 
+The `event_count` (with the `Count of events` comment) and `event_date` field aliases must be part of the view version's `schema`.
+
 #### Version log
 
 The version log tracks changes to the view's current version. This is the 
view's history and allows reconstructing what version of the view would have 
been used at some point in time.
@@ -195,6 +193,7 @@ 
s3://bucket/warehouse/default.db/event_agg/metadata/00001-(uuid).metadata.json
 ```
 ```
 {
+  "view-uuid": "fa6506c3-7681-40c8-86dc-e36561f83385",
   "format-version" : 1,
   "location" : "s3://bucket/warehouse/default.db/event_agg",
   "current-version-id" : 1,
@@ -205,6 +204,8 @@ 
s3://bucket/warehouse/default.db/event_agg/metadata/00001-(uuid).metadata.json
     "version-id" : 1,
     "timestamp-ms" : 1573518431292,
     "schema-id" : 1,
+    "default-catalog" : "prod",
+    "default-namespace" : [ "default" ],
     "summary" : {
       "operation" : "create",
       "engine-name" : "Spark",
@@ -213,25 +214,21 @@ 
s3://bucket/warehouse/default.db/event_agg/metadata/00001-(uuid).metadata.json
     "representations" : [ {
       "type" : "sql",
       "sql" : "SELECT\n    COUNT(1), CAST(event_ts AS DATE)\nFROM 
events\nGROUP BY 2",
-      "dialect" : "spark",
-      "default-catalog" : "prod",
-      "default-namespace" : [ "default" ],
-      "field-aliases" : ["event_count", "event_date"],
-      "field-comments" : ["Count of events", null]
+      "dialect" : "spark"
     } ]
   } ],
-  "current-schema-id": 1,
   "schemas": [ {
     "schema-id": 1,
     "type" : "struct",
     "fields" : [ {
       "id" : 1,
-      "name" : "col1",
+      "name" : "event_count",
       "required" : false,
-      "type" : "int"
+      "type" : "int",
+      "doc" : "Count of events"
     }, {
       "id" : 2,
-      "name" : "col2",
+      "name" : "event_date",
       "required" : false,
       "type" : "date"
     } ]
@@ -264,6 +261,7 @@ 
s3://bucket/warehouse/default.db/event_agg/metadata/00002-(uuid).metadata.json
 ```
 ```
 {
+  "view-uuid": "fa6506c3-7681-40c8-86dc-e36561f83385",
   "format-version" : 1,
   "location" : "s3://bucket/warehouse/default.db/event_agg",
   "current-version-id" : 1,
@@ -274,6 +272,8 @@ 
s3://bucket/warehouse/default.db/event_agg/metadata/00002-(uuid).metadata.json
     "version-id" : 1,
     "timestamp-ms" : 1573518431292,
     "schema-id" : 1,
+    "default-catalog" : "prod",
+    "default-namespace" : [ "default" ],
     "summary" : {
       "operation" : "create",
       "engine-name" : "Spark",
@@ -282,15 +282,14 @@ 
s3://bucket/warehouse/default.db/event_agg/metadata/00002-(uuid).metadata.json
     "representations" : [ {
       "type" : "sql",
       "sql" : "SELECT\n    COUNT(1), CAST(event_ts AS DATE)\nFROM 
events\nGROUP BY 2",
-      "dialect" : "spark",
-      "default-catalog" : "prod",
-      "default-namespace" : [ "default" ],
-      "field-aliases" : ["event_count", "event_date"],
-      "field-comments" : ["Count of events", null]
+      "dialect" : "spark"
     } ]
   }, {
     "version-id" : 2,
     "timestamp-ms" : 1573518981593,
+    "schema-id" : 1,
+    "default-catalog" : "prod",
+    "default-namespace" : [ "default" ],
     "summary" : {
       "operation" : "create",
       "engine-name" : "Spark",
@@ -299,24 +298,21 @@ 
s3://bucket/warehouse/default.db/event_agg/metadata/00002-(uuid).metadata.json
     "representations" : [ {
       "type" : "sql",
       "sql" : "SELECT\n    COUNT(1), CAST(event_ts AS DATE)\nFROM 
prod.default.events\nGROUP BY 2",
-      "dialect" : "spark",
-      "default-catalog" : "prod",
-      "default-namespace" : [ "default" ],
-      "field-aliases" : ["event_count", "event_date"]
+      "dialect" : "spark"
     } ]
   } ],
-  "current-schema-id": 1,
   "schemas": [ {
     "schema-id": 1,
     "type" : "struct",
     "fields" : [ {
       "id" : 1,
-      "name" : "col1",
+      "name" : "event_count",
       "required" : false,
-      "type" : "int"
+      "type" : "int",
+      "doc" : "Count of events"
     }, {
       "id" : 2,
-      "name" : "col2",
+      "name" : "event_date",
       "required" : false,
       "type" : "date"
     } ]

Reply via email to