cpcloud commented on a change in pull request #10934: URL: https://github.com/apache/arrow/pull/10934#discussion_r689754664
########## File path: format/ComputeIR.fbs ########## @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +include "Schema.fbs"; +include "Message.fbs"; + +namespace org.apache.arrow.flatbuf.computeir; + +/// An expression is one of +/// - a Literal datum +/// - a reference to a Field from a Relation +/// - a call to a named function +/// On evaluation, an Expression will have either array or scalar shape. +union ExpressionImpl { + Literal, FieldRef, Call +} + +table Expression { + impl: ExpressionImpl (required); +} + +union Shape { + ArrayShape, ScalarShape Review comment: Tiny nit: do we need to repeat the `Shape` suffix? ########## File path: format/ComputeIR.fbs ########## @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +include "Schema.fbs"; +include "Message.fbs"; + +namespace org.apache.arrow.flatbuf.computeir; + +/// An expression is one of +/// - a Literal datum +/// - a reference to a Field from a Relation +/// - a call to a named function +/// On evaluation, an Expression will have either array or scalar shape. +union ExpressionImpl { + Literal, FieldRef, Call +} + +table Expression { + impl: ExpressionImpl (required); +} + +union Shape { + ArrayShape, ScalarShape +} + +table ScalarShape {} + +table ArrayShape { + /// Number of slots. + length: long; +} + +table Literal { + /// Shape of this literal. Review comment: Perhaps a short comment about the potentially non-obvious case of `struct`s here might be useful to indicate the full scope of what `Scalar` entails. I can also push up a blurb. ########## File path: format/ComputeIR.fbs ########## @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +include "Schema.fbs"; +include "Message.fbs"; + +namespace org.apache.arrow.flatbuf.computeir; + +/// An expression is one of +/// - a Literal datum +/// - a reference to a Field from a Relation +/// - a call to a named function +/// On evaluation, an Expression will have either array or scalar shape. +union ExpressionImpl { + Literal, FieldRef, Call +} + +table Expression { + impl: ExpressionImpl (required); +} + +union Shape { + ArrayShape, ScalarShape +} + +table ScalarShape {} + +table ArrayShape { + /// Number of slots. + length: long; +} + +table Literal { + /// Shape of this literal. + shape: Shape (required); + + /// The type of this literal. + type: Type (required); + + /// Buffers containing `length` elements of arrow-formatted data. + /// If `length` is absent (this Literal is scalar), these buffers + /// are sized to accommodate a single element of arrow-formatted data. + /// XXX this can be optimized for trivial scalars later + buffers: [Buffer]; +} + +table FieldRef { + /// A sequence of field names to allow referencing potentially nested fields + path: [string]; + + /// For Expressions which might reference fields in multiple Relations, + /// this index may be provided to indicate which Relation's fields + /// `path` points into. For example in the case of a join, + /// 0 refers to the left relation and 1 to the right relation. + relation_index: int; + + /// The type of data in the referenced Field. + type: Type; +} + +table Call { + /// The name of the function whose invocation this Call represents. + function_name: string (required); + + /// Parameters for `function_name`; content/format may be unique to each + /// value of `function_name`. + options: Buffer; + + /// The arguments passed to `function_name`. + arguments: [Expression] (required); + + /// The type of data which invoking `function_name` will return. 
+ type: Type; +} + +/// A relation is a set of rows with consistent schema. +table Relation { + /// The namespaced name of this Relation. + /// + /// Names with no namespace are reserved for pure relational + /// algebraic operations, which currently include: + /// "filter" + /// "project" + /// "aggregate" + /// "join" + /// "order_by" + /// "limit" + /// "common" + /// "union" + /// "literal" + /// "interactive_output" + relation_name: string (required); + + /// Parameters for `relation_name`; content/format may be unique to each + /// value of `relation_name`. + options: Buffer; + + /// The arguments passed to `relation_name`. + arguments: [Relation] (required); + + /// The schema of rows in this Relation + schema: Schema; +} + +/// The contents of Relation.options will be FilterOptions +/// if Relation.relation_name = "filter" +table FilterOptions { + /// The expression which will be evaluated against input rows + /// to determine whether they should be excluded from the + /// "filter" relation's output. + filter_expression: Expression (required); +} + +/// The contents of Relation.options will be ProjectOptions +/// if Relation.relation_name = "project" +table ProjectOptions { + /// Expressions which will be evaluated to produce to + /// the rows of the "project" relation's output. + expressions: [Expression] (required); +} + +/// The contents of Relation.options will be AggregateOptions +/// if Relation.relation_name = "aggregate" +table AggregateOptions { + /// Expressions which will be evaluated to produce to + /// the rows of the "aggregate" relation's output. + aggregations: [Expression] (required); + /// Keys by which `aggregations` will be grouped. + keys: [Expression]; +} + +/// The contents of Relation.options will be JoinOptions +/// if Relation.relation_name = "join" +table JoinOptions { + /// The expression which will be evaluated against rows from each + /// input to determine whether they should be included in the + /// "join" relation's output. 
+ on_expression: Expression (required); + join_kind: string; +} + +/// Whether lesser values should precede greater or vice versa, +/// also whether nulls should preced or follow values. +enum Ordering : uint8 { + ASCENDING_THEN_NULLS, + DESCENDING_THEN_NULLS, + NULLS_THEN_ASCENDING, + NULLS_THEN_DESCENDING +} + +table SortKey { + value: Expression (required); + ordering: Ordering = ASCENDING_THEN_NULLS; +} + +/// The contents of Relation.options will be OrderByOptions +/// if Relation.relation_name = "order_by" +table OrderByOptions { + /// Define sort order for rows of output. + /// Keys with higher precedence are ordered ahead of other keys. + keys: [SortKey] (required); +} + +/// The contents of Relation.options will be LimitOptions +/// if Relation.relation_name = "limit" +table LimitOptions { + /// Set the maximum number of rows of output. + count: long; +} + +/// The contents of Relation.options will be CommonOptions +/// if Relation.relation_name = "common" +table CommonOptions { + /// Commons (CTEs in SQL) allow assigning a name to a stream + /// of data and reusing it, potentially multiple times and + /// potentially recursively. + name: string; +} + +/// The contents of Relation.options will be UnionOptions +/// if Relation.relation_name = "union" +table UnionOptions { + /// For simplicity, all rows from any input to a "union" relation + /// will always be concatenated into a single output- establishing + /// uniqueness of output rows is deferred to other relations. Review comment: I'm not entirely sure we can leave out something like a `unique: boolean;` parameter, since chaining unions with different values of such a parameter is (syntactically) legal: ```sql SELECT 1 UNION ALL SELECT 1 UNION SELECT 1 ``` Maybe there's an algebraic equivalence here that obviates the need for such a parameter, but I don't see it. 
########## File path: format/ComputeIR.fbs ########## @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +include "Schema.fbs"; +include "Message.fbs"; + +namespace org.apache.arrow.flatbuf.computeir; + +/// An expression is one of +/// - a Literal datum +/// - a reference to a Field from a Relation +/// - a call to a named function +/// On evaluation, an Expression will have either array or scalar shape. +union ExpressionImpl { + Literal, FieldRef, Call +} + +table Expression { + impl: ExpressionImpl (required); +} + +union Shape { + ArrayShape, ScalarShape +} + +table ScalarShape {} + +table ArrayShape { + /// Number of slots. + length: long; +} + +table Literal { + /// Shape of this literal. + shape: Shape (required); + + /// The type of this literal. + type: Type (required); + + /// Buffers containing `length` elements of arrow-formatted data. + /// If `length` is absent (this Literal is scalar), these buffers + /// are sized to accommodate a single element of arrow-formatted data. 
+ /// XXX this can be optimized for trivial scalars later + buffers: [Buffer]; +} + +table FieldRef { + /// A sequence of field names to allow referencing potentially nested fields + path: [string]; + + /// For Expressions which might reference fields in multiple Relations, + /// this index may be provided to indicate which Relation's fields + /// `path` points into. For example in the case of a join, + /// 0 refers to the left relation and 1 to the right relation. + relation_index: int; + + /// The type of data in the referenced Field. + type: Type; +} + +table Call { + /// The name of the function whose invocation this Call represents. + function_name: string (required); + + /// Parameters for `function_name`; content/format may be unique to each + /// value of `function_name`. + options: Buffer; + + /// The arguments passed to `function_name`. + arguments: [Expression] (required); + + /// The type of data which invoking `function_name` will return. + type: Type; +} + +/// A relation is a set of rows with consistent schema. +table Relation { + /// The namespaced name of this Relation. + /// + /// Names with no namespace are reserved for pure relational + /// algebraic operations, which currently include: + /// "filter" + /// "project" + /// "aggregate" + /// "join" + /// "order_by" + /// "limit" + /// "common" + /// "union" + /// "literal" + /// "interactive_output" + relation_name: string (required); + + /// Parameters for `relation_name`; content/format may be unique to each + /// value of `relation_name`. + options: Buffer; + + /// The arguments passed to `relation_name`. + arguments: [Relation] (required); + + /// The schema of rows in this Relation + schema: Schema; +} + +/// The contents of Relation.options will be FilterOptions +/// if Relation.relation_name = "filter" +table FilterOptions { + /// The expression which will be evaluated against input rows + /// to determine whether they should be excluded from the + /// "filter" relation's output. 
+ filter_expression: Expression (required); +} + +/// The contents of Relation.options will be ProjectOptions +/// if Relation.relation_name = "project" +table ProjectOptions { + /// Expressions which will be evaluated to produce to + /// the rows of the "project" relation's output. + expressions: [Expression] (required); +} + +/// The contents of Relation.options will be AggregateOptions +/// if Relation.relation_name = "aggregate" +table AggregateOptions { + /// Expressions which will be evaluated to produce to + /// the rows of the "aggregate" relation's output. + aggregations: [Expression] (required); + /// Keys by which `aggregations` will be grouped. + keys: [Expression]; +} + +/// The contents of Relation.options will be JoinOptions +/// if Relation.relation_name = "join" +table JoinOptions { + /// The expression which will be evaluated against rows from each + /// input to determine whether they should be included in the + /// "join" relation's output. + on_expression: Expression (required); + join_kind: string; +} + +/// Whether lesser values should precede greater or vice versa, +/// also whether nulls should preced or follow values. +enum Ordering : uint8 { + ASCENDING_THEN_NULLS, + DESCENDING_THEN_NULLS, + NULLS_THEN_ASCENDING, + NULLS_THEN_DESCENDING +} + +table SortKey { + value: Expression (required); + ordering: Ordering = ASCENDING_THEN_NULLS; +} + +/// The contents of Relation.options will be OrderByOptions +/// if Relation.relation_name = "order_by" +table OrderByOptions { + /// Define sort order for rows of output. + /// Keys with higher precedence are ordered ahead of other keys. + keys: [SortKey] (required); +} + +/// The contents of Relation.options will be LimitOptions +/// if Relation.relation_name = "limit" +table LimitOptions { + /// Set the maximum number of rows of output. 
+ count: long; +} + +/// The contents of Relation.options will be CommonOptions +/// if Relation.relation_name = "common" +table CommonOptions { + /// Commons (CTEs in SQL) allow assigning a name to a stream + /// of data and reusing it, potentially multiple times and + /// potentially recursively. + name: string; +} + +/// The contents of Relation.options will be UnionOptions +/// if Relation.relation_name = "union" +table UnionOptions { + /// For simplicity, all rows from any input to a "union" relation + /// will always be concatenated into a single output- establishing + /// uniqueness of output rows is deferred to other relations. +} + +/// The contents of Relation.options will be LiteralOptions +/// if Relation.relation_name = "literal" +table LiteralOptions { + /// Batches of rows in this literal. + batches: [RecordBatch] (required); +} + +/// A specification of a query. +table Plan { + /// One or more output relations. + sinks: [Relation] (required); + + /// Dictionary batches which may be referenced by Literals. + dictionary_batch: [DictionaryBatch]; Review comment: Can you elaborate a tiny bit on why dictionary batches are in `Plan` and not in `LiteralOptions`? ########## File path: docs/source/format/ComputeIR.rst ########## @@ -0,0 +1,59 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. 
See the License for the +.. specific language governing permissions and limitations +.. under the License. + +********************************************** +Arrow Compute IR (Intermediate Representation) +********************************************** + +In the same way that the Arrow format provides a powerful tool +for communicating data, Compute IR is intended to provide a +consistent format for representing analytical operations against +that data. As an arrow-native expression of computation it includes +information such as explicit types and schemas and arrow formatted +literal data. It is also optimized for low runtime overhead in both +serialization and deserialization. + +Built-in definitions are included to enable representation of +relational algebraic operations- the contents of a "logical query plan". +Compute IR also has first class support for representing operations +which are not members of a minimal relational algebra, including +implementation and optimization details- the contents of a "physical +query plan". This approach is taken in emulation of `MLIR`_ (Multi-Level +Intermediate Representation), a system which has had strong successes in +spaces of comparable complexity to representation of analytic operations. +To borrow terms from that project, there are two mutations of interest: + +* Replacement of representations with semantically equivalent representations + which will yield better performance for consumers- an optimization pass. +* Replacement of abstract or generic representations with more specific + and potentially consumer-specific representations- a lowering pass. + This modification corresponds to the translation of a logical plan + to a physical plan. + +Allowing representation of physical plans (and plans which are between +logical and physical) in Compute IR enables systems to define incremental +optimization and lowering passes which operate on and produce valid +Compute IR. 
This in turn enables communication, manipulation, and inspection +at every stage of lowering/optimization by the same tools +used for logical-plan-equivalent-IR. This is especially useful for systems +where such passes may depend on information only available on every node +of a distributed consumer (for example statistics unique to that node's +local data) or may not be universal to all backends in a heterogeneous +consumer (for example which optimizations nodes are capable of for +non equi joins). Review comment: Great summary. ########## File path: format/ComputeIR.fbs ########## @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +include "Schema.fbs"; +include "Message.fbs"; + +namespace org.apache.arrow.flatbuf.computeir; + +/// An expression is one of +/// - a Literal datum +/// - a reference to a Field from a Relation +/// - a call to a named function +/// On evaluation, an Expression will have either array or scalar shape. +union ExpressionImpl { + Literal, FieldRef, Call +} + +table Expression { + impl: ExpressionImpl (required); +} + +union Shape { + ArrayShape, ScalarShape +} + +table ScalarShape {} + +table ArrayShape { + /// Number of slots. 
+ length: long; +} + +table Literal { + /// Shape of this literal. + shape: Shape (required); + + /// The type of this literal. + type: Type (required); + + /// Buffers containing `length` elements of arrow-formatted data. + /// If `length` is absent (this Literal is scalar), these buffers + /// are sized to accommodate a single element of arrow-formatted data. + /// XXX this can be optimized for trivial scalars later + buffers: [Buffer]; +} + +table FieldRef { + /// A sequence of field names to allow referencing potentially nested fields + path: [string]; + + /// For Expressions which might reference fields in multiple Relations, + /// this index may be provided to indicate which Relation's fields + /// `path` points into. For example in the case of a join, + /// 0 refers to the left relation and 1 to the right relation. + relation_index: int; + + /// The type of data in the referenced Field. + type: Type; +} + +table Call { + /// The name of the function whose invocation this Call represents. + function_name: string (required); + + /// Parameters for `function_name`; content/format may be unique to each + /// value of `function_name`. + options: Buffer; Review comment: Are `Buffer`s meaningfully different from `[ubyte]`? ########## File path: format/ComputeIR.fbs ########## @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +include "Schema.fbs"; +include "Message.fbs"; + +namespace org.apache.arrow.flatbuf.computeir; + +/// An expression is one of +/// - a Literal datum +/// - a reference to a Field from a Relation +/// - a call to a named function +/// On evaluation, an Expression will have either array or scalar shape. +union ExpressionImpl { + Literal, FieldRef, Call +} + +table Expression { + impl: ExpressionImpl (required); +} + +union Shape { + ArrayShape, ScalarShape +} + +table ScalarShape {} + +table ArrayShape { + /// Number of slots. + length: long; +} + +table Literal { + /// Shape of this literal. + shape: Shape (required); + + /// The type of this literal. + type: Type (required); + + /// Buffers containing `length` elements of arrow-formatted data. + /// If `length` is absent (this Literal is scalar), these buffers + /// are sized to accommodate a single element of arrow-formatted data. + /// XXX this can be optimized for trivial scalars later + buffers: [Buffer]; +} + +table FieldRef { + /// A sequence of field names to allow referencing potentially nested fields + path: [string]; + + /// For Expressions which might reference fields in multiple Relations, + /// this index may be provided to indicate which Relation's fields + /// `path` points into. For example in the case of a join, + /// 0 refers to the left relation and 1 to the right relation. + relation_index: int; + + /// The type of data in the referenced Field. + type: Type; +} + +table Call { + /// The name of the function whose invocation this Call represents. 
+ function_name: string (required); + + /// Parameters for `function_name`; content/format may be unique to each + /// value of `function_name`. + options: Buffer; + + /// The arguments passed to `function_name`. + arguments: [Expression] (required); + + /// The type of data which invoking `function_name` will return. + type: Type; +} + +/// A relation is a set of rows with consistent schema. +table Relation { + /// The namespaced name of this Relation. + /// + /// Names with no namespace are reserved for pure relational + /// algebraic operations, which currently include: + /// "filter" + /// "project" + /// "aggregate" + /// "join" + /// "order_by" + /// "limit" + /// "common" + /// "union" + /// "literal" + /// "interactive_output" + relation_name: string (required); + + /// Parameters for `relation_name`; content/format may be unique to each + /// value of `relation_name`. + options: Buffer; + + /// The arguments passed to `relation_name`. + arguments: [Relation] (required); + + /// The schema of rows in this Relation + schema: Schema; +} + +/// The contents of Relation.options will be FilterOptions +/// if Relation.relation_name = "filter" +table FilterOptions { + /// The expression which will be evaluated against input rows + /// to determine whether they should be excluded from the + /// "filter" relation's output. + filter_expression: Expression (required); +} + +/// The contents of Relation.options will be ProjectOptions +/// if Relation.relation_name = "project" +table ProjectOptions { + /// Expressions which will be evaluated to produce to + /// the rows of the "project" relation's output. + expressions: [Expression] (required); +} + +/// The contents of Relation.options will be AggregateOptions +/// if Relation.relation_name = "aggregate" +table AggregateOptions { + /// Expressions which will be evaluated to produce to + /// the rows of the "aggregate" relation's output. 
+ aggregations: [Expression] (required); + /// Keys by which `aggregations` will be grouped. + keys: [Expression]; +} + +/// The contents of Relation.options will be JoinOptions +/// if Relation.relation_name = "join" +table JoinOptions { + /// The expression which will be evaluated against rows from each + /// input to determine whether they should be included in the + /// "join" relation's output. + on_expression: Expression (required); + join_kind: string; +} + +/// Whether lesser values should precede greater or vice versa, +/// also whether nulls should preced or follow values. +enum Ordering : uint8 { + ASCENDING_THEN_NULLS, + DESCENDING_THEN_NULLS, + NULLS_THEN_ASCENDING, + NULLS_THEN_DESCENDING +} + +table SortKey { + value: Expression (required); + ordering: Ordering = ASCENDING_THEN_NULLS; +} + +/// The contents of Relation.options will be OrderByOptions +/// if Relation.relation_name = "order_by" +table OrderByOptions { + /// Define sort order for rows of output. + /// Keys with higher precedence are ordered ahead of other keys. + keys: [SortKey] (required); +} + +/// The contents of Relation.options will be LimitOptions +/// if Relation.relation_name = "limit" +table LimitOptions { + /// Set the maximum number of rows of output. + count: long; +} + +/// The contents of Relation.options will be CommonOptions +/// if Relation.relation_name = "common" +table CommonOptions { + /// Commons (CTEs in SQL) allow assigning a name to a stream + /// of data and reusing it, potentially multiple times and + /// potentially recursively. + name: string; Review comment: Should this be `(required)`? What does it mean for a CTE or alias to lack a name? ########## File path: format/ComputeIR.fbs ########## @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +include "Schema.fbs"; +include "Message.fbs"; + +namespace org.apache.arrow.flatbuf.computeir; + +/// An expression is one of +/// - a Literal datum +/// - a reference to a Field from a Relation +/// - a call to a named function +/// On evaluation, an Expression will have either array or scalar shape. +union ExpressionImpl { + Literal, FieldRef, Call +} + +table Expression { + impl: ExpressionImpl (required); +} + +union Shape { + ArrayShape, ScalarShape +} + +table ScalarShape {} + +table ArrayShape { + /// Number of slots. + length: long; +} + +table Literal { + /// Shape of this literal. + shape: Shape (required); + + /// The type of this literal. + type: Type (required); + + /// Buffers containing `length` elements of arrow-formatted data. + /// If `length` is absent (this Literal is scalar), these buffers + /// are sized to accommodate a single element of arrow-formatted data. + /// XXX this can be optimized for trivial scalars later + buffers: [Buffer]; +} + +table FieldRef { + /// A sequence of field names to allow referencing potentially nested fields + path: [string]; + + /// For Expressions which might reference fields in multiple Relations, + /// this index may be provided to indicate which Relation's fields + /// `path` points into. For example in the case of a join, + /// 0 refers to the left relation and 1 to the right relation. 
+ relation_index: int; + + /// The type of data in the referenced Field. + type: Type; +} + +table Call { + /// The name of the function whose invocation this Call represents. + function_name: string (required); + + /// Parameters for `function_name`; content/format may be unique to each + /// value of `function_name`. + options: Buffer; + + /// The arguments passed to `function_name`. + arguments: [Expression] (required); + + /// The type of data which invoking `function_name` will return. + type: Type; +} + +/// A relation is a set of rows with consistent schema. +table Relation { + /// The namespaced name of this Relation. + /// + /// Names with no namespace are reserved for pure relational + /// algebraic operations, which currently include: + /// "filter" + /// "project" + /// "aggregate" + /// "join" + /// "order_by" + /// "limit" + /// "common" + /// "union" + /// "literal" + /// "interactive_output" + relation_name: string (required); + + /// Parameters for `relation_name`; content/format may be unique to each + /// value of `relation_name`. + options: Buffer; + + /// The arguments passed to `relation_name`. + arguments: [Relation] (required); + + /// The schema of rows in this Relation + schema: Schema; +} + +/// The contents of Relation.options will be FilterOptions +/// if Relation.relation_name = "filter" +table FilterOptions { + /// The expression which will be evaluated against input rows + /// to determine whether they should be excluded from the + /// "filter" relation's output. + filter_expression: Expression (required); +} + +/// The contents of Relation.options will be ProjectOptions +/// if Relation.relation_name = "project" +table ProjectOptions { + /// Expressions which will be evaluated to produce to + /// the rows of the "project" relation's output. 
+ expressions: [Expression] (required); +} + +/// The contents of Relation.options will be AggregateOptions +/// if Relation.relation_name = "aggregate" +table AggregateOptions { + /// Expressions which will be evaluated to produce + /// the rows of the "aggregate" relation's output. + aggregations: [Expression] (required); + /// Keys by which `aggregations` will be grouped. + keys: [Expression]; +} + +/// The contents of Relation.options will be JoinOptions +/// if Relation.relation_name = "join" +table JoinOptions { + /// The expression which will be evaluated against rows from each + /// input to determine whether they should be included in the + /// "join" relation's output. + on_expression: Expression (required); + join_kind: string; +} + +/// Whether lesser values should precede greater or vice versa, +/// also whether nulls should precede or follow values. +enum Ordering : uint8 { + ASCENDING_THEN_NULLS, + DESCENDING_THEN_NULLS, + NULLS_THEN_ASCENDING, + NULLS_THEN_DESCENDING Review comment: What was the rationale for combining these into one enum? ########## File path: format/ComputeIR.fbs ########## @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +include "Schema.fbs"; +include "Message.fbs"; + +namespace org.apache.arrow.flatbuf.computeir; + +/// An expression is one of +/// - a Literal datum +/// - a reference to a Field from a Relation +/// - a call to a named function +/// On evaluation, an Expression will have either array or scalar shape. +union ExpressionImpl { + Literal, FieldRef, Call +} + +table Expression { + impl: ExpressionImpl (required); +} + +union Shape { + ArrayShape, ScalarShape +} + +table ScalarShape {} + +table ArrayShape { + /// Number of slots. + length: long; +} + +table Literal { + /// Shape of this literal. + shape: Shape (required); + + /// The type of this literal. + type: Type (required); + + /// Buffers containing `length` elements of arrow-formatted data. + /// If `length` is absent (this Literal is scalar), these buffers + /// are sized to accommodate a single element of arrow-formatted data. + /// XXX this can be optimized for trivial scalars later + buffers: [Buffer]; +} + +table FieldRef { + /// A sequence of field names to allow referencing potentially nested fields + path: [string]; + + /// For Expressions which might reference fields in multiple Relations, + /// this index may be provided to indicate which Relation's fields + /// `path` points into. For example in the case of a join, + /// 0 refers to the left relation and 1 to the right relation. + relation_index: int; + + /// The type of data in the referenced Field. + type: Type; +} + +table Call { + /// The name of the function whose invocation this Call represents. + function_name: string (required); + + /// Parameters for `function_name`; content/format may be unique to each + /// value of `function_name`. + options: Buffer; + + /// The arguments passed to `function_name`. + arguments: [Expression] (required); + + /// The type of data which invoking `function_name` will return. + type: Type; +} + +/// A relation is a set of rows with consistent schema. 
+table Relation { + /// The namespaced name of this Relation. + /// + /// Names with no namespace are reserved for pure relational + /// algebraic operations, which currently include: + /// "filter" + /// "project" + /// "aggregate" + /// "join" + /// "order_by" + /// "limit" + /// "common" + /// "union" + /// "literal" + /// "interactive_output" + relation_name: string (required); + + /// Parameters for `relation_name`; content/format may be unique to each + /// value of `relation_name`. + options: Buffer; + + /// The arguments passed to `relation_name`. + arguments: [Relation] (required); + + /// The schema of rows in this Relation + schema: Schema; +} + +/// The contents of Relation.options will be FilterOptions +/// if Relation.relation_name = "filter" +table FilterOptions { + /// The expression which will be evaluated against input rows + /// to determine whether they should be excluded from the + /// "filter" relation's output. + filter_expression: Expression (required); +} + +/// The contents of Relation.options will be ProjectOptions +/// if Relation.relation_name = "project" +table ProjectOptions { + /// Expressions which will be evaluated to produce + /// the rows of the "project" relation's output. + expressions: [Expression] (required); +} + +/// The contents of Relation.options will be AggregateOptions +/// if Relation.relation_name = "aggregate" +table AggregateOptions { + /// Expressions which will be evaluated to produce + /// the rows of the "aggregate" relation's output. + aggregations: [Expression] (required); + /// Keys by which `aggregations` will be grouped. + keys: [Expression]; +} + +/// The contents of Relation.options will be JoinOptions +/// if Relation.relation_name = "join" +table JoinOptions { + /// The expression which will be evaluated against rows from each + /// input to determine whether they should be included in the + /// "join" relation's output.
+ on_expression: Expression (required); + join_kind: string; Review comment: :( People are gonna find ways to invent new names for joins that we won't think of. ########## File path: format/ComputeIR.fbs ########## @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +include "Schema.fbs"; +include "Message.fbs"; + +namespace org.apache.arrow.flatbuf.computeir; + +/// An expression is one of +/// - a Literal datum +/// - a reference to a Field from a Relation +/// - a call to a named function +/// On evaluation, an Expression will have either array or scalar shape. +union ExpressionImpl { + Literal, FieldRef, Call +} + +table Expression { + impl: ExpressionImpl (required); +} + +union Shape { + ArrayShape, ScalarShape +} + +table ScalarShape {} + +table ArrayShape { + /// Number of slots. + length: long; +} + +table Literal { + /// Shape of this literal. + shape: Shape (required); + + /// The type of this literal. + type: Type (required); + + /// Buffers containing `length` elements of arrow-formatted data. + /// If `length` is absent (this Literal is scalar), these buffers + /// are sized to accommodate a single element of arrow-formatted data. 
+ /// XXX this can be optimized for trivial scalars later + buffers: [Buffer]; +} + +table FieldRef { + /// A sequence of field names to allow referencing potentially nested fields + path: [string]; + + /// For Expressions which might reference fields in multiple Relations, + /// this index may be provided to indicate which Relation's fields + /// `path` points into. For example in the case of a join, + /// 0 refers to the left relation and 1 to the right relation. + relation_index: int; + + /// The type of data in the referenced Field. + type: Type; +} + +table Call { + /// The name of the function whose invocation this Call represents. + function_name: string (required); + + /// Parameters for `function_name`; content/format may be unique to each + /// value of `function_name`. + options: Buffer; + + /// The arguments passed to `function_name`. + arguments: [Expression] (required); + + /// The type of data which invoking `function_name` will return. + type: Type; +} + +/// A relation is a set of rows with consistent schema. +table Relation { + /// The namespaced name of this Relation. + /// + /// Names with no namespace are reserved for pure relational + /// algebraic operations, which currently include: + /// "filter" + /// "project" + /// "aggregate" + /// "join" + /// "order_by" + /// "limit" + /// "common" + /// "union" + /// "literal" + /// "interactive_output" + relation_name: string (required); + + /// Parameters for `relation_name`; content/format may be unique to each + /// value of `relation_name`. + options: Buffer; + + /// The arguments passed to `relation_name`. + arguments: [Relation] (required); + + /// The schema of rows in this Relation + schema: Schema; +} + +/// The contents of Relation.options will be FilterOptions +/// if Relation.relation_name = "filter" +table FilterOptions { + /// The expression which will be evaluated against input rows + /// to determine whether they should be excluded from the + /// "filter" relation's output. 
+ filter_expression: Expression (required); +} + +/// The contents of Relation.options will be ProjectOptions +/// if Relation.relation_name = "project" +table ProjectOptions { + /// Expressions which will be evaluated to produce + /// the rows of the "project" relation's output. + expressions: [Expression] (required); +} + +/// The contents of Relation.options will be AggregateOptions +/// if Relation.relation_name = "aggregate" +table AggregateOptions { + /// Expressions which will be evaluated to produce + /// the rows of the "aggregate" relation's output. + aggregations: [Expression] (required); + /// Keys by which `aggregations` will be grouped. + keys: [Expression]; Review comment: Should this be required? There's no meaningful semantic difference between a missing `keys` and an empty `keys` as far as I can tell. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
