http://git-wip-us.apache.org/repos/asf/calcite/blob/5cee486f/avatica/site/_docs/reference.md ---------------------------------------------------------------------- diff --git a/avatica/site/_docs/reference.md b/avatica/site/_docs/reference.md new file mode 100644 index 0000000..7bc9bc3 --- /dev/null +++ b/avatica/site/_docs/reference.md @@ -0,0 +1,1248 @@ +--- +layout: docs +title: SQL language +permalink: /docs/reference.html +--- +<!-- +{% comment %} +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to you under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +{% endcomment %} +--> + +The page describes the SQL dialect recognized by Calcite's default SQL parser. + +## Grammar + +SQL grammar in [BNF](http://en.wikipedia.org/wiki/Backus%E2%80%93Naur_Form)-like +form. 
+ +{% highlight sql %} +statement: + setStatement + | resetStatement + | explain + | insert + | update + | merge + | delete + | query + +setStatement: + [ ALTER ( SYSTEM | SESSION ) ] SET identifier '=' expression + +resetStatement: + [ ALTER ( SYSTEM | SESSION ) ] RESET identifier + | [ ALTER ( SYSTEM | SESSION ) ] RESET ALL + +explain: + EXPLAIN PLAN + [ WITH TYPE | WITH IMPLEMENTATION | WITHOUT IMPLEMENTATION ] + [ EXCLUDING ATTRIBUTES | INCLUDING [ ALL ] ATTRIBUTES ] + FOR ( insert | update | merge | delete | query ) + +insert: + ( INSERT | UPSERT ) INTO tablePrimary + [ '(' column [, column ]* ')' ] + query + +update: + UPDATE tablePrimary + SET assign [, assign ]* + [ WHERE booleanExpression ] + +assign: + identifier '=' expression + +merge: + MERGE INTO tablePrimary [ [ AS ] alias ] + USING tablePrimary + ON booleanExpression + [ WHEN MATCHED THEN UPDATE SET assign [, assign ]* ] + [ WHEN NOT MATCHED THEN INSERT VALUES '(' value [ , value ]* ')' ] + +delete: + DELETE FROM tablePrimary [ [ AS ] alias ] + [ WHERE booleanExpression ] + +query: + [ WITH withItem [ , withItem ]* query ] + | { + select + | query UNION [ ALL ] query + | query EXCEPT query + | query INTERSECT query + } + [ ORDER BY orderItem [, orderItem ]* ] + [ LIMIT { count | ALL } ] + [ OFFSET start { ROW | ROWS } ] + [ FETCH { FIRST | NEXT } [ count ] { ROW | ROWS } ] + +withItem: + name + [ '(' column [, column ]* ')' ] + AS '(' query ')' + +orderItem: + expression [ ASC | DESC ] [ NULLS FIRST | NULLS LAST ] + +select: + SELECT [ STREAM ] [ ALL | DISTINCT ] + { * | projectItem [, projectItem ]* } + FROM tableExpression + [ WHERE booleanExpression ] + [ GROUP BY { groupItem [, groupItem ]* } ] + [ HAVING booleanExpression ] + [ WINDOW windowName AS windowSpec [, windowName AS windowSpec ]* ] + +projectItem: + expression [ [ AS ] columnAlias ] + | tableAlias . 
* + +tableExpression: + tableReference [, tableReference ]* + | tableExpression [ NATURAL ] [ LEFT | RIGHT | FULL ] JOIN tableExpression [ joinCondition ] + +joinCondition: + ON booleanExpression + | USING '(' column [, column ]* ')' + +tableReference: + [ LATERAL ] + tablePrimary + [ [ AS ] alias [ '(' columnAlias [, columnAlias ]* ')' ] ] + +tablePrimary: + [ TABLE ] [ [ catalogName . ] schemaName . ] tableName + | '(' query ')' + | values + | UNNEST '(' expression ')' [ WITH ORDINALITY ] + | TABLE '(' [ SPECIFIC ] functionName '(' expression [, expression ]* ')' ')' + +values: + VALUES expression [, expression ]* + +groupItem: + expression + | '(' ')' + | '(' expression [, expression ]* ')' + | CUBE '(' expression [, expression ]* ')' + | ROLLUP '(' expression [, expression ]* ')' + | GROUPING SETS '(' groupItem [, groupItem ]* ')' + +windowRef: + windowName + | windowSpec + +windowSpec: + [ windowName ] + '(' + [ ORDER BY orderItem [, orderItem ]* ] + [ PARTITION BY expression [, expression ]* ] + [ + RANGE numericOrIntervalExpression { PRECEDING | FOLLOWING } + | ROWS numericExpression { PRECEDING | FOLLOWING } + ] + ')' +{% endhighlight %} + +In *merge*, at least one of the WHEN MATCHED and WHEN NOT MATCHED clauses must +be present. + +In *orderItem*, if *expression* is a positive integer *n*, it denotes +the <em>n</em>th item in the SELECT clause. + +An aggregate query is a query that contains a GROUP BY or a HAVING +clause, or aggregate functions in the SELECT clause. In the SELECT, +HAVING and ORDER BY clauses of an aggregate query, all expressions +must be constant within the current group (that is, grouping constants +as defined by the GROUP BY clause, or constants), or aggregate +functions, or a combination of constants and aggregate +functions. Aggregate and grouping functions may only appear in an +aggregate query, and only in a SELECT, HAVING or ORDER BY clause. + +A scalar sub-query is a sub-query used as an expression. 
+If the sub-query returns no rows, the value is NULL; if it +returns more than one row, it is an error. + +IN, EXISTS and scalar sub-queries can occur +in any place where an expression can occur (such as the SELECT clause, +WHERE clause, ON clause of a JOIN, or as an argument to an aggregate +function). + +An IN, EXISTS or scalar sub-query may be correlated; that is, it +may refer to tables in the FROM clause of an enclosing query. + +## Keywords + +The following is a list of SQL keywords. +Reserved keywords are **bold**. + +{% comment %} start {% endcomment %} +A, +**ABS**, +ABSOLUTE, +ACTION, +ADA, +ADD, +ADMIN, +AFTER, +**ALL**, +**ALLOCATE**, +**ALLOW**, +**ALTER**, +ALWAYS, +**AND**, +**ANY**, +**ARE**, +**ARRAY**, +**AS**, +ASC, +**ASENSITIVE**, +ASSERTION, +ASSIGNMENT, +**ASYMMETRIC**, +**AT**, +**ATOMIC**, +ATTRIBUTE, +ATTRIBUTES, +**AUTHORIZATION**, +**AVG**, +BEFORE, +**BEGIN**, +BERNOULLI, +**BETWEEN**, +**BIGINT**, +**BINARY**, +**BIT**, +**BLOB**, +**BOOLEAN**, +**BOTH**, +BREADTH, +**BY**, +C, +**CALL**, +**CALLED**, +**CARDINALITY**, +CASCADE, +**CASCADED**, +**CASE**, +**CAST**, +CATALOG, +CATALOG_NAME, +**CEIL**, +**CEILING**, +CHAIN, +**CHAR**, +**CHARACTER**, +CHARACTERISTICTS, +CHARACTERS, +**CHARACTER_LENGTH**, +CHARACTER_SET_CATALOG, +CHARACTER_SET_NAME, +CHARACTER_SET_SCHEMA, +**CHAR_LENGTH**, +**CHECK**, +CLASS_ORIGIN, +**CLOB**, +**CLOSE**, +**COALESCE**, +COBOL, +**COLLATE**, +COLLATION, +COLLATION_CATALOG, +COLLATION_NAME, +COLLATION_SCHEMA, +**COLLECT**, +**COLUMN**, +COLUMN_NAME, +COMMAND_FUNCTION, +COMMAND_FUNCTION_CODE, +**COMMIT**, +COMMITTED, +**CONDITION**, +CONDITION_NUMBER, +**CONNECT**, +CONNECTION, +CONNECTION_NAME, +**CONSTRAINT**, +CONSTRAINTS, +CONSTRAINT_CATALOG, +CONSTRAINT_NAME, +CONSTRAINT_SCHEMA, +CONSTRUCTOR, +CONTAINS, +CONTINUE, +**CONVERT**, +**CORR**, +**CORRESPONDING**, +**COUNT**, +**COVAR_POP**, +**COVAR_SAMP**, +**CREATE**, +**CROSS**, +**CUBE**, +**CUME_DIST**, +**CURRENT**, +**CURRENT_CATALOG**, 
+**CURRENT_DATE**, +**CURRENT_DEFAULT_TRANSFORM_GROUP**, +**CURRENT_PATH**, +**CURRENT_ROLE**, +**CURRENT_SCHEMA**, +**CURRENT_TIME**, +**CURRENT_TIMESTAMP**, +**CURRENT_TRANSFORM_GROUP_FOR_TYPE**, +**CURRENT_USER**, +**CURSOR**, +CURSOR_NAME, +**CYCLE**, +DATA, +**DATE**, +DATETIME_INTERVAL_CODE, +DATETIME_INTERVAL_PRECISION, +**DAY**, +**DEALLOCATE**, +**DEC**, +**DECIMAL**, +**DECLARE**, +**DEFAULT**, +DEFAULTS, +DEFERRABLE, +DEFERRED, +DEFINED, +DEFINER, +DEGREE, +**DELETE**, +**DENSE_RANK**, +DEPTH, +**DEREF**, +DERIVED, +DESC, +**DESCRIBE**, +DESCRIPTION, +DESCRIPTOR, +**DETERMINISTIC**, +DIAGNOSTICS, +**DISALLOW**, +**DISCONNECT**, +DISPATCH, +**DISTINCT**, +DOMAIN, +**DOUBLE**, +**DROP**, +**DYNAMIC**, +DYNAMIC_FUNCTION, +DYNAMIC_FUNCTION_CODE, +**EACH**, +**ELEMENT**, +**ELSE**, +**END**, +**END-EXEC**, +EQUALS, +**ESCAPE**, +**EVERY**, +**EXCEPT**, +EXCEPTION, +EXCLUDE, +EXCLUDING, +**EXEC**, +**EXECUTE**, +**EXISTS**, +**EXP**, +**EXPLAIN**, +**EXTEND**, +**EXTERNAL**, +**EXTRACT**, +**FALSE**, +**FETCH**, +**FILTER**, +FINAL, +FIRST, +**FIRST_VALUE**, +**FLOAT**, +**FLOOR**, +FOLLOWING, +**FOR**, +**FOREIGN**, +FORTRAN, +FOUND, +**FREE**, +**FROM**, +**FULL**, +**FUNCTION**, +**FUSION**, +G, +GENERAL, +GENERATED, +**GET**, +**GLOBAL**, +GO, +GOTO, +**GRANT**, +GRANTED, +**GROUP**, +**GROUPING**, +**HAVING**, +HIERARCHY, +**HOLD**, +**HOUR**, +**IDENTITY**, +IMMEDIATE, +IMPLEMENTATION, +**IMPORT**, +**IN**, +INCLUDING, +INCREMENT, +**INDICATOR**, +INITIALLY, +**INNER**, +**INOUT**, +INPUT, +**INSENSITIVE**, +**INSERT**, +INSTANCE, +INSTANTIABLE, +**INT**, +**INTEGER**, +**INTERSECT**, +**INTERSECTION**, +**INTERVAL**, +**INTO**, +INVOKER, +**IS**, +ISOLATION, +JAVA, +**JOIN**, +K, +KEY, +KEY_MEMBER, +KEY_TYPE, +LABEL, +**LANGUAGE**, +**LARGE**, +LAST, +**LAST_VALUE**, +**LATERAL**, +**LEADING**, +**LEFT**, +LENGTH, +LEVEL, +LIBRARY, +**LIKE**, +**LIMIT**, +**LN**, +**LOCAL**, +**LOCALTIME**, +**LOCALTIMESTAMP**, +LOCATOR, +**LOWER**, +M, +MAP, 
+**MATCH**, +MATCHED, +**MAX**, +MAXVALUE, +**MEMBER**, +**MERGE**, +MESSAGE_LENGTH, +MESSAGE_OCTET_LENGTH, +MESSAGE_TEXT, +**METHOD**, +**MIN**, +**MINUTE**, +MINVALUE, +**MOD**, +**MODIFIES**, +**MODULE**, +**MONTH**, +MORE, +**MULTISET**, +MUMPS, +NAME, +NAMES, +**NATIONAL**, +**NATURAL**, +**NCHAR**, +**NCLOB**, +NESTING, +**NEW**, +**NEXT**, +**NO**, +**NONE**, +**NORMALIZE**, +NORMALIZED, +**NOT**, +**NULL**, +NULLABLE, +**NULLIF**, +NULLS, +NUMBER, +**NUMERIC**, +OBJECT, +OCTETS, +**OCTET_LENGTH**, +**OF**, +**OFFSET**, +**OLD**, +**ON**, +**ONLY**, +**OPEN**, +OPTION, +OPTIONS, +**OR**, +**ORDER**, +ORDERING, +ORDINALITY, +OTHERS, +**OUT**, +**OUTER**, +OUTPUT, +**OVER**, +**OVERLAPS**, +**OVERLAY**, +OVERRIDING, +PAD, +**PARAMETER**, +PARAMETER_MODE, +PARAMETER_NAME, +PARAMETER_ORDINAL_POSITION, +PARAMETER_SPECIFIC_CATALOG, +PARAMETER_SPECIFIC_NAME, +PARAMETER_SPECIFIC_SCHEMA, +PARTIAL, +**PARTITION**, +PASCAL, +PASSTHROUGH, +PATH, +**PERCENTILE_CONT**, +**PERCENTILE_DISC**, +**PERCENT_RANK**, +PLACING, +PLAN, +PLI, +**POSITION**, +**POWER**, +PRECEDING, +**PRECISION**, +**PREPARE**, +PRESERVE, +**PRIMARY**, +PRIOR, +PRIVILEGES, +**PROCEDURE**, +PUBLIC, +**RANGE**, +**RANK**, +READ, +**READS**, +**REAL**, +**RECURSIVE**, +**REF**, +**REFERENCES**, +**REFERENCING**, +**REGR_AVGX**, +**REGR_AVGY**, +**REGR_COUNT**, +**REGR_INTERCEPT**, +**REGR_R2**, +**REGR_SLOPE**, +**REGR_SXX**, +**REGR_SXY**, +**REGR_SYY**, +RELATIVE, +**RELEASE**, +REPEATABLE, +**RESET**, +RESTART, +RESTRICT, +**RESULT**, +**RETURN**, +RETURNED_CARDINALITY, +RETURNED_LENGTH, +RETURNED_OCTET_LENGTH, +RETURNED_SQLSTATE, +**RETURNS**, +**REVOKE**, +**RIGHT**, +ROLE, +**ROLLBACK**, +**ROLLUP**, +ROUTINE, +ROUTINE_CATALOG, +ROUTINE_NAME, +ROUTINE_SCHEMA, +**ROW**, +**ROWS**, +ROW_COUNT, +**ROW_NUMBER**, +**SAVEPOINT**, +SCALE, +SCHEMA, +SCHEMA_NAME, +**SCOPE**, +SCOPE_CATALOGS, +SCOPE_NAME, +SCOPE_SCHEMA, +**SCROLL**, +**SEARCH**, +**SECOND**, +SECTION, +SECURITY, +**SELECT**, +SELF, 
+**SENSITIVE**, +SEQUENCE, +SERIALIZABLE, +SERVER, +SERVER_NAME, +SESSION, +**SESSION_USER**, +**SET**, +SETS, +**SIMILAR**, +SIMPLE, +SIZE, +**SMALLINT**, +**SOME**, +SOURCE, +SPACE, +**SPECIFIC**, +**SPECIFICTYPE**, +SPECIFIC_NAME, +**SQL**, +**SQLEXCEPTION**, +**SQLSTATE**, +**SQLWARNING**, +**SQRT**, +**START**, +STATE, +STATEMENT, +**STATIC**, +**STDDEV_POP**, +**STDDEV_SAMP**, +**STREAM**, +STRUCTURE, +STYLE, +SUBCLASS_ORIGIN, +**SUBMULTISET**, +SUBSTITUTE, +**SUBSTRING**, +**SUM**, +**SYMMETRIC**, +**SYSTEM**, +**SYSTEM_USER**, +**TABLE**, +**TABLESAMPLE**, +TABLE_NAME, +TEMPORARY, +**THEN**, +TIES, +**TIME**, +**TIMESTAMP**, +**TIMEZONE_HOUR**, +**TIMEZONE_MINUTE**, +**TINYINT**, +**TO**, +TOP_LEVEL_COUNT, +**TRAILING**, +TRANSACTION, +TRANSACTIONS_ACTIVE, +TRANSACTIONS_COMMITTED, +TRANSACTIONS_ROLLED_BACK, +TRANSFORM, +TRANSFORMS, +**TRANSLATE**, +**TRANSLATION**, +**TREAT**, +**TRIGGER**, +TRIGGER_CATALOG, +TRIGGER_NAME, +TRIGGER_SCHEMA, +**TRIM**, +**TRUE**, +TYPE, +**UESCAPE**, +UNBOUNDED, +UNCOMMITTED, +UNDER, +**UNION**, +**UNIQUE**, +**UNKNOWN**, +UNNAMED, +**UNNEST**, +**UPDATE**, +**UPPER**, +**UPSERT**, +USAGE, +**USER**, +USER_DEFINED_TYPE_CATALOG, +USER_DEFINED_TYPE_CODE, +USER_DEFINED_TYPE_NAME, +USER_DEFINED_TYPE_SCHEMA, +**USING**, +**VALUE**, +**VALUES**, +**VARBINARY**, +**VARCHAR**, +**VARYING**, +**VAR_POP**, +**VAR_SAMP**, +VERSION, +VIEW, +**WHEN**, +**WHENEVER**, +**WHERE**, +**WIDTH_BUCKET**, +**WINDOW**, +**WITH**, +**WITHIN**, +**WITHOUT**, +WORK, +WRAPPER, +WRITE, +XML, +**YEAR**, +ZONE. +{% comment %} end {% endcomment %} + +## Identifiers + +Identifiers are the names of tables, columns and other metadata +elements used in a SQL query. + +Unquoted identifiers, such as emp, must start with a letter and can +only contain letters, digits, and underscores. They are implicitly +converted to upper case. + +Quoted identifiers, such as `"Employee Name"`, start and end with +double quotes. 
They may contain virtually any character, including +spaces and other punctuation. If you wish to include a double quote +in an identifier, use another double quote to escape it, like this: +`"An employee called ""Fred""."`. + +In Calcite, matching identifiers to the name of the referenced object is +case-sensitive. But remember that unquoted identifiers are implicitly +converted to upper case before matching, and if the object it refers +to was created using an unquoted identifier for its name, then its +name will have been converted to upper case also. + +## Data types + +### Scalar types + +| Data type | Description | Range and examples | +|:----------- |:------------------------- |:---------------------| +| BOOLEAN | Logical values | Values: TRUE, FALSE, UNKNOWN +| TINYINT | 1 byte signed integer | Range is -255 to 256 +| SMALLINT | 2 byte signed integer | Range is -32768 to 32767 +| INTEGER, INT | 4 byte signed integer | Range is -2147483648 to 2147483647 +| BIGINT | 8 byte signed integer | Range is -9223372036854775808 to 9223372036854775807 +| DECIMAL(p, s) | Fixed point | Example: 123.45 is a DECIMAL(5, 2) value. 
+| NUMERIC | Fixed point | +| REAL, FLOAT | 4 byte floating point | 6 decimal digits precision +| DOUBLE | 8 byte floating point | 15 decimal digits precision +| CHAR(n), CHARACTER(n) | Fixed-width character string | 'Hello', '' (empty string), _latin1'Hello', n'Hello', _UTF16'Hello', 'Hello' 'there' (literal split into multiple parts) +| VARCHAR(n), CHARACTER VARYING(n) | Variable-length character string | As CHAR(n) +| BINARY(n) | Fixed-width binary string | x'45F0AB', x'' (empty binary string), x'AB' 'CD' (multi-part binary string literal) +| VARBINARY(n), BINARY VARYING(n) | Variable-length binary string | As BINARY(n) +| DATE | Date | Example: DATE '1969-07-20' +| TIME | Time of day | Example: TIME '20:17:40' +| TIMESTAMP [ WITHOUT TIME ZONE ] | Date and time | Example: TIMESTAMP '1969-07-20 20:17:40' +| TIMESTAMP WITH TIME ZONE | Date and time with time zone | Example: TIMESTAMP '1969-07-20 20:17:40 America/Los Angeles' +| INTERVAL timeUnit [ TO timeUnit ] | Date time interval | Examples: INTERVAL '1:5' YEAR TO MONTH, INTERVAL '45' DAY +| Anchored interval | Date time interval | Example: (DATE '1969-07-20', DATE '1972-08-29') + +Where: + +{% highlight sql %} +timeUnit: + YEAR | MONTH | DAY | HOUR | MINUTE | SECOND +{% endhighlight %} + +Note: + +* DATE, TIME and TIMESTAMP have no time zone. There is not even an implicit + time zone, such as UTC (as in Java) or the local time zone. It is left to + the user or application to supply a time zone. 
+ +### Non-scalar types + +| Type | Description +|:-------- |:----------------------------------------------------------- +| ANY | A value of an unknown type +| ROW | Row with 1 or more columns +| MAP | Collection of keys mapped to values +| MULTISET | Unordered collection that may contain duplicates +| ARRAY | Ordered, contiguous collection that may contain duplicates +| CURSOR | Cursor over the result of executing a query + +## Operators and functions + +### Comparison operators + +| Operator syntax | Description +|:------------------------------------------------- |:----------- +| value1 = value2 | Equals +| value1 <> value2 | Not equal +| value1 > value2 | Greater than +| value1 >= value2 | Greater than or equal +| value1 < value2 | Less than +| value1 <= value2 | Less than or equal +| value IS NULL | Whether *value* is null +| value IS NOT NULL | Whether *value* is not null +| value1 IS DISTINCT FROM value2 | Whether two values are not equal, treating null values as the same +| value1 IS NOT DISTINCT FROM value2 | Whether two values are equal, treating null values as the same +| value1 BETWEEN value2 AND value3 | Whether *value1* is greater than or equal to *value2* and less than or equal to *value3* +| value1 NOT BETWEEN value2 AND value3 | Whether *value1* is less than *value2* or greater than *value3* +| string1 LIKE string2 [ ESCAPE string3 ] | Whether *string1* matches pattern *string2* +| string1 NOT LIKE string2 [ ESCAPE string3 ] | Whether *string1* does not match pattern *string2* +| string1 SIMILAR TO string2 [ ESCAPE string3 ] | Whether *string1* matches regular expression *string2* +| string1 NOT SIMILAR TO string2 [ ESCAPE string3 ] | Whether *string1* does not match regular expression *string2* +| value IN (value [, value]* ) | Whether *value* is equal to a value in a list +| value NOT IN (value [, value]* ) | Whether *value* is not equal to every value in a list +| value IN (sub-query) | Whether *value* is equal to a row returned by *sub-query* 
+| value NOT IN (sub-query) | Whether *value* is not equal to every row returned by *sub-query* +| EXISTS (sub-query) | Whether *sub-query* returns at least one row + +### Logical operators + +| Operator syntax | Description +|:---------------------- |:----------- +| boolean1 OR boolean2 | Whether *boolean1* is TRUE or *boolean2* is TRUE +| boolean1 AND boolean2 | Whether *boolean1* and *boolean2* are both TRUE +| NOT boolean | Whether *boolean* is not TRUE; returns UNKNOWN if *boolean* is UNKNOWN +| boolean IS FALSE | Whether *boolean* is FALSE; returns FALSE if *boolean* is UNKNOWN +| boolean IS NOT FALSE | Whether *boolean* is not FALSE; returns TRUE if *boolean* is UNKNOWN +| boolean IS TRUE | Whether *boolean* is TRUE; returns FALSE if *boolean* is UNKNOWN +| boolean IS NOT TRUE | Whether *boolean* is not TRUE; returns TRUE if *boolean* is UNKNOWN +| boolean IS UNKNOWN | Whether *boolean* is UNKNOWN +| boolean IS NOT UNKNOWN | Whether *boolean* is not UNKNOWN + +### Arithmetic operators and functions + +| Operator syntax | Description +|:------------------------- |:----------- +| + numeric | Returns *numeric* +|:- numeric | Returns negative *numeric* +| numeric1 + numeric2 | Returns *numeric1* plus *numeric2* +| numeric1 - numeric2 | Returns *numeric1* minus *numeric2* +| numeric1 * numeric2 | Returns *numeric1* multiplied by *numeric2* +| numeric1 / numeric2 | Returns *numeric1* divided by *numeric2* +| POWER(numeric1, numeric2) | Returns *numeric1* raised to the power of *numeric2* +| ABS(numeric) | Returns the absolute value of *numeric* +| MOD(numeric, numeric) | Returns the remainder (modulus) of *numeric1* divided by *numeric2*. 
The result is negative only if *numeric1* is negative +| SQRT(numeric) | Returns the square root of *numeric* +| LN(numeric) | Returns the natural logarithm (base *e*) of *numeric* +| LOG10(numeric) | Returns the base 10 logarithm of *numeric* +| EXP(numeric) | Returns *e* raised to the power of *numeric* +| CEIL(numeric) | Rounds *numeric* up, and returns the smallest number that is greater than or equal to *numeric* +| FLOOR(numeric) | Rounds *numeric* down, and returns the largest number that is less than or equal to *numeric* + +### Character string operators and functions + +| Operator syntax | Description +|:-------------------------- |:----------- +| string || string | Concatenates two character strings. +| CHAR_LENGTH(string) | Returns the number of characters in a character string +| CHARACTER_LENGTH(string) | As CHAR_LENGTH(*string*) +| UPPER(string) | Returns a character string converted to upper case +| LOWER(string) | Returns a character string converted to lower case +| POSITION(string1 IN string2) | Returns the position of the first occurrence of *string1* in *string2* +| TRIM( { BOTH | LEADING | TRAILING } string1 FROM string2) | Removes the longest string containing only the characters in *string1* from the start/end/both ends of *string1* +| OVERLAY(string1 PLACING string2 FROM integer [ FOR integer2 ]) | Replaces a substring of *string1* with *string2* +| SUBSTRING(string FROM integer) | Returns a substring of a character string starting at a given point. +| SUBSTRING(string FROM integer FOR integer) | Returns a substring of a character string starting at a given point with a given length. +| INITCAP(string) | Returns *string* with the first letter of each word converter to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters. 
+ +Not implemented: + +* SUBSTRING(string FROM regexp FOR regexp) + +### Binary string operators and functions + +| Operator syntax | Description +|:--------------- |:----------- +| binary || binary | Concatenates two binary strings. +| POSITION(binary1 IN binary2) | Returns the position of the first occurrence of *binary1* in *binary2* +| OVERLAY(binary1 PLACING binary2 FROM integer [ FOR integer2 ]) | Replaces a substring of *binary1* with *binary2* +| SUBSTRING(binary FROM integer) | Returns a substring of *binary* starting at a given point +| SUBSTRING(binary FROM integer FOR integer) | Returns a substring of *binary* starting at a given point with a given length + +### Date/time functions + +| Operator syntax | Description +|:------------------------- |:----------- +| LOCALTIME | Returns the current date and time in the session time zone in a value of datatype TIME +| LOCALTIME(precision) | Returns the current date and time in the session time zone in a value of datatype TIME, with *precision* digits of precision +| LOCALTIMESTAMP | Returns the current date and time in the session time zone in a value of datatype TIMESTAMP +| LOCALTIMESTAMP(precision) | Returns the current date and time in the session time zone in a value of datatype TIMESTAMP, with *precision* digits of precision +| CURRENT_TIME | Returns the current time in the session time zone, in a value of datatype TIMESTAMP WITH TIME ZONE +| CURRENT_DATE | Returns the current date in the session time zone, in a value of datatype DATE +| CURRENT_TIMESTAMP | Returns the current date and time in the session time zone, in a value of datatype TIMESTAMP WITH TIME ZONE +| EXTRACT(timeUnit FROM datetime) | Extracts and returns the value of a specified datetime field from a datetime value expression +| FLOOR(datetime TO timeUnit) | Rounds *datetime* down to *timeUnit* +| CEIL(datetime TO timeUnit) | Rounds *datetime* up to *timeUnit* + +Not implemented: + +* EXTRACT(timeUnit FROM interval) +* CEIL(interval) +* 
FLOOR(interval) +* datetime - datetime timeUnit [ TO timeUnit ] +* interval OVERLAPS interval +* \+ interval +* \- interval +* interval + interval +* interval - interval +* interval / interval +* datetime + interval +* datetime - interval + +### System functions + +| Operator syntax | Description +|:--------------- |:----------- +| USER | Equivalent to CURRENT_USER +| CURRENT_USER | User name of current execution context +| SESSION_USER | Session user name +| SYSTEM_USER | Returns the name of the current data store user as identified by the operating system +| CURRENT_PATH | Returns a character string representing the current lookup scope for references to user-defined routines and types +| CURRENT_ROLE | Returns the current active role + +### Conditional functions and operators + +| Operator syntax | Description +|:--------------- |:----------- +| CASE value<br/>WHEN value1 [, value11 ]* THEN result1<br/>[ WHEN valueN [, valueN1 ]* THEN resultN ]*<br/>[ ELSE resultZ ]<br/> END | Simple case +| CASE<br/>WHEN condition1 THEN result1<br/>[ WHEN conditionN THEN resultN ]*<br/>[ ELSE resultZ ]<br/>END | Searched case +| NULLIF(value, value) | Returns NULL if the values are the same.<br/><br/>For example, <code>NULLIF(5, 5)</code> returns NULL; <code>NULLIF(5, 0)</code> returns 5. +| COALESCE(value, value [, value ]* ) | Provides a value if the first value is null.<br/><br/>For example, <code>COALESCE(NULL, 5)</code> returns 5. + +### Type conversion + +| Operator syntax | Description +|:--------------- | :---------- +| CAST(value AS type) | Converts a value to a given type. + +### Value constructors + +| Operator syntax | Description +|:--------------- |:----------- +| ROW (value [, value]* ) | Creates a row from a list of values. +| (value [, value]* ) | Creates a row from a list of values. +| map '[' key ']' | Returns the element of a map with a particular key. +| array '[' index ']' | Returns the element at a particular location in an array. 
+| ARRAY '[' value [, value ]* ']' | Creates an array from a list of values. +| MAP '[' key, value [, key, value ]* ']' | Creates a map from a list of key-value pairs. + +### Collection functions + +| Operator syntax | Description +|:--------------- |:----------- +| ELEMENT(value) | Returns the sole element of a array or multiset; null if the collection is empty; throws if it has more than one element. +| CARDINALITY(value) | Returns the number of elements in an array or multiset. + +See also: UNNEST relational operator converts a collection to a relation. + +### JDBC function escape + +#### Numeric + +| Operator syntax | Description +|:------------------------------ |:----------- +| {fn LOG10(numeric)} | Returns the base-10 logarithm of *numeric* +| {fn POWER(numeric1, numeric2)} | Returns *numeric1* raised to the power of *numeric2* + +Not implemented: + +* {fn ABS(numeric)} - Returns the absolute value of *numeric* +* {fn ACOS(numeric)} - Returns the arc cosine of *numeric* +* {fn ASIN(numeric)} - Returns the arc sine of *numeric* +* {fn ATAN(numeric)} - Returns the arc tangent of *numeric* +* {fn ATAN2(numeric, numeric)} +* {fn CEILING(numeric)} - Rounds *numeric* up, and returns the smallest number that is greater than or equal to *numeric* +* {fn COS(numeric)} - Returns the cosine of *numeric* +* {fn COT(numeric)} +* {fn DEGREES(numeric)} - Converts *numeric* from radians to degrees +* {fn EXP(numeric)} - Returns *e* raised to the power of *numeric* +* {fn FLOOR(numeric)} - Rounds *numeric* down, and returns the largest number that is less than or equal to *numeric* +* {fn LOG(numeric)} - Returns the natural logarithm (base *e*) of *numeric* +* {fn MOD(numeric1, numeric2)} - Returns the remainder (modulus) of *numeric1* divided by *numeric2*. 
The result is negative only if *numeric1* is negative +* {fn PI()} - Returns a value that is closer than any other value to *pi* +* {fn RADIANS(numeric)} - Converts *numeric* from degrees to radians +* {fn RAND(numeric)} +* {fn ROUND(numeric, numeric)} +* {fn SIGN(numeric)} +* {fn SIN(numeric)} - Returns the sine of *numeric* +* {fn SQRT(numeric)} - Returns the square root of *numeric* +* {fn TAN(numeric)} - Returns the tangent of *numeric* +* {fn TRUNCATE(numeric, numeric)} + +#### String + +| Operator syntax | Description +|:--------------- |:----------- +| {fn LOCATE(string1, string2)} | Returns the position in *string2* of the first occurrence of *string1*. Searches from the beginning of the second CharacterExpression, unless the startIndex parameter is specified. +| {fn INSERT(string1, start, length, string2)} | Inserts *string2* into a slot in *string1* +| {fn LCASE(string)} | Returns a string in which all alphabetic characters in *string* have been converted to lower case + +Not implemented: + +* {fn ASCII(string)} - Convert a single-character string to the corresponding ASCII code, an integer between 0 and 255 +* {fn CHAR(string)} +* {fn CONCAT(character, character)} - Returns the concatenation of character strings +* {fn DIFFERENCE(string, string)} +* {fn LEFT(string, integer)} +* {fn LENGTH(string)} +* {fn LOCATE(string1, string2 [, integer])} - Returns the position in *string2* of the first occurrence of *string1*. Searches from the beginning of *string2*, unless *integer* is specified. 
+* {fn LTRIM(string)} +* {fn REPEAT(string, integer)} +* {fn REPLACE(string, string, string)} +* {fn RIGHT(string, integer)} +* {fn RTRIM(string)} +* {fn SOUNDEX(string)} +* {fn SPACE(integer)} +* {fn SUBSTRING(string, integer, integer)} +* {fn UCASE(string)} - Returns a string in which all alphabetic characters in *string* have been converted to upper case + +#### Date/time + +Not implemented: + +* {fn CURDATE()} +* {fn CURTIME()} +* {fn DAYNAME(date)} +* {fn DAYOFMONTH(date)} +* {fn DAYOFWEEK(date)} +* {fn DAYOFYEAR(date)} +* {fn HOUR(time)} +* {fn MINUTE(time)} +* {fn MONTH(date)} +* {fn MONTHNAME(date)} +* {fn NOW()} +* {fn QUARTER(date)} +* {fn SECOND(time)} +* {fn TIMESTAMPADD(interval, count, timestamp)} +* {fn TIMESTAMPDIFF(interval, timestamp, timestamp)} +* {fn WEEK(date)} +* {fn YEAR(date)} + +#### System + +Not implemented: + +* {fn DATABASE()} +* {fn IFNULL(value, value)} +* {fn USER(value, value)} +* {fn CONVERT(value, type)} + +### Aggregate functions + +Syntax: + +{% highlight sql %} +aggregateCall: + agg( [ DISTINCT ] value [, value]* ) [ FILTER ( WHERE condition ) ] + | agg(*) [ FILTER ( WHERE condition ) ] +{% endhighlight %} + +If `FILTER` is present, the aggregate function only considers rows for which +*condition* evaluates to TRUE. + +If `DISTINCT` is present, duplicate argument values are eliminated before being +passed to the aggregate function. 
+ +| Operator syntax | Description +|:---------------------------------- |:----------- +| COLLECT( [ DISTINCT ] value) | Returns a multiset of the values +| COUNT( [ DISTINCT ] value [, value]* ) | Returns the number of input rows for which *value* is not null (wholly not null if *value* is composite) +| COUNT(*) | Returns the number of input rows +| AVG( [ DISTINCT ] numeric) | Returns the average (arithmetic mean) of *numeric* across all input values +| SUM( [ DISTINCT ] numeric) | Returns the sum of *numeric* across all input values +| MAX( [ DISTINCT ] value) | Returns the maximum value of *value* across all input values +| MIN( [ DISTINCT ] value) | Returns the minimum value of *value* across all input values +| STDDEV_POP( [ DISTINCT ] numeric) | Returns the population standard deviation of *numeric* across all input values +| STDDEV_SAMP( [ DISTINCT ] numeric) | Returns the sample standard deviation of *numeric* across all input values +| VAR_POP( [ DISTINCT ] value) | Returns the population variance (square of the population standard deviation) of *numeric* across all input values +| VAR_SAMP( [ DISTINCT ] numeric) | Returns the sample variance (square of the sample standard deviation) of *numeric* across all input values +| COVAR_POP(numeric1, numeric2) | Returns the population covariance of the pair (*numeric1*, *numeric2*) across all input values +| COVAR_SAMP(numeric1, numeric2) | Returns the sample covariance of the pair (*numeric1*, *numeric2*) across all input values +| REGR_SXX(numeric1, numeric2) | Returns the sum of squares of the dependent expression in a linear regression model +| REGR_SYY(numeric1, numeric2) | Returns the sum of squares of the independent expression in a linear regression model + +Not implemented: + +* REGR_AVGX(numeric1, numeric2) +* REGR_AVGY(numeric1, numeric2) +* REGR_COUNT(numeric1, numeric2) +* REGR_INTERCEPT(numeric1, numeric2) +* REGR_R2(numeric1, numeric2) +* REGR_SLOPE(numeric1, numeric2) +* REGR_SXY(numeric1, 
numeric2) + +### Window functions + +| Operator syntax | Description +|:----------------------------------------- |:----------- +| COUNT(value [, value ]* ) OVER window | Returns the number of rows in *window* for which *value* is not null (wholly not null if *value* is composite) +| COUNT(*) OVER window | Returns the number of rows in *window* +| AVG(numeric) OVER window | Returns the average (arithmetic mean) of *numeric* across all values in *window* +| SUM(numeric) OVER window | Returns the sum of *numeric* across all values in *window* +| MAX(value) OVER window | Returns the maximum value of *value* across all values in *window* +| MIN(value) OVER window | Returns the minimum value of *value* across all values in *window* +| RANK() OVER window | Returns the rank of the current row with gaps; same as ROW_NUMBER of its first peer +| DENSE_RANK() OVER window | Returns the rank of the current row without gaps; this function counts peer groups +| ROW_NUMBER() OVER window | Returns the number of the current row within its partition, counting from 1 +| FIRST_VALUE(value) OVER window | Returns *value* evaluated at the row that is the first row of the window frame +| LAST_VALUE(value) OVER window | Returns *value* evaluated at the row that is the last row of the window frame +| LEAD(value, offset, default) OVER window | Returns *value* evaluated at the row that is *offset* rows after the current row within the partition; if there is no such row, instead returns *default*. Both *offset* and *default* are evaluated with respect to the current row. If omitted, *offset* defaults to 1 and *default* to NULL +| LAG(value, offset, default) OVER window | Returns *value* evaluated at the row that is *offset* rows before the current row within the partition; if there is no such row, instead returns *default*. Both *offset* and *default* are evaluated with respect to the current row. 
If omitted, *offset* defaults to 1 and *default* to NULL +| NTILE(value) OVER window | Returns an integer ranging from 1 to *value*, dividing the partition as equally as possible + +Not implemented: + +* COUNT(DISTINCT value) OVER window +* FIRST_VALUE(value) IGNORE NULLS OVER window +* LAST_VALUE(value) IGNORE NULLS OVER window +* PERCENT_RANK(value) OVER window +* CUME_DIST(value) OVER window +* NTH_VALUE(value, nth) OVER window + +### Grouping functions + +| Operator syntax | Description +|:-------------------- |:----------- +| GROUPING(expression) | Returns 1 if expression is rolled up in the current row's grouping set, 0 otherwise +| GROUP_ID() | Returns an integer that uniquely identifies the combination of grouping keys +| GROUPING_ID(expression [, expression ] * ) | Returns a bit vector of the given grouping expressions + +### User-defined functions + +Calcite is extensible. You can define each kind of function using user code. +For each kind of function there are often several ways to define a function, +varying from convenient to efficient. + +To implement a *scalar function*, there are 3 options: + +* Create a class with a public static `eval` method, + and register the class; +* Create a class with a public non-static `eval` method, + and a public constructor with no arguments, + and register the class; +* Create a class with one or more public static methods, + and register each class/method combination. + +To implement an *aggregate function*, there are 2 options: + +* Create a class with public static `init`, `add` and `result` methods, + and register the class; +* Create a class with public non-static `init`, `add` and `result` methods, + and a public constructor with no arguments, + and register the class. + +Optionally, add a public `merge` method to the class; this allows Calcite to +generate code that merges sub-totals. 
+ +Optionally, make your class implement the +[SqlSplittableAggFunction]({{ site.apiRoot }}/org/apache/calcite/sql/SqlSplittableAggFunction.html) +interface; this allows Calcite to decompose the function across several stages +of aggregation, roll up from summary tables, and push it through joins. + +To implement a *table function*, there are 3 options: + +* Create a class with a static `eval` method that returns + [ScannableTable]({{ site.apiRoot }}/org/apache/calcite/schema/ScannableTable.html) + or + [QueryableTable]({{ site.apiRoot }}/org/apache/calcite/schema/QueryableTable.html), + and register the class; +* Create a class with a non-static `eval` method that returns + [ScannableTable]({{ site.apiRoot }}/org/apache/calcite/schema/ScannableTable.html) + or + [QueryableTable]({{ site.apiRoot }}/org/apache/calcite/schema/QueryableTable.html), + and register the class; +* Create a class with one or more public static methods that return + [ScannableTable]({{ site.apiRoot }}/org/apache/calcite/schema/ScannableTable.html) + or + [QueryableTable]({{ site.apiRoot }}/org/apache/calcite/schema/QueryableTable.html), + and register each class/method combination. + +To implement a *table macro*, there are 3 options: + +* Create a class with a static `eval` method that returns + [TranslatableTable]({{ site.apiRoot }}/org/apache/calcite/schema/TranslatableTable.html), + and register the class; +* Create a class with a non-static `eval` method that returns + [TranslatableTable]({{ site.apiRoot }}/org/apache/calcite/schema/TranslatableTable.html), + and register the class; +* Create a class with one or more public static methods that return + [TranslatableTable]({{ site.apiRoot }}/org/apache/calcite/schema/TranslatableTable.html), + and register each class/method combination. + +Calcite deduces the parameter types and result type of a function from the +parameter and return types of the Java method that implements it. 
Further, you +can specify the name and optionality of each parameter using the +[Parameter]({{ site.apiRoot }}/org/apache/calcite/linq4j/function/Parameter.html) +annotation. + +### Calling functions with named and optional parameters + +Usually when you call a function, you need to specify all of its parameters, +in order. But that can be a problem if a function has a lot of parameters, +and especially if you want to add more parameters over time. + +To solve this problem, the SQL standard allows you to pass parameters by name, +and to define parameters which are optional (that is, have a default value +that is used if they are not specified). + +Suppose you have a function `f`, declared as in the following pseudo syntax: + +```FUNCTION f( + INTEGER a, + INTEGER b DEFAULT NULL, + INTEGER c, + INTEGER d DEFAULT NULL, + INTEGER e DEFAULT NULL) RETURNS INTEGER``` + +All of the function's parameters have names, and parameters `b`, `d` and `e` +have a default value of `NULL` and are therefore optional. +(In Calcite, `NULL` is the only allowable default value for optional parameters; +this may change +[in future](https://issues.apache.org/jira/browse/CALCITE-947).) + +When calling a function with optional parameters, +you can omit optional arguments at the end of the list, or use the `DEFAULT` +keyword for any optional arguments. +Here are some examples: + +* `f(1, 2, 3, 4, 5)` provides a value to each parameter, in order; +* `f(1, 2, 3, 4)` omits `e`, which gets its default value, `NULL`; +* `f(1, DEFAULT, 3)` omits `d` and `e`, + and specifies to use the default value of `b`; +* `f(1, DEFAULT, 3, DEFAULT, DEFAULT)` has the same effect as the previous + example; +* `f(1, 2)` is not legal, because `c` is not optional; +* `f(1, 2, DEFAULT, 4)` is not legal, because `c` is not optional. + +You can specify arguments by name using the `=>` syntax. +If one argument is named, they all must be. 
+Arguments may be in any order, but must not specify any argument more than once,
+and you need to provide a value for every parameter which is not optional.
+Here are some examples:
+
+* `f(c => 3, d => 1, a => 0)` is equivalent to `f(0, NULL, 3, 1, NULL)`;
+* `f(c => 3, d => 1)` is not legal, because you have not specified a value for
+  `a` and `a` is not optional.
+
http://git-wip-us.apache.org/repos/asf/calcite/blob/5cee486f/avatica/site/_docs/stream.md ---------------------------------------------------------------------- diff --git a/avatica/site/_docs/stream.md b/avatica/site/_docs/stream.md new file mode 100644 index 0000000..f66fbce --- /dev/null +++ b/avatica/site/_docs/stream.md @@ -0,0 +1,1023 @@ +--- +layout: docs +title: Streaming +permalink: /docs/stream.html +--- +<!-- +{% comment %} +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to you under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +{% endcomment %} +--> + +Calcite has extended SQL and relational algebra in order to support +streaming queries. + +* TOC +{:toc} + +## Introduction + +Streams are collections to records that flow continuously, and forever. +Unlike tables, they are not typically stored on disk, but flow over the +network and are held for short periods of time in memory. + +Streams complement tables because they represent what is happening in the +present and future of the enterprise whereas tables represent the past. +It is very common for a stream to be archived into a table. + +Like tables, you often want to query streams in a high-level language +based on relational algebra, validated according to a schema, and optimized +to take advantage of available resources and algorithms. 
+ +Calcite's SQL is an extension to standard SQL, not another 'SQL-like' language. +The distinction is important, for several reasons: + +* Streaming SQL is easy to learn for anyone who knows regular SQL. +* The semantics are clear, because we aim to produce the same results on a + stream as if the same data were in a table. +* You can write queries that combine streams and tables (or the history of + a stream, which is basically an in-memory table). +* Lots of existing tools can generate standard SQL. + +If you don't use the `STREAM` keyword, you are back in regular +standard SQL. + +## An example schema + +Our streaming SQL examples use the following schema: + +* `Orders (rowtime, productId, orderId, units)` - a stream and a table +* `Products (rowtime, productId, name)` - a table +* `Shipments (rowtime, orderId)` - a stream + +## A simple query + +Let's start with the simplest streaming query: + +{% highlight sql %} +SELECT STREAM * +FROM Orders; + + rowtime | productId | orderId | units +----------+-----------+---------+------- + 10:17:00 | 30 | 5 | 4 + 10:17:05 | 10 | 6 | 1 + 10:18:05 | 20 | 7 | 2 + 10:18:07 | 30 | 8 | 20 + 11:02:00 | 10 | 9 | 6 + 11:04:00 | 10 | 10 | 1 + 11:09:30 | 40 | 11 | 12 + 11:24:11 | 10 | 12 | 4 +{% endhighlight %} + +This query reads all columns and rows from the `Orders` stream. +Like any streaming query, it never terminates. It outputs a record whenever +a record arrives in `Orders`. + +Type `Control-C` to terminate the query. + +The `STREAM` keyword is the main extension in streaming SQL. It tells the +system that you are interested in incoming orders, not existing ones. The query + +{% highlight sql %} +SELECT * +FROM Orders; + + rowtime | productId | orderId | units +----------+-----------+---------+------- + 08:30:00 | 10 | 1 | 3 + 08:45:10 | 20 | 2 | 1 + 09:12:21 | 10 | 3 | 10 + 09:27:44 | 30 | 4 | 2 + +4 records returned. +{% endhighlight %} + +is also valid, but will print out all existing orders and then terminate. 
We
+call it a *relational* query, as opposed to *streaming*. It has traditional
+SQL semantics.
+
+`Orders` is special, in that it has both a stream and a table. If you try to run
+a streaming query on a table, or a relational query on a stream, Calcite gives
+an error:
+
+{% highlight sql %}
+SELECT * FROM Shipments;
+
+ERROR: Cannot convert stream 'SHIPMENTS' to a table
+
+SELECT STREAM * FROM Products;
+
+ERROR: Cannot convert table 'PRODUCTS' to a stream
+{% endhighlight %}
+
+# Filtering rows
+
+Just as in regular SQL, you use a `WHERE` clause to filter rows:
+
+{% highlight sql %}
+SELECT STREAM *
+FROM Orders
+WHERE units > 3;
+
+  rowtime | productId | orderId | units
+----------+-----------+---------+-------
+ 10:17:00 |        30 |       5 |     4
+ 10:18:07 |        30 |       8 |    20
+ 11:02:00 |        10 |       9 |     6
+ 11:09:30 |        40 |      11 |    12
+ 11:24:11 |        10 |      12 |     4
+{% endhighlight %}
+
+# Projecting expressions
+
+Use expressions in the `SELECT` clause to choose which columns to return or
+compute expressions:
+
+{% highlight sql %}
+SELECT STREAM rowtime,
+  'An order for ' || units || ' '
+  || CASE units WHEN 1 THEN 'unit' ELSE 'units' END
+  || ' of product #' || productId AS description
+FROM Orders;
+
+  rowtime | description
+----------+---------------------------------------
+ 10:17:00 | An order for 4 units of product #30
+ 10:17:05 | An order for 1 unit of product #10
+ 10:18:05 | An order for 2 units of product #20
+ 10:18:07 | An order for 20 units of product #30
+ 11:02:00 | An order for 6 units of product #10
+ 11:04:00 | An order for 1 unit of product #10
+ 11:09:30 | An order for 12 units of product #40
+ 11:24:11 | An order for 4 units of product #10
+{% endhighlight %}
+
+We recommend that you always include the `rowtime` column in the `SELECT`
+clause. Having a sorted timestamp in each stream and streaming query makes it
+possible to do advanced calculations later, such as `GROUP BY` and `JOIN`. 
+ +# Tumbling windows + +There are several ways to compute aggregate functions on streams. The +differences are: + +* How many rows come out for each row in? +* Does each incoming value appear in one total, or more? +* What defines the "window", the set of rows that contribute to a given output row? +* Is the result a stream or a relation? + +There are various window types: + +* tumbling window (GROUP BY) +* hopping window (multi GROUP BY) +* sliding window (window functions) +* cascading window (window functions) + +and the following diagram shows the kinds of query in which to use them: + + + +First we'll look a *tumbling window*, which is defined by a streaming +`GROUP BY`. Here is an example: + +{% highlight sql %} +SELECT STREAM CEIL(rowtime TO HOUR) AS rowtime, + productId, + COUNT(*) AS c, + SUM(units) AS units +FROM Orders +GROUP BY CEIL(rowtime TO HOUR), productId; + + rowtime | productId | c | units +----------+-----------+---------+------- + 11:00:00 | 30 | 2 | 24 + 11:00:00 | 10 | 1 | 1 + 11:00:00 | 20 | 1 | 7 + 12:00:00 | 10 | 3 | 11 + 12:00:00 | 40 | 1 | 12 +{% endhighlight %} + +The result is a stream. At 11 o'clock, Calcite emits a sub-total for every +`productId` that had an order since 10 o'clock, timestamped 11 o'clock. +At 12 o'clock, it will emit +the orders that occurred between 11:00 and 12:00. Each input row contributes to +only one output row. + +How did Calcite know that the 10:00:00 sub-totals were complete at 11:00:00, +so that it could emit them? It knows that `rowtime` is increasing, and it knows +that `CEIL(rowtime TO HOUR)` is also increasing. So, once it has seen a row +at or after 11:00:00, it will never see a row that will contribute to a 10:00:00 +total. + +A column or expression that is increasing or decreasing is said to be +*monotonic*. 
+ +If column or expression has values that are slightly out of order, +and the stream has a mechanism (such as punctuation or watermarks) +to declare that a particular value will never be seen again, then +the column or expression is said to be *quasi-monotonic*. + +Without a monotonic or quasi-monotonic expression in the `GROUP BY` clause, +Calcite is +not able to make progress, and it will not allow the query: + +{% highlight sql %} +SELECT STREAM productId, + COUNT(*) AS c, + SUM(units) AS units +FROM Orders +GROUP BY productId; + +ERROR: Streaming aggregation requires at least one monotonic expression in GROUP BY clause +{% endhighlight %} + +Monotonic and quasi-monotonic columns need to be declared in the schema. +The monotonicity is +enforced when records enter the stream and assumed by queries that read from +that stream. We recommend that you give each stream a timestamp column called +`rowtime`, but you can declare others to be monotonic, `orderId`, for example. + +We discuss punctuation, watermarks, and other ways of making progress +<a href="#punctuation">below</a>. + +# Tumbling windows, improved + +The previous example of tumbling windows was easy to write because the window +was one hour. For intervals that are not a whole time unit, say 2 hours or +2 hours and 17 minutes, you cannot use `CEIL`, and the expression gets more +complicated. + +Calcite supports an alternative syntax for tumbling windows: + +{% highlight sql %} +SELECT STREAM TUMBLE_END(rowtime, INTERVAL '1' HOUR) AS rowtime, + productId, + COUNT(*) AS c, + SUM(units) AS units +FROM Orders +GROUP BY TUMBLE(rowtime, INTERVAL '1' HOUR), productId; + + rowtime | productId | c | units +----------+-----------+---------+------- + 11:00:00 | 30 | 2 | 24 + 11:00:00 | 10 | 1 | 1 + 11:00:00 | 20 | 1 | 7 + 12:00:00 | 10 | 3 | 11 + 12:00:00 | 40 | 1 | 12 +{% endhighlight %} + +As you can see, it returns the same results as the previous query. 
The `TUMBLE` +function returns a grouping key that is the same for all the rows that will end +up in a given summary row; the `TUMBLE_END` function takes the same arguments +and returns the time at which that window ends; +there is also a `TUMBLE_START` function. + +`TUMBLE` has an optional parameter to align the window. +In the following example, +we use a 30 minute interval and 0:12 as the alignment time, +so the query emits summaries at 12 and 42 minutes past each hour: + +{% highlight sql %} +SELECT STREAM + TUMBLE_END(rowtime, INTERVAL '30' MINUTE, TIME '0:12') AS rowtime, + productId, + COUNT(*) AS c, + SUM(units) AS units +FROM Orders +GROUP BY TUMBLE(rowtime, INTERVAL '30' MINUTE, TIME '0:12'), + productId; + + rowtime | productId | c | units +----------+-----------+---------+------- + 10:42:00 | 30 | 2 | 24 + 10:42:00 | 10 | 1 | 1 + 10:42:00 | 20 | 1 | 7 + 11:12:00 | 10 | 2 | 7 + 11:12:00 | 40 | 1 | 12 + 11:42:00 | 10 | 1 | 4 +{% endhighlight %} + +# Hopping windows + +Hopping windows are a generalization of tumbling windows that allow data to +be kept in a window for a longer than the emit interval. + +For example, the following query emits a row timestamped 11:00 containing data +from 08:00 to 11:00 (or 10:59.9 if we're being pedantic), +and a row timestamped 12:00 containing data from 09:00 +to 12:00. + +{% highlight sql %} +SELECT STREAM + HOP_END(rowtime, INTERVAL '1' HOUR, INTERVAL '3' HOUR) AS rowtime, + COUNT(*) AS c, + SUM(units) AS units +FROM Orders +GROUP BY HOP(rowtime, INTERVAL '1' HOUR, INTERVAL '3' HOUR); + + rowtime | c | units +----------+----------+------- + 11:00:00 | 4 | 27 + 12:00:00 | 8 | 50 +{% endhighlight %} + +In this query, because the retain period is 3 times the emit period, every input +row contributes to exactly 3 output rows. Imagine that the `HOP` function +generates a collection of group keys for incoming row, and places its values +in the accumulators of each of those group keys. 
For example,
+`HOP(10:18:00, INTERVAL '1' HOUR, INTERVAL '3' HOUR)` generates 3 periods
+
+```
+[08:00, 11:00)
+[09:00, 12:00)
+[10:00, 13:00)
+```
+
+This raises the possibility of allowing user-defined partitioning functions
+for users who are not happy with the built-in functions `HOP` and `TUMBLE`.
+
+We can build complex expressions such as an exponentially decaying
+moving average:
+
+{% highlight sql %}
+SELECT STREAM HOP_END(rowtime),
+  productId,
+  SUM(unitPrice * EXP((rowtime - HOP_START(rowtime)) SECOND / INTERVAL '1' HOUR))
+   / SUM(EXP((rowtime - HOP_START(rowtime)) SECOND / INTERVAL '1' HOUR))
+FROM Orders
+GROUP BY HOP(rowtime, INTERVAL '1' SECOND, INTERVAL '1' HOUR),
+  productId
+{% endhighlight %}
+
+Emits:
+
+* a row at `11:00:00` containing rows in `[10:00:00, 11:00:00)`;
+* a row at `11:00:01` containing rows in `[10:00:01, 11:00:01)`.
+
+The expression weighs recent orders more heavily than older orders.
+Extending the window from 1 hour to 2 hours or 1 year would have
+virtually no effect on the accuracy of the result (but use more memory
+and compute).
+
+Note that we use `HOP_START` inside an aggregate function (`SUM`) because it
+is a value that is constant for all rows within a sub-total. This
+would not be allowed for typical aggregate functions (`SUM`, `COUNT`
+etc.).
+
+If you are familiar with `GROUPING SETS`, you may notice that partitioning
+functions can be seen as a generalization of `GROUPING SETS`, in that they
+allow an input row to contribute to multiple sub-totals.
+The auxiliary functions for `GROUPING SETS`,
+such as `GROUPING()` and `GROUP_ID()`,
+can be used inside aggregate functions, so it is not surprising that
+`HOP_START` and `HOP_END` can be used in the same way.
+
+# GROUPING SETS
+
+`GROUPING SETS` is valid for a streaming query provided that every
+grouping set contains a monotonic or quasi-monotonic expression. 
+ +`CUBE` and `ROLLUP` are not valid for streaming query, because they will +produce at least one grouping set that aggregates everything (like +`GROUP BY ()`). + +# Filtering after aggregation + +As in standard SQL, you can apply a `HAVING` clause to filter rows emitted by +a streaming `GROUP BY`: + +{% highlight sql %} +SELECT STREAM TUMBLE_END(rowtime, INTERVAL '1' HOUR) AS rowtime, + productId +FROM Orders +GROUP BY TUMBLE(rowtime, INTERVAL '1' HOUR), productId +HAVING COUNT(*) > 2 OR SUM(units) > 10; + + rowtime | productId +----------+----------- + 10:00:00 | 30 + 11:00:00 | 10 + 11:00:00 | 40 +{% endhighlight %} + +# Sub-queries, views and SQL's closure property + +The previous `HAVING` query can be expressed using a `WHERE` clause on a +sub-query: + +{% highlight sql %} +SELECT STREAM rowtime, productId +FROM ( + SELECT TUMBLE_END(rowtime, INTERVAL '1' HOUR) AS rowtime, + productId, + COUNT(*) AS c, + SUM(units) AS su + FROM Orders + GROUP BY TUMBLE(rowtime, INTERVAL '1' HOUR), productId) +WHERE c > 2 OR su > 10; + + rowtime | productId +----------+----------- + 10:00:00 | 30 + 11:00:00 | 10 + 11:00:00 | 40 +{% endhighlight %} + +`HAVING` was introduced in the early days of SQL, when a way was needed to +perform a filter *after* aggregation. (Recall that `WHERE` filters rows before +they enter the `GROUP BY` clause.) + +Since then, SQL has become a mathematically closed language, which means that +any operation you can perform on a table can also perform on a query. + +The *closure property* of SQL is extremely powerful. 
Not only does it render +`HAVING` obsolete (or, at least, reduce it to syntactic sugar), it makes views +possible: + +{% highlight sql %} +CREATE VIEW HourlyOrderTotals (rowtime, productId, c, su) AS + SELECT TUMBLE_END(rowtime, INTERVAL '1' HOUR), + productId, + COUNT(*), + SUM(units) + FROM Orders + GROUP BY TUMBLE(rowtime, INTERVAL '1' HOUR), productId; + +SELECT STREAM rowtime, productId +FROM HourlyOrderTotals +WHERE c > 2 OR su > 10; + + rowtime | productId +----------+----------- + 10:00:00 | 30 + 11:00:00 | 10 + 11:00:00 | 40 +{% endhighlight %} + +Sub-queries in the `FROM` clause are sometimes referred to as "inline views", +but really, they are more fundamental than views. Views are just a convenient +way to carve your SQL into manageable chunks by giving the pieces names and +storing them in the metadata repository. + +Many people find that nested queries and views are even more useful on streams +than they are on relations. Streaming queries are pipelines of +operators all running continuously, and often those pipelines get quite long. +Nested queries and views help to express and manage those pipelines. + +And, by the way, a `WITH` clause can accomplish the same as a sub-query or +a view: + +{% highlight sql %} +WITH HourlyOrderTotals (rowtime, productId, c, su) AS ( + SELECT TUMBLE_END(rowtime, INTERVAL '1' HOUR), + productId, + COUNT(*), + SUM(units) + FROM Orders + GROUP BY TUMBLE(rowtime, INTERVAL '1' HOUR), productId) +SELECT STREAM rowtime, productId +FROM HourlyOrderTotals +WHERE c > 2 OR su > 10; + + rowtime | productId +----------+----------- + 10:00:00 | 30 + 11:00:00 | 10 + 11:00:00 | 40 +{% endhighlight %} + +# Converting between streams and relations + +Look back at the definition of the `HourlyOrderTotals` view. +Is the view a stream or a relation? + +It does not contain the `STREAM` keyword, so it is a relation. +However, it is a relation that can be converted into a stream. 
+ +You can use it in both relational and streaming queries: + +{% highlight sql %} +# A relation; will query the historic Orders table. +# Returns the largest number of product #10 ever sold in one hour. +SELECT max(su) +FROM HourlyOrderTotals +WHERE productId = 10; + +# A stream; will query the Orders stream. +# Returns every hour in which at least one product #10 was sold. +SELECT STREAM rowtime +FROM HourlyOrderTotals +WHERE productId = 10; +{% endhighlight %} + +This approach is not limited to views and sub-queries. +Following the approach set out in CQL [<a href="#ref1">1</a>], every query +in streaming SQL is defined as a relational query and converted to a stream +using the `STREAM` keyword in the top-most `SELECT`. + +If the `STREAM` keyword is present in sub-queries or view definitions, it has no +effect. + +At query preparation time, Calcite figures out whether the relations referenced +in the query can be converted to streams or historical relations. + +Sometimes a stream makes available some of its history (say the last 24 hours of +data in an Apache Kafka [<a href="#ref2">2</a>] topic) +but not all. At run time, Calcite figures out whether there is sufficient +history to run the query, and if not, gives an error. + +# The "pie chart" problem: Relational queries on streams + +One particular case where you need to convert a stream to a relation +occurs in what I call the "pie chart problem". Imagine that you need to +write a web page with a chart, like the following, that summarizes the +number of orders for each product over the last hour. + + + +But the `Orders` stream only contains a few records, not an hour's summary. 
+We need to run a relational query on the history of the stream: + +{% highlight sql %} +SELECT productId, count(*) +FROM Orders +WHERE rowtime BETWEEN current_timestamp - INTERVAL '1' HOUR + AND current_timestamp; +{% endhighlight %} + +If the history of the `Orders` stream is being spooled to the `Orders` table, +we can answer the query, albeit at a high cost. Better, if we can tell the +system to materialize one hour summary into a table, +maintain it continuously as the stream flows, +and automatically rewrite queries to use the table. + +# Sorting + +The story for `ORDER BY` is similar to `GROUP BY`. +The syntax looks like regular SQL, but Calcite must be sure that it can deliver +timely results. It therefore requires a monotonic expression on the leading edge +of your `ORDER BY` key. + +{% highlight sql %} +SELECT STREAM CEIL(rowtime TO hour) AS rowtime, productId, orderId, units +FROM Orders +ORDER BY CEIL(rowtime TO hour) ASC, units DESC; + + rowtime | productId | orderId | units +----------+-----------+---------+------- + 10:00:00 | 30 | 8 | 20 + 10:00:00 | 30 | 5 | 4 + 10:00:00 | 20 | 7 | 2 + 10:00:00 | 10 | 6 | 1 + 11:00:00 | 40 | 11 | 12 + 11:00:00 | 10 | 9 | 6 + 11:00:00 | 10 | 12 | 4 + 11:00:00 | 10 | 10 | 1 +{% endhighlight %} + +Most queries will return results in the order that they were inserted, +because the engine is using streaming algorithms, but you should not rely on it. 
+For example, consider this: + +{% highlight sql %} +SELECT STREAM * +FROM Orders +WHERE productId = 10 +UNION ALL +SELECT STREAM * +FROM Orders +WHERE productId = 30; + + rowtime | productId | orderId | units +----------+-----------+---------+------- + 10:17:05 | 10 | 6 | 1 + 10:17:00 | 30 | 5 | 4 + 10:18:07 | 30 | 8 | 20 + 11:02:00 | 10 | 9 | 6 + 11:04:00 | 10 | 10 | 1 + 11:24:11 | 10 | 12 | 4 +{% endhighlight %} + +The rows with `productId` = 30 are apparently out of order, probably because +the `Orders` stream was partitioned on `productId` and the partitioned streams +sent their data at different times. + +If you require a particular ordering, add an explicit `ORDER BY`: + +{% highlight sql %} +SELECT STREAM * +FROM Orders +WHERE productId = 10 +UNION ALL +SELECT STREAM * +FROM Orders +WHERE productId = 30 +ORDER BY rowtime; + + rowtime | productId | orderId | units +----------+-----------+---------+------- + 10:17:00 | 30 | 5 | 4 + 10:17:05 | 10 | 6 | 1 + 10:18:07 | 30 | 8 | 20 + 11:02:00 | 10 | 9 | 6 + 11:04:00 | 10 | 10 | 1 + 11:24:11 | 10 | 12 | 4 +{% endhighlight %} + +Calcite will probably implement the `UNION ALL` by merging using `rowtime`, +which is only slightly less efficient. + +You only need to add an `ORDER BY` to the outermost query. If you need to, +say, perform `GROUP BY` after a `UNION ALL`, Calcite will add an `ORDER BY` +implicitly, in order to make the GROUP BY algorithm possible. + +# Table constructor + +The `VALUES` clause creates an inline table with a given set of rows. + +Streaming is disallowed. The set of rows never changes, and therefore a stream +would never return any rows. + +{% highlight sql %} +> SELECT STREAM * FROM (VALUES (1, 'abc')); + +ERROR: Cannot stream VALUES +{% endhighlight %} + +# Sliding windows + +Standard SQL features so-called "analytic functions" that can be used in the +`SELECT` clause. Unlike `GROUP BY`, these do not collapse records. For each +record that goes in, one record comes out. 
But the aggregate function is based +on a window of many rows. + +Let's look at an example. + +{% highlight sql %} +SELECT STREAM rowtime, + productId, + units, + SUM(units) OVER (ORDER BY rowtime RANGE INTERVAL '1' HOUR PRECEDING) unitsLastHour +FROM Orders; +{% endhighlight %} + +The feature packs a lot of power with little effort. You can have multiple +functions in the `SELECT` clause, based on multiple window specifications. + +The following example returns orders whose average order size over the last +10 minutes is greater than the average order size for the last week. + +{% highlight sql %} +SELECT STREAM * +FROM ( + SELECT STREAM rowtime, + productId, + units, + AVG(units) OVER product (RANGE INTERVAL '10' MINUTE PRECEDING) AS m10, + AVG(units) OVER product (RANGE INTERVAL '7' DAY PRECEDING) AS d7 + FROM Orders + WINDOW product AS ( + ORDER BY rowtime + PARTITION BY productId)) +WHERE m10 > d7; +{% endhighlight %} + +For conciseness, here we use a syntax where you partially define a window +using a `WINDOW` clause and then refine the window in each `OVER` clause. +You could also define all windows in the `WINDOW` clause, or all windows inline, +if you wish. + +But the real power goes beyond syntax. Behind the scenes, this query is +maintaining two tables, and adding and removing values from sub-totals using +with FIFO queues. But you can access those tables without introducing a join +into the query. + +Some other features of the windowed aggregation syntax: + +* You can define windows based on row count. +* The window can reference rows that have not yet arrived. + (The stream will wait until they have arrived). +* You can compute order-dependent functions such as `RANK` and median. + +# Cascading windows + +What if we want a query that returns a result for every record, like a +sliding window, but resets totals on a fixed time period, like a +tumbling window? Such a pattern is called a *cascading window*. 
Here
+is an example:
+
+{% highlight sql %}
+SELECT STREAM rowtime,
+  productId,
+  units,
+  SUM(units) OVER (PARTITION BY FLOOR(rowtime TO HOUR)) AS unitsSinceTopOfHour
+FROM Orders;
+{% endhighlight %}
+
+It looks similar to a sliding window query, but the monotonic
+expression occurs within the `PARTITION BY` clause of the window. As
+the rowtime moves from 10:59:59 to 11:00:00,
+`FLOOR(rowtime TO HOUR)` changes from 10:00:00 to 11:00:00,
+and therefore a new partition starts.
+The first row to arrive in the new hour will start a
+new total; the second row will have a total that consists of two rows,
+and so on.
+
+Calcite knows that the old partition will never be used again, so
+removes all sub-totals for that partition from its internal storage.
+
+Analytic functions that use cascading and sliding windows can be
+combined in the same query.
+
+# Joining streams to tables
+
+There are two kinds of join where streams are concerned: stream-to-table
+join and stream-to-stream join.
+
+A stream-to-table join is straightforward if the contents of the table
+are not changing. This query enriches a stream of orders with
+each product's list price:
+
+{% highlight sql %}
+SELECT STREAM o.rowtime, o.productId, o.orderId, o.units,
+  p.name, p.unitPrice
+FROM Orders AS o
+JOIN Products AS p
+  ON o.productId = p.productId;
+
+  rowtime | productId | orderId | units | name   | unitPrice
+----------+-----------+---------+-------+ -------+-----------
+ 10:17:00 |        30 |       5 |     4 | Cheese |        17
+ 10:17:05 |        10 |       6 |     1 | Beer   |      0.25
+ 10:18:05 |        20 |       7 |     2 | Wine   |         6
+ 10:18:07 |        30 |       8 |    20 | Cheese |        17
+ 11:02:00 |        10 |       9 |     6 | Beer   |      0.25
+ 11:04:00 |        10 |      10 |     1 | Beer   |      0.25
+ 11:09:30 |        40 |      11 |    12 | Bread  |       100
+ 11:24:11 |        10 |      12 |     4 | Beer   |      0.25
+{% endhighlight %}
+
+What should happen if the table is changing? For example,
+suppose the unit price of product 10 is increased to 0.35 at 11:00. 
+Orders placed before 11:00 should have the old price, and orders +placed after 11:00 should reflect the new price. + +One way to implement this is to have a table that keeps every version +with a start and end effective date, `ProductVersions` in the following +example: + +{% highlight sql %} +SELECT STREAM * +FROM Orders AS o +JOIN ProductVersions AS p + ON o.productId = p.productId + AND o.rowtime BETWEEN p.startDate AND p.endDate + + rowtime | productId | orderId | units | productId1 | name | unitPrice +----------+-----------+---------+-------+ -----------+--------+----------- + 10:17:00 | 30 | 5 | 4 | 30 | Cheese | 17 + 10:17:05 | 10 | 6 | 1 | 10 | Beer | 0.25 + 10:18:05 | 20 | 7 | 2 | 20 | Wine | 6 + 10:18:07 | 30 | 8 | 20 | 30 | Cheese | 17 + 11:02:00 | 10 | 9 | 6 | 10 | Beer | 0.35 + 11:04:00 | 10 | 10 | 1 | 10 | Beer | 0.35 + 11:09:30 | 40 | 11 | 12 | 40 | Bread | 100 + 11:24:11 | 10 | 12 | 4 | 10 | Beer | 0.35 +{% endhighlight %} + +The other way to implement this is to use a database with temporal support +(the ability to find the contents of the database as it was at any moment +in the past), and the system needs to know that the `rowtime` column of +the `Orders` stream corresponds to the transaction timestamp of the +`Products` table. + +For many applications, it is not worth the cost and effort of temporal +support or a versioned table. It is acceptable to the application that +the query gives different results when replayed: in this example, on replay, +all orders of product 10 are assigned the later unit price, 0.35. + +# Joining streams to streams + +It makes sense to join two streams if the join condition somehow forces +them to remain a finite distance from one another. 
In the following query,
+the ship date is within one hour of the order date:
+
+{% highlight sql %}
+SELECT STREAM o.rowtime, o.productId, o.orderId, s.rowtime AS shipTime
+FROM Orders AS o
+JOIN Shipments AS s
+  ON o.orderId = s.orderId
+  AND s.rowtime BETWEEN o.rowtime AND o.rowtime + INTERVAL '1' HOUR;
+
+  rowtime | productId | orderId | shipTime
+----------+-----------+---------+----------
+ 10:17:00 |        30 |       5 | 10:55:00
+ 10:17:05 |        10 |       6 | 10:20:00
+ 11:02:00 |        10 |       9 | 11:58:00
+ 11:24:11 |        10 |      12 | 11:44:00
+{% endhighlight %}
+
+Note that quite a few orders do not appear, because they did not ship
+within an hour. By the time the system receives order 12, timestamped 11:24:11,
+it has already removed orders up to and including order 8, timestamped 10:18:07,
+from its hash table.
+
+As you can see, the "lock step", tying together monotonic or quasi-monotonic
+columns of the two streams, is necessary for the system to make progress.
+It will refuse to execute a query if it cannot deduce a lock step.
+
+# DML
+
+It's not only queries that make sense against streams;
+it also makes sense to run DML statements (`INSERT`, `UPDATE`, `DELETE`,
+and also their rarer cousins `UPSERT` and `REPLACE`) against streams.
+
+DML is useful because it allows you to materialize streams
+or tables based on streams,
+and therefore save effort when values are used often.
+
+Consider how streaming applications often consist of pipelines of queries,
+each query transforming input stream(s) to output stream(s).
+The component of a pipeline can be a view:
+
+{% highlight sql %}
+CREATE VIEW LargeOrders AS
+SELECT STREAM * FROM Orders WHERE units > 1000;
+{% endhighlight %}
+
+or a standing `INSERT` statement:
+
+{% highlight sql %}
+INSERT INTO LargeOrders
+SELECT STREAM * FROM Orders WHERE units > 1000;
+{% endhighlight %}
+
+These look similar, and in both cases the next step(s) in the pipeline
+can read from `LargeOrders` without worrying how it was populated.
+There is a difference in efficiency: the `INSERT` statement does the +same work no matter how many consumers there are; the view does work +proportional to the number of consumers, and in particular, does no +work if there are no consumers. + +Other forms of DML make sense for streams. For example, the following +standing `UPSERT` statement maintains a table that materializes a summary +of the last hour of orders: + +{% highlight sql %} +UPSERT INTO OrdersSummary +SELECT STREAM productId, + COUNT(*) OVER lastHour AS c +FROM Orders +WINDOW lastHour AS ( + PARTITION BY productId + ORDER BY rowtime + RANGE INTERVAL '1' HOUR PRECEDING) +{% endhighlight %} + +# Punctuation + +Punctuation[<a href="#ref5">5</a>] allows a stream query to make progress +even if there are not enough values in a monotonic key to push the results out. + +(I prefer the term "rowtime bounds", +and watermarks[<a href="#ref6">6</a>] are a related concept, +but for these purposes, punctuation will suffice.) + +If a stream has punctuation enabled then it may not be sorted but is +nevertheless sortable. So, for the purposes of semantics, it is sufficient +to work in terms of sorted streams. + +By the way, an out-of-order stream is also sortable if it is *t-sorted* +(i.e. every record is guaranteed to arrive within *t* seconds of its +timestamp) or *k-sorted* (i.e. every record is guaranteed to be no more +than *k* positions out of order). So queries on these streams can be +planned similarly to queries on streams with punctuation. + +And, we often want to aggregate over attributes that are not +time-based but are nevertheless monotonic. "The number of times a team +has shifted between winning-state and losing-state" is one such +monotonic attribute. The system needs to figure out for itself that it +is safe to aggregate over such an attribute; punctuation does not add +any extra information. + +I have in mind some metadata (cost metrics) for the planner: + +1. 
Is this stream sorted on a given attribute (or attributes)? +2. Is it possible to sort the stream on a given attribute? (For finite + relations, the answer is always "yes"; for streams it depends on the + existence of punctuation, or linkage between the attributes and the + sort key.) +3. What latency do we need to introduce in order to perform that sort? +4. What is the cost (in CPU, memory etc.) of performing that sort? + +We already have (1), in [BuiltInMetadata.Collation]({{ site.apiRoot }}/org/apache/calcite/rel/metadata/BuiltInMetadata.Collation.html). +For (2), the answer is always "true" for finite relations. +But we'll need to implement (2), (3) and (4) for streams. + +# State of the stream + +Not all concepts in this article have been implemented in Calcite. +And others may be implemented in Calcite but not in a particular adapter +such as SamzaSQL [<a href="#ref3">3</a>] [<a href="#ref4">4</a>]. + +## Implemented + +* Streaming `SELECT`, `WHERE`, `GROUP BY`, `HAVING`, `UNION ALL`, `ORDER BY` +* `FLOOR` and `CEIL` functions +* Monotonicity +* Streaming `VALUES` is disallowed + +## Not implemented + +The following features are presented in this document as if Calcite +supports them, but in fact it does not (yet). Full support means +that the reference implementation supports the feature (including +negative cases) and the TCK tests it. + +* Stream-to-stream `JOIN` +* Stream-to-table `JOIN` +* Stream on view +* Streaming `UNION ALL` with `ORDER BY` (merge) +* Relational query on stream +* Streaming windowed aggregation (sliding and cascading windows) +* Check that `STREAM` in sub-queries and views is ignored +* Check that streaming `ORDER BY` cannot have `OFFSET` or `LIMIT` +* Limited history; at run time, check that there is sufficient history + to run the query. 
+* [Quasi-monotonicity](https://issues.apache.org/jira/browse/CALCITE-1096) +* `HOP` and `TUMBLE` (and auxiliary `HOP_START`, `HOP_END`, + `TUMBLE_START`, `TUMBLE_END`) functions + +## To do in this document + +* Re-visit whether you can stream `VALUES` +* `OVER` clause to define window on stream +* Consider whether to allow `CUBE` and `ROLLUP` in streaming queries, + with an understanding that some levels of aggregation will never complete + (because they have no monotonic expressions) and thus will never be emitted. +* Fix the `UPSERT` example to remove records for products that have not + occurred in the last hour. +* DML that outputs to multiple streams; perhaps an extension to the standard + `REPLACE` statement. + +# Functions + +The following functions are not present in standard SQL +but are defined in streaming SQL. + +Scalar functions: + +* `FLOOR(dateTime TO intervalType)` rounds a date, time or timestamp value + down to a given interval type +* `CEIL(dateTime TO intervalType)` rounds a date, time or timestamp value + up to a given interval type + +Partitioning functions: + +* `HOP(t, emit, retain)` returns a collection of group keys for a row + to be part of a hopping window +* `HOP(t, emit, retain, align)` returns a collection of group keys for a row + to be part of a hopping window with a given alignment +* `TUMBLE(t, emit)` returns a group key for a row + to be part of a tumbling window +* `TUMBLE(t, emit, align)` returns a group key for a row + to be part of a tumbling window with a given alignment + +`TUMBLE(t, e)` is equivalent to `TUMBLE(t, e, TIME '00:00:00')`. + +`TUMBLE(t, e, a)` is equivalent to `HOP(t, e, e, a)`. + +`HOP(t, e, r)` is equivalent to `HOP(t, e, r, TIME '00:00:00')`. + +# References + +* [<a name="ref1">1</a>] + <a href="http://ilpubs.stanford.edu:8090/758/">Arvind Arasu, Shivnath Babu, + and Jennifer Widom (2003) The CQL Continuous Query + Language: Semantic Foundations and Query Execution</a>. 
+* [<a name="ref2">2</a>] + <a href="http://kafka.apache.org/documentation.html">Apache Kafka</a>. +* [<a name="ref3">3</a>] <a href="http://samza.apache.org">Apache Samza</a>. +* [<a name="ref4">4</a>] <a href="https://github.com/milinda/samza-sql">SamzaSQL</a>. +* [<a name="ref5">5</a>] + <a href="http://www.whitworth.edu/academic/department/mathcomputerscience/faculty/tuckerpeter/pdf/117896_final.pdf">Peter + A. Tucker, David Maier, Tim Sheard, and Leonidas Fegaras (2003) Exploiting + Punctuation Semantics in Continuous Data Streams</a>. +* [<a name="ref6">6</a>] + <a href="http://research.google.com/pubs/pub41378.html">Tyler Akidau, + Alex Balikov, Kaya Bekiroglu, Slava Chernyak, Josh Haberman, Reuven Lax, + Sam McVeety, Daniel Mills, Paul Nordstrom, and Sam Whittle (2013) + MillWheel: Fault-Tolerant Stream Processing at Internet Scale</a>. http://git-wip-us.apache.org/repos/asf/calcite/blob/5cee486f/avatica/site/_docs/testapi.md ---------------------------------------------------------------------- diff --git a/avatica/site/_docs/testapi.md b/avatica/site/_docs/testapi.md new file mode 100644 index 0000000..661f374 --- /dev/null +++ b/avatica/site/_docs/testapi.md @@ -0,0 +1,28 @@ +--- +title: Test API +layout: external +external_url: /testapidocs +--- +{% comment %} +Ideally, we want to use {{ site.apiRoot }} instead of hardcoding +the above external_url value, but I don't believe there's a way to do that +{% endcomment %} + +<!-- +{% comment %} +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to you under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. 
You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +{% endcomment %} +-->
