This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new fce331a4f7 Migrate Regex Functions from static docs (#12886)
fce331a4f7 is described below
commit fce331a4f7ca908ee7b4500d104fae9ec657f471
Author: Jonathan Chen <[email protected]>
AuthorDate: Mon Oct 14 13:20:21 2024 -0400
Migrate Regex Functions from static docs (#12886)
* regex migrate
* small fixes
* update docs
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/functions/src/regex/regexpmatch.rs | 49 ++++++++++-
datafusion/functions/src/regex/regexpreplace.rs | 52 +++++++++++-
docs/source/user-guide/sql/scalar_functions.md | 97 ----------------------
docs/source/user-guide/sql/scalar_functions_new.md | 82 ++++++++++++++++++
4 files changed, 178 insertions(+), 102 deletions(-)
diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs
index bfec97f92c..443e505332 100644
--- a/datafusion/functions/src/regex/regexpmatch.rs
+++ b/datafusion/functions/src/regex/regexpmatch.rs
@@ -26,10 +26,11 @@ use datafusion_common::{arrow_datafusion_err, plan_err};
use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
};
-use datafusion_expr::{ColumnarValue, TypeSignature};
+use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
+use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use std::any::Any;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
#[derive(Debug)]
pub struct RegexpMatchFunc {
@@ -106,7 +107,51 @@ impl ScalarUDFImpl for RegexpMatchFunc {
result.map(ColumnarValue::Array)
}
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_regexp_match_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_regexp_match_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_REGEX)
+ .with_description("Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.")
+ .with_syntax_example("regexp_match(str, regexp[, flags])")
+ .with_sql_example(r#"```sql
+ > select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+ +---------------------------------------------------------+
+ | regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+ +---------------------------------------------------------+
+ | [Köln] |
+ +---------------------------------------------------------+
+ SELECT regexp_match('aBc', '(b|d)', 'i');
+ +---------------------------------------------------+
+ | regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+ +---------------------------------------------------+
+ | [B] |
+ +---------------------------------------------------+
+```
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+"#)
+ .with_standard_argument("str", "String")
+ .with_argument("regexp","Regular expression to match against.
+ Can be a constant, column, or function.")
+ .with_argument("flags",
+ r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+ - **i**: case-insensitive: letters match both upper and lower case
+ - **m**: multi-line mode: ^ and $ match begin/end of line
+ - **s**: allow . to match \n
+ - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+ - **U**: swap the meaning of x* and x*?"#)
+ .build()
+ .unwrap()
+ })
}
+
fn regexp_match_func(args: &[ArrayRef]) -> Result<ArrayRef> {
match args[0].data_type() {
DataType::Utf8 => regexp_match::<i32>(args),
diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs
index bce8752af2..279e5c6ba9 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -32,14 +32,15 @@ use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
};
use datafusion_expr::function::Hint;
+use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
use datafusion_expr::ColumnarValue;
use datafusion_expr::TypeSignature;
-use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_expr::{Documentation, ScalarUDFImpl, Signature, Volatility};
use regex::Regex;
use std::any::Any;
use std::collections::HashMap;
-use std::sync::Arc;
-use std::sync::OnceLock;
+use std::sync::{Arc, OnceLock};
+
#[derive(Debug)]
pub struct RegexpReplaceFunc {
signature: Signature,
@@ -123,6 +124,51 @@ impl ScalarUDFImpl for RegexpReplaceFunc {
result.map(ColumnarValue::Array)
}
}
+
+ fn documentation(&self) -> Option<&Documentation> {
+ Some(get_regexp_replace_doc())
+ }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_regexp_replace_doc() -> &'static Documentation {
+ DOCUMENTATION.get_or_init(|| {
+ Documentation::builder()
+ .with_doc_section(DOC_SECTION_REGEX)
+ .with_description("Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).")
+ .with_syntax_example("regexp_replace(str, regexp, replacement[, flags])")
+ .with_sql_example(r#"```sql
+> select regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
++------------------------------------------------------------------------+
+| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
++------------------------------------------------------------------------+
+| fooXarYXazY |
++------------------------------------------------------------------------+
+SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
++-------------------------------------------------------------------+
+| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
++-------------------------------------------------------------------+
+| aAbBac |
++-------------------------------------------------------------------+
+```
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+"#)
+ .with_standard_argument("str", "String")
+ .with_argument("regexp","Regular expression to match against.
+ Can be a constant, column, or function.")
+ .with_standard_argument("replacement", "Replacement string")
+ .with_argument("flags",
+ r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+- **g**: (global) Search globally and don't return after the first match
+- **i**: case-insensitive: letters match both upper and lower case
+- **m**: multi-line mode: ^ and $ match begin/end of line
+- **s**: allow . to match \n
+- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+- **U**: swap the meaning of x* and x*?"#)
+ .build()
+ .unwrap()
+})
}
fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ArrayRef> {
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index 95762d9585..56145ec803 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -563,103 +563,6 @@ See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/
See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html)
-## Regular Expression Functions
-
-Apache DataFusion uses a [PCRE-like] regular expression [syntax]
-(minus support for several features including look-around and backreferences).
-The following regular expression functions are supported:
-
-- [regexp_match](#regexp_match)
-- [regexp_replace](#regexp_replace)
-
-[pcre-like]: https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions
-[syntax]: https://docs.rs/regex/latest/regex/#syntax
-
-### `regexp_match`
-
-Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.
-
-```
-regexp_match(str, regexp[, flags])
-```
-
-#### Arguments
-
-- **str**: String expression to operate on.
- Can be a constant, column, or function, and any combination of string operators.
-- **regexp**: Regular expression to match against.
- Can be a constant, column, or function.
-- **flags**: Optional regular expression flags that control the behavior of the
- regular expression. The following flags are supported:
- - **i**: case-insensitive: letters match both upper and lower case
- - **m**: multi-line mode: ^ and $ match begin/end of line
- - **s**: allow . to match \n
- - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- - **U**: swap the meaning of x* and x*?
-
-#### Example
-
-```sql
-select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
-+---------------------------------------------------------+
-| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
-+---------------------------------------------------------+
-| [Köln] |
-+---------------------------------------------------------+
-SELECT regexp_match('aBc', '(b|d)', 'i');
-+---------------------------------------------------+
-| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
-+---------------------------------------------------+
-| [B] |
-+---------------------------------------------------+
-```
-
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
-
-### `regexp_replace`
-
-Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).
-
-```
-regexp_replace(str, regexp, replacement[, flags])
-```
-
-#### Arguments
-
-- **str**: String expression to operate on.
- Can be a constant, column, or function, and any combination of string operators.
-- **regexp**: Regular expression to match against.
- Can be a constant, column, or function.
-- **replacement**: Replacement string expression.
- Can be a constant, column, or function, and any combination of string operators.
-- **flags**: Optional regular expression flags that control the behavior of the
- regular expression. The following flags are supported:
- - **g**: (global) Search globally and don't return after the first match
- - **i**: case-insensitive: letters match both upper and lower case
- - **m**: multi-line mode: ^ and $ match begin/end of line
- - **s**: allow . to match \n
- - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- - **U**: swap the meaning of x* and x*?
-
-#### Example
-
-```sql
-SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
-+------------------------------------------------------------------------+
-| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
-+------------------------------------------------------------------------+
-| fooXarYXazY |
-+------------------------------------------------------------------------+
-SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
-+-------------------------------------------------------------------+
-| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
-+-------------------------------------------------------------------+
-| aAbBac |
-+-------------------------------------------------------------------+
-```
-
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
-
### `position`
Returns the position of `substr` in `origstr` (counting from 1). If `substr` does
diff --git a/docs/source/user-guide/sql/scalar_functions_new.md b/docs/source/user-guide/sql/scalar_functions_new.md
index 96fbcaa110..7d0261da0a 100644
--- a/docs/source/user-guide/sql/scalar_functions_new.md
+++ b/docs/source/user-guide/sql/scalar_functions_new.md
@@ -1191,6 +1191,8 @@ regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax)
The following regular expression functions are supported:
- [regexp_like](#regexp_like)
+- [regexp_match](#regexp_match)
+- [regexp_replace](#regexp_replace)
### `regexp_like`
@@ -1230,6 +1232,86 @@ SELECT regexp_like('aBc', '(b|d)', 'i');
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+### `regexp_match`
+
+Returns a list of [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.
+
+```
+regexp_match(str, regexp[, flags])
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **regexp**: Regular expression to match against.
+ Can be a constant, column, or function.
+- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+ - **i**: case-insensitive: letters match both upper and lower case
+ - **m**: multi-line mode: ^ and $ match begin/end of line
+ - **s**: allow . to match \n
+ - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+ - **U**: swap the meaning of x* and x*?
+
+#### Example
+
+```sql
+ > select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+ +---------------------------------------------------------+
+ | regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+ +---------------------------------------------------------+
+ | [Köln] |
+ +---------------------------------------------------------+
+ SELECT regexp_match('aBc', '(b|d)', 'i');
+ +---------------------------------------------------+
+ | regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+ +---------------------------------------------------+
+ | [B] |
+ +---------------------------------------------------+
+```
+
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+
+### `regexp_replace`
+
+Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).
+
+```
+regexp_replace(str, regexp, replacement[, flags])
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **regexp**: Regular expression to match against.
+ Can be a constant, column, or function.
+- **replacement**: Replacement string expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+- **g**: (global) Search globally and don't return after the first match
+- **i**: case-insensitive: letters match both upper and lower case
+- **m**: multi-line mode: ^ and $ match begin/end of line
+- **s**: allow . to match \n
+- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+- **U**: swap the meaning of x* and x*?
+
+#### Example
+
+```sql
+> select regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
++------------------------------------------------------------------------+
+| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
++------------------------------------------------------------------------+
+| fooXarYXazY |
++------------------------------------------------------------------------+
+SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
++-------------------------------------------------------------------+
+| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
++-------------------------------------------------------------------+
+| aAbBac |
++-------------------------------------------------------------------+
+```
+
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+
## Time and Date Functions
- [to_date](#to_date)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]