This is an automated email from the ASF dual-hosted git repository.
sarutak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 6d0dacba1958 [SPARK-55063][SQL][DOCS] Reorganize built-in functions documentation by category
6d0dacba1958 is described below
commit 6d0dacba195823752acc47d06a3abd79502f527a
Author: Kent Yao <[email protected]>
AuthorDate: Tue Jan 20 01:29:41 2026 +0900
[SPARK-55063][SQL][DOCS] Reorganize built-in functions documentation by category
### What changes were proposed in this pull request?
This PR enhances the SQL function documentation generation by reorganizing
built-in functions into categories for better navigation and discoverability.
**Before:** All 300+ functions listed in a single monolithic markdown file
**After:** Functions organized by category, with dedicated pages and an overview index
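At its core, the generator now groups the function metadata by each function's group key and emits one page per group. A minimal stand-alone sketch of that step (mirroring the `_list_grouped_function_infos` helper in the diff below; `FuncInfo` here is a simplified stand-in for Spark's `ExpressionInfo`):

```python
# Minimal sketch of the grouping step; FuncInfo stands in for ExpressionInfo.
import itertools
from collections import namedtuple

FuncInfo = namedtuple("FuncInfo", "name group")

infos = [
    FuncInfo("avg", "agg_funcs"),
    FuncInfo("abs", "math_funcs"),
    FuncInfo("count", "agg_funcs"),
    FuncInfo("concat", "string_funcs"),
]

# Sort by (group, name) so groupby sees one contiguous run per category,
# then materialize each run into the list backing a category page.
grouped = itertools.groupby(
    sorted(infos, key=lambda x: (x.group or "", x.name)),
    key=lambda x: x.group,
)
for group, funcs in grouped:
    print(group, "->", [f.name for f in funcs])
# agg_funcs -> ['avg', 'count']
# math_funcs -> ['abs']
# string_funcs -> ['concat']
```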
### Why are the changes needed?
The current documentation lists all built-in functions in a single, very long markdown file, which makes it difficult to:
- Find specific functions quickly
- Browse functions by category
- Maintain as new functions are added
- Navigate on mobile devices
This categorization improves the user experience by organizing functions logically and making the documentation more maintainable.
### Does this PR introduce any user-facing change?
Yes, the documentation web navigation experience changes.
### How was this patch tested?
<img width="2194" height="1326" alt="image" src="https://github.com/user-attachments/assets/e0ef7601-a1ee-4344-8630-f1f6114f2b72" />
### Was this patch authored or co-authored using generative AI tooling?
Yes, GitHub Copilot was used to assist with code development.
Closes #53829 from yaooqinn/SPARK-55063.
Authored-by: Kent Yao <[email protected]>
Signed-off-by: Kousuke Saruta <[email protected]>
---
.gitignore | 1 +
sql/gen-sql-api-docs.py | 309 ++++++++++++++++++++++++++++++++++++++++++------
sql/mkdocs.yml | 22 ----
3 files changed, 273 insertions(+), 59 deletions(-)
diff --git a/.gitignore b/.gitignore
index fd14d295a7e5..a5be28ef948c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,6 +58,7 @@ docs/_site/
docs/api
docs/.local_ruby_bundle
sql/docs
+sql/mkdocs.yml
sql/site
lib_managed/
lint-r-report.log
diff --git a/sql/gen-sql-api-docs.py b/sql/gen-sql-api-docs.py
index 3d19da01b393..f5ed47f6ec7e 100644
--- a/sql/gen-sql-api-docs.py
+++ b/sql/gen-sql-api-docs.py
@@ -15,14 +15,107 @@
# limitations under the License.
#
+import itertools
import os
+import re
from collections import namedtuple
from pyspark.java_gateway import launch_gateway
ExpressionInfo = namedtuple(
- "ExpressionInfo", "className name usage arguments examples note since
deprecated")
+ "ExpressionInfo", "className name usage arguments examples note since
deprecated group")
+
+
+def _make_anchor(name):
+ """
+ Convert function name to a valid HTML anchor.
+ Special characters are converted to descriptive names.
+
+ Parameters:
+ name (str): The function name.
+
+ Returns:
+ str: A valid HTML anchor string.
+ """
+ # Map special characters to descriptive names
+ special_chars = {
+ '!': 'not',
+ '!=': 'notequal',
+ '<>': 'notequal2',
+ '<': 'lt',
+ '<=': 'lte',
+ '<=>': 'nullsafeequal',
+ '=': 'eq',
+ '==': 'equal',
+ '>': 'gt',
+ '>=': 'gte',
+ '&': 'bitand',
+ '|': 'bitor',
+ '^': 'bitxor',
+ '~': 'bitnot',
+ '<<': 'shiftleft',
+ '>>': 'shiftright',
+ '>>>': 'shiftrightunsigned',
+ '+': 'plus',
+ '-': 'minus',
+ '*': 'multiply',
+ '/': 'divide',
+ '%': 'mod',
+ '||': 'concat',
+ }
+
+ if name in special_chars:
+ return special_chars[name]
+
+ # For regular names, convert to lowercase and replace spaces with hyphens
+ # Remove any remaining special characters
+ anchor = name.lower().replace(" ", "-")
+ anchor = re.sub(r'[^a-z0-9_-]', '', anchor)
+ return anchor
+
+
+def _get_display_name(group):
+ """
+ Convert group name to display name.
+
+ Parameters:
+ group (str): The group name (e.g., "agg_funcs", "window_funcs").
+
+ Returns:
+ str: The display name (e.g., "Agg Functions", "Window Functions").
+ """
+ if group is None or group == "":
+ return "Misc Functions"
+ # Replace _funcs suffix, replace underscores with spaces, and title case
+ name = group.replace("_funcs", "").replace("_", " ").title()
+ return "%s Functions" % name
+
+
+def _get_file_name(group):
+ """
+ Convert group name to file name.
+
+ Parameters:
+ group (str): The group name (e.g., "agg_funcs", "window_funcs", "operator").
+
+ Returns:
+ str: The file name (e.g., "agg-functions", "window-functions", "operator-functions").
+ """
+ if group is None or group == "":
+ return "misc-functions"
+ # Replace _funcs with -functions, replace underscores with hyphens
+ file_name = group.replace("_funcs", "-functions").replace("_", "-")
+ # If the group doesn't end with _funcs, append -functions
+ if not group.endswith("_funcs") and not file_name.endswith("-functions"):
+ file_name = file_name + "-functions"
+ return file_name
+
+
+# Groups that should be merged into other groups
+GROUP_MERGES = {
+ "lambda_funcs": "collection_funcs", # SPARK-45232
+}
_virtual_operator_infos = [
ExpressionInfo(
@@ -46,7 +139,8 @@ _virtual_operator_infos = [
" NULL",
note="",
since="1.0.0",
- deprecated=""),
+ deprecated="",
+ group="predicate_funcs"),
ExpressionInfo(
className="",
name="<>",
@@ -68,7 +162,8 @@ _virtual_operator_infos = [
" NULL",
note="",
since="1.0.0",
- deprecated=""),
+ deprecated="",
+ group="predicate_funcs"),
ExpressionInfo(
className="",
name="case",
@@ -95,7 +190,8 @@ _virtual_operator_infos = [
" NULL",
note="",
since="1.0.1",
- deprecated=""),
+ deprecated="",
+ group="conditional_funcs"),
ExpressionInfo(
className="",
name="||",
@@ -108,7 +204,8 @@ _virtual_operator_infos = [
" [1,2,3,4,5,6]",
note="\n || for arrays is available since 2.4.0.\n",
since="2.3.0",
- deprecated="")
+ deprecated="",
+ group="string_funcs")
]
@@ -119,11 +216,14 @@ def _list_function_infos(jvm):
"""
jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos()
- infos = _virtual_operator_infos
+ infos = list(_virtual_operator_infos) # Make a copy
for jinfo in jinfos:
name = jinfo.getName()
usage = jinfo.getUsage()
usage = usage.replace("_FUNC_", name) if usage is not None else usage
+ # Get the group and apply any merges
+ group = jinfo.getGroup()
+ group = GROUP_MERGES.get(group, group)
infos.append(ExpressionInfo(
className=jinfo.getClassName(),
name=name,
@@ -132,10 +232,25 @@ def _list_function_infos(jvm):
examples=jinfo.getExamples().replace("_FUNC_", name),
note=jinfo.getNote().replace("_FUNC_", name),
since=jinfo.getSince(),
- deprecated=jinfo.getDeprecated()))
+ deprecated=jinfo.getDeprecated(),
+ group=group))
return sorted(infos, key=lambda i: i.name)
+def _list_grouped_function_infos(jvm):
+ """
+ Returns a list of function information grouped by category.
+ Each item is a tuple of (group_key, list_of_infos).
+ """
+ infos = _list_function_infos(jvm)
+ # Group by category
+ grouped = itertools.groupby(
+ sorted(infos, key=lambda x: (x.group or "", x.name)),
+ key=lambda x: x.group
+ )
+ return [(k, list(g)) for k, g in grouped]
+
+
def _make_pretty_usage(usage):
"""
Makes the usage description pretty and returns a formatted string if `usage`
@@ -247,12 +362,15 @@ def _make_pretty_deprecated(deprecated):
return "**Deprecated:**\n%s\n" % deprecated
-def generate_sql_api_markdown(jvm, path):
+def generate_sql_api_markdown(jvm, docs_dir):
"""
- Generates a markdown file after listing the function information. The output file
- is created in `path`.
+ Generates markdown files after listing the function information.
+ Creates one file per category plus an index file.
+ Also generates mkdocs.yml with auto-generated navigation.
+
+ Expected output for each category file:
+ # Category Name
- Expected output:
### NAME
USAGE
@@ -281,35 +399,152 @@ def generate_sql_api_markdown(jvm, path):
"""
- with open(path, 'w') as mdfile:
+ def _write_function_entry(mdfile, info):
+ """Write a single function entry to the markdown file."""
+ name = info.name
+ anchor = _make_anchor(name)
+ usage = _make_pretty_usage(info.usage)
+ arguments = _make_pretty_arguments(info.arguments)
+ examples = _make_pretty_examples(info.examples)
+ note = _make_pretty_note(info.note)
+ since = info.since
+ deprecated = _make_pretty_deprecated(info.deprecated)
+
+ # Use explicit anchor for special characters
+ mdfile.write('<a name="%s"></a>\n\n' % anchor)
+ mdfile.write("### %s\n\n" % name)
+ if usage is not None:
+ mdfile.write("%s\n\n" % usage.strip())
+ if arguments is not None:
+ mdfile.write(arguments)
+ if examples is not None:
+ mdfile.write(examples)
+ if note is not None:
+ mdfile.write(note)
+ if since is not None and since != "":
+ mdfile.write("**Since:** %s\n\n" % since.strip())
+ if deprecated is not None:
+ mdfile.write(deprecated)
+ mdfile.write("<br/>\n\n")
+
+ # Group functions by category
+ grouped_infos = _list_grouped_function_infos(jvm)
+
+ # Track categories that have functions for the index
+ categories_with_functions = []
+
+ # Generate a separate markdown file for each category
+ for group_key, infos in grouped_infos:
+ display_name = _get_display_name(group_key)
+ file_name = _get_file_name(group_key)
+ categories_with_functions.append((group_key, display_name, file_name, len(infos)))
+
+ category_path = os.path.join(docs_dir, "%s.md" % file_name)
+ with open(category_path, 'w') as mdfile:
+ mdfile.write("# %s\n\n" % display_name)
+ mdfile.write("This page lists all %s available in Spark SQL.\n\n"
% display_name.lower())
+ mdfile.write("---\n\n")
+ for info in infos:
+ _write_function_entry(mdfile, info)
+
+ # Generate the index file with links to all categories
+ index_path = os.path.join(docs_dir, "index.md")
+ with open(index_path, 'w') as mdfile:
mdfile.write("# Built-in Functions\n\n")
- for info in _list_function_infos(jvm):
- name = info.name
- usage = _make_pretty_usage(info.usage)
- arguments = _make_pretty_arguments(info.arguments)
- examples = _make_pretty_examples(info.examples)
- note = _make_pretty_note(info.note)
- since = info.since
- deprecated = _make_pretty_deprecated(info.deprecated)
-
- mdfile.write("### %s\n\n" % name)
- if usage is not None:
- mdfile.write("%s\n\n" % usage.strip())
- if arguments is not None:
- mdfile.write(arguments)
- if examples is not None:
- mdfile.write(examples)
- if note is not None:
- mdfile.write(note)
- if since is not None and since != "":
- mdfile.write("**Since:** %s\n\n" % since.strip())
- if deprecated is not None:
- mdfile.write(deprecated)
- mdfile.write("<br/>\n\n")
+ # Inline CSS for responsive grid layout
+ css = """<style>
+.func-grid {
+ display: grid;
+ grid-template-columns: repeat(4, 1fr);
+ gap: 0.5rem;
+ margin-bottom: 1.5rem;
+}
+.func-grid a {
+ padding: 0.25rem 0;
+ white-space: nowrap;
+ overflow: hidden;
+ text-overflow: ellipsis;
+}
+@media (max-width: 1200px) {
+ .func-grid { grid-template-columns: repeat(3, 1fr); }
+}
+@media (max-width: 768px) {
+ .func-grid { grid-template-columns: repeat(2, 1fr); }
+}
+@media (max-width: 480px) {
+ .func-grid { grid-template-columns: 1fr; }
+}
+</style>
+
+"""
+ mdfile.write(css)
+ mdfile.write("Spark SQL provides a comprehensive set of built-in
functions for data ")
+ mdfile.write("manipulation and analysis. Functions are organized into
the following ")
+ mdfile.write("categories:\n\n")
+
+ # Sort categories by display name for consistent ordering
+ sorted_categories = sorted(categories_with_functions, key=lambda x: x[1])
+
+ # Create dictionary for efficient lookup
+ grouped_dict = {k: infos for k, infos in grouped_infos}
+
+ # Generate detailed TOC for each category with all function names
+ for group_key, display_name, file_name, count in sorted_categories:
+ mdfile.write("## %s (%d)\n\n" % (display_name, count))
+ # Get the functions for this category
+ category_infos = grouped_dict.get(group_key, [])
+ # Write function links in a responsive grid layout
+ mdfile.write('<div class="func-grid">\n')
+ for info in category_infos:
+ anchor = _make_anchor(info.name)
+ mdfile.write('<a href="%s/#%s">%s</a>\n' % (file_name, anchor, info.name))
+ mdfile.write('</div>\n\n')
+
+ # Auto-generate mkdocs.yml with navigation
+ _generate_mkdocs_yml(docs_dir, categories_with_functions)
+
+
+def _generate_mkdocs_yml(docs_dir, categories_with_functions):
+ """
+ Generate mkdocs.yml with auto-generated navigation based on function categories.
+
+ Parameters:
+ docs_dir (str): The docs directory path.
+ categories_with_functions (list): List of tuples (group_key, display_name, file_name, count).
+ """
+ # mkdocs.yml is in the parent directory of docs
+ mkdocs_path = os.path.join(os.path.dirname(docs_dir), "mkdocs.yml")
+
+ # Sort categories by display name for consistent ordering
+ sorted_categories = sorted(categories_with_functions, key=lambda x: x[1])
+
+ with open(mkdocs_path, 'w') as f:
+ f.write("# AUTO-GENERATED FILE - DO NOT EDIT MANUALLY\n")
+ f.write("# This file is generated by gen-sql-api-docs.py\n")
+ f.write("# Run 'sql/create-docs.sh' to regenerate\n")
+ f.write("\n")
+ f.write("site_name: Spark SQL, Built-in Functions\n")
+ f.write("theme:\n")
+ f.write(" name: readthedocs\n")
+ f.write(" navigation_depth: 3\n")
+ f.write(" collapse_navigation: true\n")
+ f.write("nav:\n")
+ f.write(" - 'Overview': 'index.md'\n")
+
+ for group_key, display_name, file_name, count in sorted_categories:
+ f.write(" - '%s': '%s.md'\n" % (display_name, file_name))
+
+ f.write("markdown_extensions:\n")
+ f.write(" - toc:\n")
+ f.write(" anchorlink: True\n")
+ f.write(" permalink: True\n")
+ f.write(" - tables\n")
if __name__ == "__main__":
jvm = launch_gateway().jvm
spark_root_dir = os.path.dirname(os.path.dirname(__file__))
- markdown_file_path = os.path.join(spark_root_dir, "sql/docs/index.md")
- generate_sql_api_markdown(jvm, markdown_file_path)
+ docs_dir = os.path.join(spark_root_dir, "sql/docs")
+ # Create docs directory if it doesn't exist
+ os.makedirs(docs_dir, exist_ok=True)
+ generate_sql_api_markdown(jvm, docs_dir)
diff --git a/sql/mkdocs.yml b/sql/mkdocs.yml
deleted file mode 100644
index 4463e11f17d1..000000000000
--- a/sql/mkdocs.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-site_name: Spark SQL, Built-in Functions
-theme: readthedocs
-nav:
- - 'Functions': 'index.md'
-markdown_extensions:
- - toc:
- anchorlink: True
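For a quick feel of the naming rules this patch introduces, here is a stand-alone sketch of how group keys map to page titles and file names (simplified from `_get_display_name` and `_get_file_name` in the diff above):

```python
# Simplified from _get_display_name/_get_file_name in sql/gen-sql-api-docs.py.
def display_name(group):
    if not group:
        return "Misc Functions"
    # "agg_funcs" -> "Agg Functions"
    return "%s Functions" % group.replace("_funcs", "").replace("_", " ").title()

def file_name(group):
    if not group:
        return "misc-functions"
    # "agg_funcs" -> "agg-functions"; bare keys get "-functions" appended
    name = group.replace("_funcs", "-functions").replace("_", "-")
    if not name.endswith("-functions"):
        name += "-functions"
    return name

for g in ["agg_funcs", "window_funcs", "operator", None]:
    print(g, "->", display_name(g), "/", "%s.md" % file_name(g))
# agg_funcs -> Agg Functions / agg-functions.md
# window_funcs -> Window Functions / window-functions.md
# operator -> Operator Functions / operator-functions.md
# None -> Misc Functions / misc-functions.md
```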