Re: [PR] [SPARK-55063][SQL][DOCS] Reorganize built-in functions documentation by category [spark]

via GitHub Thu, 15 Jan 2026 23:12:29 -0800


Copilot commented on code in PR #53829:
URL: https://github.com/apache/spark/pull/53829#discussion_r2697272755



##########
sql/gen-sql-api-docs.py:
##########
@@ -281,35 +398,149 @@ def generate_sql_api_markdown(jvm, path):
 
     """
 
-    with open(path, 'w') as mdfile:
+    def _write_function_entry(mdfile, info):
+        """Write a single function entry to the markdown file."""
+        name = info.name
+        anchor = _make_anchor(name)
+        usage = _make_pretty_usage(info.usage)
+        arguments = _make_pretty_arguments(info.arguments)
+        examples = _make_pretty_examples(info.examples)
+        note = _make_pretty_note(info.note)
+        since = info.since
+        deprecated = _make_pretty_deprecated(info.deprecated)
+
+        # Use explicit anchor for special characters
+        mdfile.write('<a name="%s"></a>\n\n' % anchor)
+        mdfile.write("### %s\n\n" % name)
+        if usage is not None:
+            mdfile.write("%s\n\n" % usage.strip())
+        if arguments is not None:
+            mdfile.write(arguments)
+        if examples is not None:
+            mdfile.write(examples)
+        if note is not None:
+            mdfile.write(note)
+        if since is not None and since != "":
+            mdfile.write("**Since:** %s\n\n" % since.strip())
+        if deprecated is not None:
+            mdfile.write(deprecated)
+        mdfile.write("<br/>\n\n")
+
+    # Group functions by category
+    grouped_infos = _list_grouped_function_infos(jvm)
+
+    # Track categories that have functions for the index
+    categories_with_functions = []
+
+    # Generate a separate markdown file for each category
+    for group_key, infos in grouped_infos:
+        display_name = _get_display_name(group_key)
+        file_name = _get_file_name(group_key)
+        categories_with_functions.append((group_key, display_name, file_name, 
len(infos)))
+
+        category_path = os.path.join(docs_dir, "%s.md" % file_name)
+        with open(category_path, 'w') as mdfile:
+            mdfile.write("# %s\n\n" % display_name)
+            mdfile.write("This page lists all %s available in Spark SQL.\n\n" 
% display_name.lower())
+            mdfile.write("---\n\n")
+            for info in infos:
+                _write_function_entry(mdfile, info)
+
+    # Generate the index file with links to all categories
+    index_path = os.path.join(docs_dir, "index.md")
+    with open(index_path, 'w') as mdfile:
         mdfile.write("# Built-in Functions\n\n")
-        for info in _list_function_infos(jvm):
-            name = info.name
-            usage = _make_pretty_usage(info.usage)
-            arguments = _make_pretty_arguments(info.arguments)
-            examples = _make_pretty_examples(info.examples)
-            note = _make_pretty_note(info.note)
-            since = info.since
-            deprecated = _make_pretty_deprecated(info.deprecated)
-
-            mdfile.write("### %s\n\n" % name)
-            if usage is not None:
-                mdfile.write("%s\n\n" % usage.strip())
-            if arguments is not None:
-                mdfile.write(arguments)
-            if examples is not None:
-                mdfile.write(examples)
-            if note is not None:
-                mdfile.write(note)
-            if since is not None and since != "":
-                mdfile.write("**Since:** %s\n\n" % since.strip())
-            if deprecated is not None:
-                mdfile.write(deprecated)
-            mdfile.write("<br/>\n\n")
+        # Inline CSS for responsive grid layout
+        mdfile.write("<style>\n")
+        mdfile.write(".func-grid {\n")
+        mdfile.write("  display: grid;\n")
+        mdfile.write("  grid-template-columns: repeat(4, 1fr);\n")
+        mdfile.write("  gap: 0.5rem;\n")
+        mdfile.write("  margin-bottom: 1.5rem;\n")
+        mdfile.write("}\n")
+        mdfile.write(".func-grid a {\n")
+        mdfile.write("  padding: 0.25rem 0;\n")
+        mdfile.write("  white-space: nowrap;\n")
+        mdfile.write("  overflow: hidden;\n")
+        mdfile.write("  text-overflow: ellipsis;\n")
+        mdfile.write("}\n")
+        mdfile.write("@media (max-width: 1200px) {\n")
+        mdfile.write("  .func-grid { grid-template-columns: repeat(3, 1fr); 
}\n")
+        mdfile.write("}\n")
+        mdfile.write("@media (max-width: 768px) {\n")
+        mdfile.write("  .func-grid { grid-template-columns: repeat(2, 1fr); 
}\n")
+        mdfile.write("}\n")
+        mdfile.write("@media (max-width: 480px) {\n")
+        mdfile.write("  .func-grid { grid-template-columns: 1fr; }\n")
+        mdfile.write("}\n")
+        mdfile.write("</style>\n\n")
+        mdfile.write("Spark SQL provides a comprehensive set of built-in 
functions for data ")
+        mdfile.write("manipulation and analysis. Functions are organized into 
the following ")
+        mdfile.write("categories:\n\n")
+
+        # Sort categories by display name for consistent ordering
+        sorted_categories = sorted(categories_with_functions, key=lambda x: 
x[1])
+
+        # Generate detailed TOC for each category with all function names
+        for group_key, display_name, file_name, count in sorted_categories:
+            mdfile.write("## %s (%d)\n\n" % (display_name, count))
+            # Get the functions for this category
+            category_infos = next(
+                (infos for gk, infos in grouped_infos if gk == group_key),
+                []
+            )
+            # Write function links in a responsive grid layout
+            mdfile.write('<div class="func-grid">\n')
+            for info in category_infos:
+                anchor = _make_anchor(info.name)
+                mdfile.write('<a href="%s.md#%s">%s</a>\n' % (file_name, 
anchor, info.name))

Review Comment:
   Function names containing HTML special characters (like '<', '>', '&') are 
embedded directly in HTML anchor tags without escaping. For example, the 
operators '<>', '||', '<=>' defined in _virtual_operator_infos could render 
incorrectly or cause parsing issues. While MkDocs may handle this in pure 
Markdown, these are raw HTML tags where escaping is not automatic. Consider 
using html.escape(info.name) when embedding function names in HTML tags.



##########
sql/gen-sql-api-docs.py:
##########
@@ -281,35 +398,149 @@ def generate_sql_api_markdown(jvm, path):
 
     """
 
-    with open(path, 'w') as mdfile:
+    def _write_function_entry(mdfile, info):
+        """Write a single function entry to the markdown file."""
+        name = info.name
+        anchor = _make_anchor(name)
+        usage = _make_pretty_usage(info.usage)
+        arguments = _make_pretty_arguments(info.arguments)
+        examples = _make_pretty_examples(info.examples)
+        note = _make_pretty_note(info.note)
+        since = info.since
+        deprecated = _make_pretty_deprecated(info.deprecated)
+
+        # Use explicit anchor for special characters
+        mdfile.write('<a name="%s"></a>\n\n' % anchor)
+        mdfile.write("### %s\n\n" % name)

Review Comment:
   The code uses the deprecated HTML4 'name' attribute for anchors instead of 
the HTML5 'id' attribute. While browsers still support this for backward 
compatibility, modern HTML should use '<a id="..."></a>' or add the id directly 
to the header tag like '### {#anchor-id}'. Additionally, since MkDocs generates 
anchors from headers automatically, there may be duplicate anchors created (one 
from the header '### name' and one from the explicit '<a name="..."></a>').
   ```suggestion
           # Use explicit anchor for special characters by attaching an id to 
the header
           mdfile.write("### %s {#%s}\n\n" % (name, anchor))
   ```



##########
sql/gen-sql-api-docs.py:
##########
@@ -132,10 +231,25 @@ def _list_function_infos(jvm):
             examples=jinfo.getExamples().replace("_FUNC_", name),
             note=jinfo.getNote().replace("_FUNC_", name),
             since=jinfo.getSince(),
-            deprecated=jinfo.getDeprecated()))
+            deprecated=jinfo.getDeprecated(),
+            group=group))
     return sorted(infos, key=lambda i: i.name)
 
 
+def _list_grouped_function_infos(jvm):
+    """
+    Returns a list of function information grouped by category.
+    Each item is a tuple of (group_key, list_of_infos).
+    """
+    infos = _list_function_infos(jvm)
+    # Group by category
+    grouped = itertools.groupby(
+        sorted(infos, key=lambda x: (x.group or "", x.name)),
+        key=lambda x: x.group
+    )
+    return [(k, sorted(list(g), key=lambda x: x.name)) for k, g in grouped]

Review Comment:
   The functions are already sorted by (group, name) on line 247, which means 
within each group they're already sorted by name. The additional sort by name 
on line 250 is redundant and adds unnecessary computation. You can remove the 
sort call and just use 'list(g)' instead of 'sorted(list(g), key=lambda x: 
x.name)'.
   ```suggestion
       return [(k, list(g)) for k, g in grouped]
   ```



##########
sql/gen-sql-api-docs.py:
##########
@@ -281,35 +398,149 @@ def generate_sql_api_markdown(jvm, path):
 
     """
 
-    with open(path, 'w') as mdfile:
+    def _write_function_entry(mdfile, info):
+        """Write a single function entry to the markdown file."""
+        name = info.name
+        anchor = _make_anchor(name)
+        usage = _make_pretty_usage(info.usage)
+        arguments = _make_pretty_arguments(info.arguments)
+        examples = _make_pretty_examples(info.examples)
+        note = _make_pretty_note(info.note)
+        since = info.since
+        deprecated = _make_pretty_deprecated(info.deprecated)
+
+        # Use explicit anchor for special characters
+        mdfile.write('<a name="%s"></a>\n\n' % anchor)
+        mdfile.write("### %s\n\n" % name)
+        if usage is not None:
+            mdfile.write("%s\n\n" % usage.strip())
+        if arguments is not None:
+            mdfile.write(arguments)
+        if examples is not None:
+            mdfile.write(examples)
+        if note is not None:
+            mdfile.write(note)
+        if since is not None and since != "":
+            mdfile.write("**Since:** %s\n\n" % since.strip())
+        if deprecated is not None:
+            mdfile.write(deprecated)
+        mdfile.write("<br/>\n\n")
+
+    # Group functions by category
+    grouped_infos = _list_grouped_function_infos(jvm)
+
+    # Track categories that have functions for the index
+    categories_with_functions = []
+
+    # Generate a separate markdown file for each category
+    for group_key, infos in grouped_infos:
+        display_name = _get_display_name(group_key)
+        file_name = _get_file_name(group_key)
+        categories_with_functions.append((group_key, display_name, file_name, 
len(infos)))
+
+        category_path = os.path.join(docs_dir, "%s.md" % file_name)
+        with open(category_path, 'w') as mdfile:
+            mdfile.write("# %s\n\n" % display_name)
+            mdfile.write("This page lists all %s available in Spark SQL.\n\n" 
% display_name.lower())
+            mdfile.write("---\n\n")
+            for info in infos:
+                _write_function_entry(mdfile, info)
+
+    # Generate the index file with links to all categories
+    index_path = os.path.join(docs_dir, "index.md")
+    with open(index_path, 'w') as mdfile:
         mdfile.write("# Built-in Functions\n\n")
-        for info in _list_function_infos(jvm):
-            name = info.name
-            usage = _make_pretty_usage(info.usage)
-            arguments = _make_pretty_arguments(info.arguments)
-            examples = _make_pretty_examples(info.examples)
-            note = _make_pretty_note(info.note)
-            since = info.since
-            deprecated = _make_pretty_deprecated(info.deprecated)
-
-            mdfile.write("### %s\n\n" % name)
-            if usage is not None:
-                mdfile.write("%s\n\n" % usage.strip())
-            if arguments is not None:
-                mdfile.write(arguments)
-            if examples is not None:
-                mdfile.write(examples)
-            if note is not None:
-                mdfile.write(note)
-            if since is not None and since != "":
-                mdfile.write("**Since:** %s\n\n" % since.strip())
-            if deprecated is not None:
-                mdfile.write(deprecated)
-            mdfile.write("<br/>\n\n")
+        # Inline CSS for responsive grid layout
+        mdfile.write("<style>\n")
+        mdfile.write(".func-grid {\n")
+        mdfile.write("  display: grid;\n")
+        mdfile.write("  grid-template-columns: repeat(4, 1fr);\n")
+        mdfile.write("  gap: 0.5rem;\n")
+        mdfile.write("  margin-bottom: 1.5rem;\n")
+        mdfile.write("}\n")
+        mdfile.write(".func-grid a {\n")
+        mdfile.write("  padding: 0.25rem 0;\n")
+        mdfile.write("  white-space: nowrap;\n")
+        mdfile.write("  overflow: hidden;\n")
+        mdfile.write("  text-overflow: ellipsis;\n")
+        mdfile.write("}\n")
+        mdfile.write("@media (max-width: 1200px) {\n")
+        mdfile.write("  .func-grid { grid-template-columns: repeat(3, 1fr); 
}\n")
+        mdfile.write("}\n")
+        mdfile.write("@media (max-width: 768px) {\n")
+        mdfile.write("  .func-grid { grid-template-columns: repeat(2, 1fr); 
}\n")
+        mdfile.write("}\n")
+        mdfile.write("@media (max-width: 480px) {\n")
+        mdfile.write("  .func-grid { grid-template-columns: 1fr; }\n")
+        mdfile.write("}\n")
+        mdfile.write("</style>\n\n")

Review Comment:
   Embedding CSS directly in Markdown files can cause issues with Content 
Security Policy (CSP) in some documentation hosting environments. Consider 
moving the CSS to a separate stylesheet file or using MkDocs' built-in theming 
capabilities to define custom styles, which would be more maintainable and 
follow documentation best practices.



##########
sql/gen-sql-api-docs.py:
##########
@@ -132,10 +231,25 @@ def _list_function_infos(jvm):
             examples=jinfo.getExamples().replace("_FUNC_", name),
             note=jinfo.getNote().replace("_FUNC_", name),
             since=jinfo.getSince(),
-            deprecated=jinfo.getDeprecated()))
+            deprecated=jinfo.getDeprecated(),
+            group=group))
     return sorted(infos, key=lambda i: i.name)
 
 
+def _list_grouped_function_infos(jvm):
+    """
+    Returns a list of function information grouped by category.
+    Each item is a tuple of (group_key, list_of_infos).
+    """
+    infos = _list_function_infos(jvm)
+    # Group by category
+    grouped = itertools.groupby(
+        sorted(infos, key=lambda x: (x.group or "", x.name)),
+        key=lambda x: x.group

Review Comment:
   The groupby operation uses 'x.group' as the key while the sort uses 'x.group 
or ""'. This inconsistency, while not causing a functional bug due to 
itertools.groupby's behavior, makes the code less clear. For better readability 
and maintainability, consider using the same expression for both: 'key=lambda 
x: x.group or ""' in the groupby call.
   ```suggestion
           key=lambda x: x.group or ""
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [SPARK-55063][SQL][DOCS] Reorganize built-in functions documentation by category [spark]

Reply via email to