This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion-site.git
The following commit(s) were added to refs/heads/main by this push: new cbe1336 Add table of contents to blog article (#107) cbe1336 is described below commit cbe1336f2a4a1668a322a895354059252d0d3877 Author: Nuno Faria <nunofpfa...@gmail.com> AuthorDate: Fri Sep 12 11:46:33 2025 +0100 Add table of contents to blog article (#107) Co-authored-by: Andrew Lamb <and...@nerdnetworks.org> --- .../blog/2025-08-15-external-parquet-indexes.md | 2 + content/css/app.css | 21 ++++ content/theme/templates/blogindex.html | 2 +- content/theme/templates/generic.html | 34 +++-- pelicanconf.py | 6 +- plugins/extract_toc/README.md | 137 +++++++++++++++++++++ plugins/extract_toc/__init__.py | 1 + plugins/extract_toc/extract_toc.py | 66 ++++++++++ 8 files changed, 256 insertions(+), 13 deletions(-) diff --git a/content/blog/2025-08-15-external-parquet-indexes.md b/content/blog/2025-08-15-external-parquet-indexes.md index 62d4cfe..53002cc 100644 --- a/content/blog/2025-08-15-external-parquet-indexes.md +++ b/content/blog/2025-08-15-external-parquet-indexes.md @@ -26,6 +26,8 @@ limitations under the License. <!-- diagrams source https://docs.google.com/presentation/d/1e_Z_F8nt2rcvlNvhU11khF5lzJJVqNtqtyJ-G3mp4-Q --> +[TOC] + It is a common misconception that [Apache Parquet] requires (slow) reparsing of metadata and is limited to indexing structures provided by the format. 
In fact, caching parsed metadata and using custom external indexes along with diff --git a/content/css/app.css b/content/css/app.css index 28ea660..2e0951e 100644 --- a/content/css/app.css +++ b/content/css/app.css @@ -1,3 +1,24 @@ .main-content { max-width: 900px; } + +.toc { + position: sticky; + top: 20px; + overflow-y: auto; + padding: 10px; + max-height: calc(100vh - 20px); + border-color: #eee; + border-radius: 10px; + border-width: 1px; + border-style: solid; + max-width: 400px; +} + +.toctitle { + font-weight: bold; +} + +.toc ul li { + margin-bottom: 5px; +} diff --git a/content/theme/templates/blogindex.html b/content/theme/templates/blogindex.html index 26976da..b5f1fc3 100644 --- a/content/theme/templates/blogindex.html +++ b/content/theme/templates/blogindex.html @@ -1,5 +1,5 @@ <div id="contents"> - <div class="bg-white p-5 rounded"> + <div class="bg-white p-4 p-md-5 rounded"> <div class="col-md-10 col-lg-8 mx-auto main-content"> <h3>Welcome to the Apache DataFusion Blog!</h3> diff --git a/content/theme/templates/generic.html b/content/theme/templates/generic.html index f54efdd..e437e4a 100644 --- a/content/theme/templates/generic.html +++ b/content/theme/templates/generic.html @@ -1,16 +1,28 @@ - - <!-- article contents --> <div id="contents"> - <div class="bg-white p-5 rounded"> - <div class="col-md-10 col-lg-8 mx-auto main-content"> - <h1> - {{ article.title }} - </h1> - <p>Posted on: {{ article.locale_date }} by {{ article.author }}</p> - {{ article.content }} + <div class="bg-white p-4 p-md-5 rounded"> + <div class="row justify-content-center"> + <div class="col-12 col-md-8 main-content"> + <h1> + {{ article.title }} + </h1> + <p>Posted on: {{ article.locale_date }} by {{ article.author }}</p> + + {% if article.toc %} + <aside class="d-md-none mb-2"> + {{ article.toc }} + </aside> + {% endif %} + + {{ article.content }} - {% include "comments.html" %} - </div> + {% include "comments.html" %} </div> + {% if article.toc %} + <aside class="d-none 
d-md-block col-md-4 col-xl-3 ms-xl-2"> + {{ article.toc }} + </aside> + {% endif %} </div> + </div> +</div> \ No newline at end of file diff --git a/pelicanconf.py b/pelicanconf.py index a409eee..e5433cb 100644 --- a/pelicanconf.py +++ b/pelicanconf.py @@ -16,7 +16,7 @@ PLUGIN_PATHS = [ 'plugins', ] # If the website uses any *.ezmd files, include the 'asfreader' plugin # PLUGINS = [ 'toc', 'gfm', 'asfgenid', ] # PLUGINS = ['asfgenid', 'asfdata', 'pelican-gfm', 'asfreader', 'sitemap'] -PLUGINS = ['asfgenid', 'extract_date_from_filename'] +PLUGINS = ['asfgenid', 'extract_date_from_filename', 'extract_toc'] # All content is located at '.' (aka content/ ) PAGE_PATHS = [ 'pages' ] STATIC_PATHS = [ '.', ] @@ -65,6 +65,10 @@ FEED_RSS = "feed.xml" MARKDOWN = { 'extension_configs': { 'markdown.extensions.fenced_code': {}, + 'markdown.extensions.toc': { + 'title': 'Contents', + 'permalink': True, + }, 'markdown.extensions.tables': {}, }, 'output_format': 'html5', diff --git a/plugins/extract_toc/README.md b/plugins/extract_toc/README.md new file mode 100644 index 0000000..40d2bee --- /dev/null +++ b/plugins/extract_toc/README.md @@ -0,0 +1,137 @@ +Extract Table of Content +======================== + +A Pelican plugin to extract table of contents (ToC) from `article.content` and +place it in its own `article.toc` variable for use in templates. + +Copyright (c) Talha Mansoor + +Author | Talha Mansoor +----------------|----- +Author Email | talha...@gmail.com +Author Homepage | http://onCrashReboot.com +Github Account | https://github.com/talha131 + + +Acknowledgement +--------------- + +Thanks to [Avaris](https://github.com/avaris) for going out of the way to help +me fix Unicode issues and doing a thorough code review. + +Thanks to [gw0](http://gw.tnode.com/) for adding Pandoc reader support. + + +Why do you need it? +=================== + +Pelican can generate ToC of reST and Markdown files, using markup's respective +directive and extension. 
Such ToC is generated and placed at the beginning of +`article.content` as a string. Consequently it cannot be placed anywhere +else on the page (e.g. `<nav>` HTML5 tag, in header, or at the end of your +article's contents). + +To solve this problem, this plugin extracts ToC from `article.content` and +places it in its own `article.toc` variable for use in templates. + + +Requirements +============ + +`extract_toc` requires BeautifulSoup. + +```bash +pip install beautifulsoup4 +``` + + +How to Use +========== + +This plugin works by extracting the first occurrence of the ToC enclosed in: + +- `<div class="toc">` for the default Markdown reader +- `<div class="contents topic">` for the default reStructuredText reader +- `<nav id="TOC">` for the Pandoc reader + +If the ToC appears in your article in more than one place, `extract_toc` will +remove only the first occurrence. You probably shouldn't need multiple +ToCs in your article. In case you need to display it multiple times, you can +print it via your template. + + +Template example +---------------- + +Add something like this to your Pelican templates if missing: + +```jinja +{% if article.toc %} + <nav class="toc"> + {{ article.toc }} + </nav> +{% endif %} +``` + + +reStructuredText reader +----------------------- + +To add a table of contents to your reStructuredText document (`.rst`) you need to add a `.. contents::` directive at its beginning. See the [docutils documentation](http://docutils.sourceforge.net/docs/ref/rst/directives.html#table-of-contents) for more details. + +```rst +My super title +############## + +:date: 2010-10-03 +:tags: thats, awesome + +.. contents:: +.. + 1 Head 1 + 1.1 Head 2 + 2 Head 3 + 3 head 4 + +Heading 1 +--------- + +Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. 
+``` + + +Markdown reader +--------------- + +To enable table of contents generation for the Markdown reader you need to enable the `markdown.extensions.toc` extension under `extension_configs` in the `MARKDOWN` setting of your Pelican configuration file (older Pelican releases used `MD_EXTENSIONS = (['toc'])` instead). + +To add a table of contents to your Markdown document (`.md`) you need to place the `[TOC]` marker at its beginning. See the [Python Markdown documentation](http://pythonhosted.org/Markdown/extensions/toc.html) for more details. + +```markdown +title: My super title +date: 4-4-2013 +tags: thats, awesome + +[TOC] + +# Heading 1 # + +Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. +``` + + +Pandoc reader +------------- + +To enable table of contents generation for the Pandoc reader you need to set `PANDOC_ARGS = (['--toc', '--template=pandoc-template-toc'])` in your Pelican configuration file. + +Contents of the Pandoc template file `pandoc-template-toc.html5`: + +```html +$if(toc)$ +<nav id="TOC"> +$toc$ +</nav> +$endif$ +$body$ +``` diff --git a/plugins/extract_toc/__init__.py b/plugins/extract_toc/__init__.py new file mode 100644 index 0000000..52c5778 --- /dev/null +++ b/plugins/extract_toc/__init__.py @@ -0,0 +1 @@ +from .extract_toc import * diff --git a/plugins/extract_toc/extract_toc.py b/plugins/extract_toc/extract_toc.py new file mode 100644 index 0000000..c92c4a4 --- /dev/null +++ b/plugins/extract_toc/extract_toc.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +""" +Extract Table of Content +======================== + +A Pelican plugin to extract table of contents (ToC) from `article.content` and +place it in its own `article.toc` variable for use in templates. 
+""" + +from os import path +from bs4 import BeautifulSoup +from pelican import signals, readers, contents +import logging + +logger = logging.getLogger(__name__) + + +def extract_toc(content): + if isinstance(content, contents.Static): + return + + soup = BeautifulSoup(content._content, 'html.parser') + filename = content.source_path + extension = path.splitext(filename)[1][1:] + toc = None + + # default Markdown reader + if not toc and readers.MarkdownReader.enabled and extension in readers.MarkdownReader.file_extensions: + toc = soup.find('div', class_='toc') + if toc: + toc.extract() + if len(toc.find_next('ul').find_all('li')) == 0: + toc = None + + # default reStructuredText reader + if not toc and readers.RstReader.enabled and extension in readers.RstReader.file_extensions: + toc = soup.find('div', class_='contents topic') + if toc: + toc.extract() + tag = BeautifulSoup(str(toc), 'html.parser') + tag.div['class'] = 'toc' + tag.div['id'] = '' + p = tag.find('p', class_='topic-title first') + if p: + p.extract() + toc = tag + + # Pandoc reader (markdown and other formats) + if 'pandoc_reader' in content.settings['PLUGINS']: + try: + from pandoc_reader import PandocReader + except ImportError: + PandocReader = False + if not toc and PandocReader and PandocReader.enabled and extension in PandocReader.file_extensions: + toc = soup.find('nav', id='TOC') + + if toc: + toc.extract() + content._content = soup.decode() + content.toc = toc.decode() + if content.toc.startswith('<html>'): + content.toc = content.toc[12:-14] + + +def register(): + signals.content_object_init.connect(extract_toc) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org