;;; ox-xml.el --- XML Backend for Org Export Engine -*- lexical-binding: t; -*-

;; Copyright (C) 2026 Free Software Foundation, Inc.
;; Author: Amy Pillow <amypillow@lavache.com>
;; Keywords: outlines, hypermedia, calendar, text
;; URL: https://codeberg.org/strawburster/grobl

;; This file is NOT part of GNU Emacs.

;; This program is free software: you can redistribute it and/or
;; modify it under the terms of the GNU General Public License as
;; published by the Free Software Foundation, either version 3 of the
;; License, or (at your option) any later version.

;; This program is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program.  If not, see
;; <https://www.gnu.org/licenses/>.

;;; Commentary:
;;
;; This library implements an XML backend for Org sitemap generation.
;;
;; The sitemap follows version 0.9 of the sitemap xml schema:
;;
;;    http://www.sitemaps.org/schemas/sitemap/0.9
;;
;; An atom feed xml document can also be created as a sitemap.
;;
;; For either sitemaps or atom feeds, the `:sitemap-style' must be set
;; to `list', `:auto-sitemap' must be set to `t', and the
;; `:publishing-function' must be set to
;; `org-xml-publish-only-sitemap', or a list including that function.
;;
;; In addition, the `:sitemap-function' and `:sitemap-format-entry'
;; options must be set to either `org-xml-publish-sitemap' and
;; `org-xml-publish-sitemap-entry' or `org-xml-publish-sitemap-atom'
;; and `org-xml-publish-sitemap-atom-entry' respectively, whether you
;; want a sitemap or an atom feed.
;;
;; All of these options are expressed in the constants
;; `org-xml-sitemap' and `org-xml-sitemap-atom', which can be
;; concatenated or spliced into a list of options for a project in
;; `org-publish-project-alist'.
;;
;; This backend uses additional org file keywords to modify the
;; corresponding sitemap or atom entries:
;;
;;   `PRIORITY'
;;
;;      For sitemaps, the priority value to use for this entry,
;;      between 0 and 1.  Typically, indices will have a lower
;;      priority than articles.
;;
;;   `CHANGEFREQ'
;;
;;      For sitemaps, the `changefreq' to use for a entry, any of
;;      `org-xml-sitemap-change-frequencies'.  This tells search
;;      engines how often to review a particular page.
;;
;;   `SITEMAP_IGNORE' / `ATOM_IGNORE'
;;
;;      Do not create an entry for this file in any sitemap or atom
;;      feed, respectively.
;;
;;   `ATOM_NODES'
;;
;;      Will be read as a Lisp expression and should be a list of dom
;;      nodes that can be interpreted by `dom-print' and will be
;;      inserted into the corresponding atom entry.
;;
;;   `ID'
;;
;;      The unique id of an Org file.  For atom feeds, this is
;;      required for every file, and can be generated using
;;      `org-id-get-create'.
;;
;;   `ATOM_INCLUDE_CONTENT'
;;
;;     Whether to include the content of this file in the
;;     corresponding atom entry.  Can be either `html' or `ascii', or
;;     `nil' to not include anything.
;;
;;; Code:

;;; Dependencies

(require 'org-macs)
(require 'ox)
(require 'ox-publish)
(require 'vc-git)
(require 'dom)


;;; Custom types

(defconst org-xml-sitemap-change-frequencies
  '("always" "hourly" "daily" "weekly" "monthly" "yearly" "never")
  "The available frequencies to choose from for sitemap pages.")


;;; User Configuration Variables

(defgroup org-export-xml nil
  "Options for publishing an Org mode project sitemap as XML."
  :tag "Org Export XML"
  :group 'org-export)

(defcustom org-xml-base-url ""
  "The base url to use for the sitemap or atom feed generation.

Required for sitemap or atom feed generation.

This should start with `http://' or `https://' and end with a slash,
such as `http://my-blog.org/'.

This can also be specified on a `org-publish-project-alist' project
using the `:base-url' property."
  :type 'string)

(defcustom org-xml-last-modified-strategy 'mtime
  "The strategy to use for determining the last modification time.

If it is `mtime', the file modification time is used for a file, and for
folders the latest modification time for any file in the folder.

For `git', the latest commit date for each file or folder will be used."
  :type '(radio (const :tag "File modification time" mtime)
                (const :tag "Git commit time" git)))

(defcustom org-xml-sitemap-index-priority 0.2
  "The sitemap priority to assign index.org files.

This should be a number between 0 and 1 inclusive to tell search engines
the relative importance of index files.

This can be overriden by specifying `:sitemap-index-priority' in
`org-publish-project-alist' or in any `index.org' file by specifying a
`PRIORITY' buffer setting keyword."
  :type 'float)

(defcustom org-xml-sitemap-article-priority 0.8
  "The sitemap priority to assign article files.

This should be a number between 0 and 1 inclusive to tell search engines
the relative importance of article files.

This can be overriden by specifying `:sitemap-article-priority' in
`org-publish-project-alist' or in any Org file by specifying a
`PRIORITY' buffer setting keyword."
  :type 'float)

(defcustom org-xml-sitemap-index-freq "daily"
  "The sitemap change frequency to assign index.org files.

This can be overriden by specifying `:sitemap-index-freq' in
`org-publish-project-alist' or in any `index.org' file by specifying a
`CHANGE_FREQ' buffer setting keyword."
  :type `(radio ,org-xml-sitemap-change-frequencies))

(defcustom org-xml-sitemap-article-freq "monthly"
  "The sitemap change frequency to assign article files.

This can be overriden by specifying `:sitemap-article-freq' in
`org-publish-project-alist' or in any Org file by specifying a
`CHANGE_FREQ' buffer setting keyword."
  :type `(radio ,org-xml-sitemap-change-frequencies))

(defcustom org-xml-atom-max-entries 32
  "Maximum number of entries in an individual atom feed."
  :type 'natnum)

(defcustom org-xml-atom-authority ""
  "The authority from which to mint unique identifiers for each article.

Can be the domain name of the website, or another stable unique
identifier.  Can also be specified as `:atom-authority' in
`org-publish-project-alist'."
  :type 'string)

(defcustom org-xml-atom-authority-date ""
  "The date of validity of atom feed idenitifiers.

This can have any form, but must remain absolutely consistent for urls
to remain valid.  The year the authority is valid from is a typical
option.  This can also be specified as `:atom-authority-date' in
`org-publish-project-alist'."
  :type 'string)

(defcustom org-xml-atom-subdir ""
  "Only include articles from a specific subdirectory.

This will filter articles from an atom feed unless they are in the
specified subdirectory, relative to the project's base-dir.  This can
also be specified as `:atom-subidr' in `org-publish-project-alist'."
  :type 'string)

(defcustom org-xml-atom-index "index.org"
  "Use this file to set properties of an atom feed.

The TITLE, DESCRIPTION, and ID buffer keywords of the file, if any are
provided, will override title, subtitle, and id of the output feed.

If an index file is not specified or does not exist, then the title of
the feed will be the name of the project, and the subtitle and id will
be determined by `:atom-subtitle' and `:atom-id' in
`org-publish-project-alist', respectively.

The index file can also be specified as `:atom-index' in
`org-publish-project-alist' as a relative path from the `:base-dir'."
  :type 'string)

(defcustom org-xml-atom-include-content 'html
  "Whether to include the content of every article in the atom feed.

The content can be included as plain text `ascii' or `html'.  If the
output type exists in the publishing directory, reuse it, otherwise run
the necessary exporter.

This can be overriden by specifying `:atom-include-content' in
`org-publish-project-alist' or in any Org file by specifying a
`ATOM_INCLUDE_CONTENT' buffer setting keyword."
  :type '(radio (const :tag "HTML" html)
                (const :tag "ASCII" ascii)
                (const :tag "none" nil)))

(defcustom org-xml-atom-feed-nodes '()
  "A list of extra DOM nodes for every atom feed.

Must be a list of dom nodes recognizeable by `dom-print' to add to every
atom feed.

This can be overriden by specifying `:atom-feed-nodes' in
`org-publish-project-alist' or in any Org file by specifying the
`ATOM_NODES' buffer setting keyword, which will be read as a Lisp
expression."
  :type '(repeat sexp))

(defcustom org-xml-atom-article-nodes '()
  "A list of extra DOM nodes for every atom feed.

Must be a list of dom nodes recognizeable by `dom-print' to add to every
atom entry.

This can be overriden by specifying `:atom-article-nodes' in
`org-publish-project-alist' or in any Org file by specifying the
`ATOM_NODES' buffer setting keyword, which will be read as a Lisp
expression."
  :type '(repeat sexp))


;;; Org export backend registration

(org-export-define-backend 'xml-sitemap
  nil
  :options-alist
  '((:sitemap-index-freq "CHANGE_FREQ" org-xml-sitemap-index-freq)
    (:sitemap-article-freq "CHANGE_FREQ" org-xml-sitemap-article-freq)
    (:sitemap-index-priority "PRIORITY" org-xml-sitemap-index-priority)
    (:sitemap-index-freq "PRIORITY" org-xml-sitemap-article-freq)
    (nil "SITEMAP_IGNORE")
    (:atom-authority nil org-xml-atom-authority)
    (:atom-authority-date nil org-xml-atom-authority-date)
    (:atom-subdir nil org-xml-atom-subdir)
    (:atom-index nil org-xml-atom-index)
    (:atom-subtitle "DESCRIPTION")
    (:atom-feed-id "ID")
    (:atom-include-content "ATOM_INCLUDE_CONTENT" org-xml-atom-include-content)
    (:atom-feed-nodes "ATOM_NODES" org-xml-atom-feed-nodes)
    (:atom-article-nodes "ATOM_NODES" org-xml-atom-article-nodes)
    (nil "ATOM_IGNORE")))


;;; Internal variables

(defvar vc-log-view-type nil)


;;; Internal functions

(defun org-xml--last-modified (file)
  "Get the last update time for FILE as an ISO-8601 string.

The last modification time will be gathered according to
`org-xml-last-modified-strategy'."
  (pcase org-xml-last-modified-strategy
    ('git
     (with-temp-buffer
       (let ((vc-git-log-switches '("--pretty=%cI")))
         (accept-process-output
          (vc-git-print-log (and file (list file))
                            (current-buffer) nil nil 1))
         (string-trim (buffer-string)))))
    (_ ;; mtime
     (if (file-directory-p file)
         (car
          (seq-sort
           'string>
           (mapcar
            (lambda (f)
              (format-time-string
               "%FT%T%z"
               (file-attribute-modification-time
                (file-attributes f))))
            (directory-files-recursively file ".+\\..+"))))
       (format-time-string
        "%FT%T%z"
        (file-attribute-modification-time
         (file-attributes file)))))))

(defun org-xml--atom-id (project id)
  "Generate a `tag' id from ID for an atom feed or entry.

PROJECT is a Org publish property plist.  If the atom authority or
authority date are empty, simply return ID."
  (let ((auth (org-publish-property :atom-authority project
                                    org-xml-atom-authority))
        (auth-date (org-publish-property :atom-authority-date project
                                         org-xml-atom-authority-date)))
    (if (or (string-empty-p auth) (string-empty-p auth-date))
        (progn
          (when (string-empty-p auth)
            (warn "`org-xml-atom-authority' or `:atom-authority' not set!"))
          (when (string-empty-p auth-date)
            (warn "`org-xml-atom-authority-date' or `:atom-authority-date' not set!"))
          id)
      (concat
       "tag:" auth
       "," auth-date
       ":" id))))

(defun org-xml--atom-get-content (backend entry _style project)
  "Get the content of an org file ENTRY exported to BACKEND.

BACKEND can be either `html' or `ascii'.

STYLE must by `list', and PROJECT refers to an alist of org-publish
project settings."
  (let* ((orig-buf (current-buffer))
         (pub-file (expand-file-name
                    (file-name-with-extension
                     entry
                     (or (org-publish-property :html-extension project)
                         org-html-extension))
                    (org-publish-property :publishing-directory project)))
         (content (if (and (eq backend 'html) (file-exists-p pub-file))
                      (org-with-file-buffer pub-file
                        (dom-by-id (libxml-parse-html-region) "content"))
                    (with-temp-buffer
                      (let ((tmp-buf (current-buffer))
                            (org-inhibit-startup t))
                        (with-current-buffer orig-buf
                          (org-export-to-buffer
                              backend tmp-buf nil nil nil
                              (org-publish-property :body-only project))))
                      (pcase backend
                        ('html (dom-by-id (libxml-parse-html-region) "content"))
                        ('ascii (buffer-string)))))))
    (pcase backend
      ('html
       (dom-set-attribute
        content 'xmlns "http://www.w3.org/1999/xhtml")
       `(content ((type . "xhtml")
                  ("xml:lang"
                   . ,(org-publish-property
                       :language project
                       org-export-default-language)))
                 ,content))
      ('ascii
       `(content ((type . "text")) ,content)))))


;;; End-user functions

(defun org-xml-publish-only-sitemap (plist filename pub-dir)
  "Publish an sitemap file as is, or ignore the file.

FILENAME is the filename of the file to be published.  PLIST is the
property list for the given project.  PUB-DIR is the publishing
directory.

Return output file name if published."
  (when (string= (file-relative-name
                  filename (plist-get plist :base-directory))
                 (plist-get plist :sitemap-filename))
    (funcall 'org-publish-attachment plist filename pub-dir)))

(defun org-xml-publish-sitemap (_title list)
  "Generate xml following the sitemap protocol.

TITLE is unused.  LIST is an internal representation of the files to
include, as returned by `org-list-to-list'."
  (with-temp-buffer
    (insert "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
    (dom-print
     `(urlset
       ((xmlns . "http://www.sitemaps.org/schemas/sitemap/0.9"))
       ,@(seq-filter 'identity (mapcar 'car (cdr list))))
     t t)
    (buffer-string)))

(defun org-xml-publish-sitemap-entry (entry _style project)
  "Format site map ENTRY, as a dom node.

ENTRY is a file name.  STYLE must be `list'.  PROJECT refers to the
current Org publish project plist."
  (let ((file (expand-file-name
               entry
               (org-publish-property :base-directory project))))
    (when (and (file-readable-p file) (not (directory-name-p file)))
      (let ((org-inhibit-startup t))
        (org-with-file-buffer file
          (let* ((keywords (org-collect-keywords
                            '("PRIORITY" "CHANGE_FREQ"
                              "SITEMAP_IGNORE")))
                 (last-modified (org-xml--last-modified file))
                 (ignore (cadr (assoc "SITEMAP_IGNORE" keywords)))
                 (priority (cadr (assoc "PRIORITY" keywords)))
                 (change-freq (cadr (assoc "CHANGE_FREQ" keywords)))
                 (index (string= (file-name-base file) "index")))
            (unless (or ignore (string-empty-p last-modified))
              `(url
                ()
                (loc
                 ()
                 ,(file-name-concat
                   (org-publish-property :base-url project
                                         org-xml-base-url)
                   (file-name-with-extension
                    entry
                    (org-publish-property :html-extension project
                                          org-html-extension))))
                (lastmod () ,last-modified)
                (changefreq
                 ()
                 ,(or change-freq
                      (format
                       "%s"
                       (if index
                           (org-publish-property
                            :sitemap-index-freq project
                            org-xml-sitemap-index-freq)
                         (org-publish-property
                          :sitemap-article-freq project
                          org-xml-sitemap-article-freq)))))
                (priority
                 ()
                 ,(or priority
                      (format
                       "%.1f"
                       (if index
                           (org-publish-property
                            :sitemap-index-priority project
                            org-xml-sitemap-index-priority)
                         (org-publish-property
                          :sitemap-article-priority project
                          org-xml-sitemap-article-priority)))))))))))))

(defun org-xml-publish-sitemap-atom (title list)
  "Generate atom feed xml document from the DOM nodes in LIST.

TITLE is used to get the publish project the atom feed is being
generated for."
  (let* ((project-name (string-remove-prefix "Sitemap for project " title))
         (project (seq-find
                   (lambda (p) (string= project-name (car p)))
                   org-publish-project-alist)))
    (unless project
      (error "Project '%s' not found in `org-publish-project-alist'"
             project-name))
    (let* ((subdir (org-publish-property :atom-subdir project ""))
           (srcdir (expand-file-name
                    subdir
                    (org-publish-property :base-directory project)))
           (index (file-name-concat srcdir "index.org"))
           (base-url (org-publish-property :base-url project
                                           org-xml-base-url))
           (pub-index (file-name-concat
                       subdir
                       (concat
                        "index."
                        (org-publish-property :html-extension project
                                              org-html-extension))))
           (lang (org-publish-property :language project
                                       org-export-default-language))
           (feed-name (org-publish-property :sitemap-filename project
                                            "feed.atom"))
           (max-entries (org-publish-property :atom-max-entries project
                                              org-xml-atom-max-entries))
           (extra-nodes (org-publish-property :atom-feed-nodes project
                                              org-xml-atom-feed-nodes))
           (title (org-publish-property :sitemap-title project
                                        project-name))
           (subtitle (org-publish-property :atom-subtitle project))
           (id (org-publish-property :atom-id project))
           (last-updated (org-xml--last-modified srcdir)))
      (unless (string-empty-p last-updated)
        (unless base-url
          (error "`org-xml-base-url' or `:base-url' not supplied!"))
        (when (file-exists-p index)
          (let ((org-inhibit-startup t))
            (org-with-file-buffer index
              (let ((keywords (org-collect-keywords
                               '("TITLE" "DESCRIPTION" "ID"
                                 "ATOM_NODES" "ATOM_INCLUDE_CONTENT"))))
                (when-let ((k (assoc "TITLE" keywords)))
                  (setq title (cadr k)))
                (when-let ((k (assoc "DESCRIPTION" keywords)))
                  (setq subtitle (cadr k)))
                (when-let ((k (assoc "ID" keywords)))
                  (setq id (cadr k)))
                (when-let ((k (assoc "ATOM_NODES" keywords)))
                  (setq extra-nodes (read (cadr k))))
                (when-let ((k (assoc "ATOM_INCLUDE_CONTENT" keywords)))
                  (setq extra-nodes (read (cadr k))))))))
        (unless id
          (error "'ID' file-level property not found in index file %s"
                 index))
        (with-temp-buffer
          (insert "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n")
          (dom-print
           `(feed
             ((xmlns . "http://www.w3.org/2005/Atom"))
             (title ((type . "text")) ,title)
             ,@(when subtitle
                 `((subtitle ((type . "text")) ,subtitle)))
             (updated () ,last-updated)
             (id () ,(org-xml--atom-id project id))
             (link ((rel . "alternate") (type . "text/html")
                    (hreflang . ,lang)
                    (href . ,(file-name-concat base-url pub-index))))
             (link ((rel . "self") (type . "application/atom+xml")
                    (href . ,(file-name-concat base-url feed-name))))
             ,@extra-nodes
             ,@(take
                max-entries
                (seq-sort
                 (lambda (a b)
                   (let ((a-updated (cadr (alist-get 'updated (cddr a))))
                         (b-updated (cadr (alist-get 'updated (cddr b)))))
                     (if (string= a-updated b-updated)
                         (let ((a-published (cadr (alist-get 'published (cddr a))))
                               (b-published (cadr (alist-get 'published (cddr b)))))
                           (string> a-published b-published))
                       (string> a-updated b-updated))))
                 (seq-filter
                  'identity (mapcar 'car (cdr list))))))
           t t)
          (buffer-string))))))

(defun org-xml-publish-sitemap-atom-entry (entry style project)
  "Format atom feed ENTRY.

ENTRY is a file name.  STYLE must be `list'.  PROJECT refers to the
current Org publish project plist."
  (let ((subdir (org-publish-property :atom-subdir project)))
    (when (and (or (not subdir) (string-prefix-p subdir entry))
               (not (string= "index.org" (file-name-nondirectory entry))))
      (let* ((base-url (org-publish-property :base-url project
                                             org-xml-base-url))
             (base-dir (org-publish-property :base-directory project))
             (src-file (expand-file-name entry base-dir))
             (pub-file (file-name-with-extension
                        entry
                        (org-publish-property :html-extension project
                                              org-html-extension)))
             (org-inhibit-startup t))
        (when (and (file-readable-p src-file) (not (directory-name-p src-file)))
          (org-with-file-buffer src-file
            (let* ((keywords (org-collect-keywords
                              '("ATOM_IGNORE" "TITLE" "DATE" "ID"
                                "DESCRIPTION" "AUTHOR" "EMAIL"
                                "ATOM_NODES" "ATOM_INCLUDE_CONTENT")))
                   (last-updated (org-xml--last-modified src-file))
                   (title (cadr (assoc "TITLE" keywords)))
                   (desc (cadr (assoc "DESCRIPTION" keywords)))
                   (id (cadr (assoc "ID" keywords)))
                   (date (cadr (assoc "DATE" keywords)))
                   (ignore (cadr (assoc "ATOM_IGNORE" keywords)))
                   (author (cadr (assoc "AUTHOR" keywords)))
                   (email (cadr (assoc "EMAIL" keywords)))
                   (extra-nodes (if-let ((k (assoc "ATOM_NODES" keywords)))
                                    (read (cadr k))
                                  (org-publish-property
                                   :atom-article-nodes project
                                   org-xml-atom-article-nodes)))
                   (content (if-let ((k (assoc "ATOM_INCLUDE_CONTENT" keywords)))
                                (read (cadr k))
                              (org-publish-property
                               :atom-include-content project
                               org-xml-atom-include-content))))
              (unless (or ignore (string-empty-p last-updated))
                (unless date
                  (error "'date' file-level property not found in %s"
                         src-file))
                (unless id
                  (error "'id' file-level property not found in %s"
                         src-file))
                `(entry
                  ()
                  (title () ,title)
                  ,@(when desc
                      `((summary () ,desc)))
                  (link ((rel . "alternate") (type . "text/html")
                         (href . ,(file-name-concat base-url pub-file))))
                  (updated () ,last-updated)
                  (published () ,(org-format-timestamp
                                  (org-timestamp-from-string date)
                                  "%FT%TZ"))
                  (id () ,(org-xml--atom-id project id))
                  (author
                   ()
                   (name () ,(or author user-full-name))
                   (email () ,(or email user-mail-address)))
                  ,@extra-nodes
                  ,@(when content
                      (list (org-xml--atom-get-content
                             content entry style project))))))))))))

(defconst org-xml-sitemap
  '(:auto-sitemap t
    :sitemap-filename "sitemap.xml"
    :sitemap-style list
    :sitemap-function org-xml-publish-sitemap
    :sitemap-format-entry org-xml-publish-sitemap-entry
    :publishing-function org-xml-publish-only-sitemap))

(defconst org-xml-sitemap-atom
  '(:auto-sitemap t
    :sitemap-filename "feed.atom"
    :sitemap-style list
    :sitemap-function org-xml-publish-sitemap-atom
    :sitemap-format-entry org-xml-publish-sitemap-atom-entry
    :publishing-function org-xml-publish-only-sitemap))

(provide 'ox-xml)
;;; ox-xml.el ends here
