Hi, On Tue, 30 Sept 2025 at 22:48, Andrew Dunstan <[email protected]> wrote: > > Hi Bilal, > > This got preempted slightly by Tom's commit 170a8a3f460, but I think > it's worth doing. I tried to simplify it some. See attached. There > doesn't seem to me to be any point in using a different set of files for > the tab tests and the NBSP tests. If we use the same set of files we can > improve the efficiency easily by opening them only once. Here we just > look for all the sgml files and all the xsl files and process them all. > > WDYT?
It looks good to me. I made 2 changes to your patch: 1- Declaration of $line_no is lost, I re-added it. 2- s/.cirrus.tasks,yml/.cirrus.tasks.yml/ in the commit message. -- Regards, Nazir Bilal Yavuz Microsoft
From 4b079e4384f0934d9425635168c2acd399eca579 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan <[email protected]> Date: Tue, 30 Sep 2025 15:39:15 -0400 Subject: [PATCH v2] Improve docs syntax checking Move the checks out of the Makefile into a perl script that can be called from both the Makefile and meson.build. The set of files checked is simplified, so it is just all the sgml and xsl files found in docs/src/sgml directory tree. Along the way make some adjustments to .cirrus.tasks.yml to support this better in CI. Author: Nazir Bilal Yavuz <[email protected]> Co-Author: Andrew Dunstan <[email protected]> Discussion: https://postgr.es/m/CAN55FZ3BnM+0twT-ZWL8As9oBEte_b+SBU==cz6hk8jucm_...@mail.gmail.com --- doc/src/sgml/Makefile | 16 +------ doc/src/sgml/meson.build | 23 +++++++++ doc/src/sgml/sgml_syntax_check.pl | 77 +++++++++++++++++++++++++++++++ .cirrus.tasks.yml | 3 ++ 4 files changed, 105 insertions(+), 14 deletions(-) create mode 100755 doc/src/sgml/sgml_syntax_check.pl diff --git a/doc/src/sgml/Makefile b/doc/src/sgml/Makefile index b53b2694a6b..24670204cbc 100644 --- a/doc/src/sgml/Makefile +++ b/doc/src/sgml/Makefile @@ -200,8 +200,8 @@ MAKEINFO = makeinfo ## # Quick syntax check without style processing -check: postgres.sgml $(ALL_SGML) check-tabs check-nbsp - $(XMLLINT) $(XMLINCLUDE) --noout --valid $< +check: postgres.sgml $(ALL_SGML) + $(PERL) $(srcdir)/sgml_syntax_check.pl --xmllint "$(XMLLINT)" --srcdir $(srcdir) ## @@ -261,18 +261,6 @@ clean-man: endif # sqlmansectnum != 7 -# tabs are harmless, but it is best to avoid them in SGML files -check-tabs: - @( ! grep ' ' $(wildcard $(srcdir)/*.sgml $(srcdir)/func/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl) ) || \ - (echo "Tabs appear in SGML/XML files" 1>&2; exit 1) - -# Non-breaking spaces are harmless, but it is best to avoid them in SGML files. -# Use perl command because non-GNU grep or sed could not have hex escape sequence. -check-nbsp: - @ ( $(PERL) -ne '/\xC2\xA0/ and print("$$ARGV:$$_"),$$n++; END {exit($$n>0)}' \ - $(wildcard $(srcdir)/*.sgml $(srcdir)/func/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl $(srcdir)/images/*.xsl) ) || \ - (echo "Non-breaking spaces appear in SGML/XML files" 1>&2; exit 1) - ## ## Clean ## diff --git a/doc/src/sgml/meson.build b/doc/src/sgml/meson.build index 6ae192eac68..ce0dea587cd 100644 --- a/doc/src/sgml/meson.build +++ b/doc/src/sgml/meson.build @@ -306,3 +306,26 @@ endif if alldocs.length() != 0 alias_target('alldocs', alldocs) endif + +sgml_syntax_check = files( + 'sgml_syntax_check.pl' +) + +test( + 'sgml_syntax_check', + perl, + protocol: 'exitcode', + suite: 'doc', + args: [ + sgml_syntax_check, + '--xmllint', + '@0@ --nonet'.format(xmllint_bin.full_path()), + '--srcdir', + meson.current_source_dir(), + '--builddir', + meson.current_build_dir(), + ], + depends: doc_generated +) + +testprep_targets += doc_generated diff --git a/doc/src/sgml/sgml_syntax_check.pl b/doc/src/sgml/sgml_syntax_check.pl new file mode 100755 index 00000000000..1e1fa5d8245 --- /dev/null +++ b/doc/src/sgml/sgml_syntax_check.pl @@ -0,0 +1,77 @@ +# /usr/bin/perl + +# Copyright (c) 2025, PostgreSQL Global Development Group + +# doc/src/sgml/sgml_syntax_check.pl + +use strict; +use warnings FATAL => 'all'; +use Getopt::Long; + +use File::Find; + +my $xmllint; +my $srcdir; +my $builddir; + +GetOptions( + 'xmllint:s' => \$xmllint, + 'srcdir:s' => \$srcdir, + 'builddir:s' => \$builddir) or die "$0: wrong arguments"; + +die "$0: --srcdir must be specified\n" unless defined $srcdir; + +my $xmlinclude = "--path . --path $srcdir"; +$xmlinclude .= " --path $builddir" if defined $builddir; + +# find files to process - all the sgml and xsl files (including in subdirectories) +my @files_to_process; +my @dirs_to_search = ($srcdir); +push @dirs_to_search, $builddir if defined $builddir; +find( + sub { + return unless -f $_; + return if $_ !~ /\.(sgml|xsl)$/; + push @files_to_process, $File::Find::name; + }, + @dirs_to_search,); + +# tabs and non-breaking spaces are harmless, but it is best to avoid them in SGML files +sub check_tabs_and_nbsp +{ + my $errors = 0; + for my $f (@files_to_process) + { + open my $fh, "<:encoding(UTF-8)", $f or die "Can't open $f: $!"; + my $line_no = 0; + while (<$fh>) + { + $line_no++; + if (/\t/) + { + print STDERR "Tab found in $f:$_"; + $errors++; + } + if (/\xC2\xA0/) + { + print STDERR "$f:$line_no: contains non-breaking space\n"; + $errors++; + } + } + close($fh); + } + + if ($errors) + { + die "Tabsand/or non-breaking spaces appear in SGML/XML files\n"; + } +} + +sub run_xmllint +{ + my $cmd = "$xmllint $xmlinclude --noout --valid postgres.sgml"; + system($cmd) == 0 or die "xmllint validation failed\n"; +} + +run_xmllint(); +check_tabs_and_nbsp(); diff --git a/.cirrus.tasks.yml b/.cirrus.tasks.yml index eca9d62fc22..1c937247a9a 100644 --- a/.cirrus.tasks.yml +++ b/.cirrus.tasks.yml @@ -627,6 +627,8 @@ task: TEST_JOBS: 8 IMAGE: ghcr.io/cirruslabs/macos-runner:sonoma + XML_CATALOG_FILES: /opt/local/share/xml/docbook/4.5/catalog.xml + CIRRUS_WORKING_DIR: ${HOME}/pgsql/ CCACHE_DIR: ${HOME}/ccache MACPORTS_CACHE: ${HOME}/macports-cache @@ -641,6 +643,7 @@ task: MACOS_PACKAGE_LIST: >- ccache + docbook-xml-4.5 icu kerberos5 lz4 -- 2.51.0
