Script 'mail_helper' called by obssrc
Hello community,
here is the log from the commit of package python-html5-parser for
openSUSE:Factory checked in at 2023-06-23 21:52:42
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-html5-parser (Old)
and /work/SRC/openSUSE:Factory/.python-html5-parser.new.15902 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-html5-parser"
Fri Jun 23 21:52:42 2023 rev:13 rq:1094741 version:0.4.11
Changes:
--------
--- /work/SRC/openSUSE:Factory/python-html5-parser/python-html5-parser.changes 2021-12-23 17:54:15.747738148 +0100
+++ /work/SRC/openSUSE:Factory/.python-html5-parser.new.15902/python-html5-parser.changes 2023-06-23 21:52:45.266666455 +0200
@@ -1,0 +2,9 @@
+Fri Jun 23 04:59:34 UTC 2023 - ecsos <[email protected]>
+
+- Update to 0.4.11
+ No changelog from upstream.
+ See instead here:
+ https://github.com/kovidgoyal/html5-parser/compare/v0.4.10...v0.4.11?diff=unified&name=v0.4.11
+- Add %{?sle15_python_module_pythons}
+
+-------------------------------------------------------------------
Old:
----
python-html5-parser-0.4.10.tar.gz
New:
----
python-html5-parser-0.4.11.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-html5-parser.spec ++++++
--- /var/tmp/diff_new_pack.GOyuNz/_old 2023-06-23 21:52:45.834669710 +0200
+++ /var/tmp/diff_new_pack.GOyuNz/_new 2023-06-23 21:52:45.838669732 +0200
@@ -1,7 +1,7 @@
#
# spec file for package python-html5-parser
#
-# Copyright (c) 2021 SUSE LLC
+# Copyright (c) 2023 SUSE LLC
#
# All modifications and additions to the file contributed by third parties
# remain the property of their copyright owners, unless otherwise agreed
@@ -16,10 +16,9 @@
#
-%{?!python_module:%define python_module() python-%{**} python3-%{**}}
-%define skip_python2 1
+%{?sle15_python_module_pythons}
Name: python-html5-parser
-Version: 0.4.10
+Version: 0.4.11
Release: 0
Summary: C based HTML 5 parsing for Python
License: Apache-2.0
++++++ python-html5-parser-0.4.10.tar.gz -> python-html5-parser-0.4.11.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/.github/workflows/ci.py
new/html5-parser-0.4.11/.github/workflows/ci.py
--- old/html5-parser-0.4.10/.github/workflows/ci.py 2021-09-22
09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/.github/workflows/ci.py 2023-04-12
07:07:46.000000000 +0200
@@ -45,7 +45,7 @@
elif which == 'test':
builder = os.environ['BUILDER']
run(sys.executable, builder, 'test')
- if builder == 'build.py':
+ if builder == 'unix_build.py':
run(sys.executable, builder, 'leak')
else:
raise SystemExit('Unknown action:', which)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/.github/workflows/ci.yml
new/html5-parser-0.4.11/.github/workflows/ci.yml
--- old/html5-parser-0.4.10/.github/workflows/ci.yml 2021-09-22
09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/.github/workflows/ci.yml 2023-04-12
07:07:46.000000000 +0200
@@ -15,15 +15,15 @@
strategy:
matrix:
include:
- - { pyver: 2.7, builder: build.py, os: ubuntu-latest, cc:
gcc }
- - { pyver: 2.7, builder: build.py, os: ubuntu-latest, cc:
clang }
- - { pyver: 3.6, builder: build.py, os: ubuntu-latest, cc:
gcc }
- - { pyver: 3.6, builder: build.py, os: ubuntu-latest, cc:
clang }
- - { pyver: 3.8, builder: setup.py, os: ubuntu-latest, cc:
gcc }
+ - { pyver: "2.7", builder: unix_build.py, os:
ubuntu-latest, cc: gcc }
+ - { pyver: "2.7", builder: unix_build.py, os:
ubuntu-latest, cc: clang }
+ - { pyver: "3.8", builder: unix_build.py, os:
ubuntu-latest, cc: gcc }
+ - { pyver: "3.8", builder: unix_build.py, os:
ubuntu-latest, cc: clang }
+ - { pyver: "3.10", builder: setup.py, os: ubuntu-latest,
cc: gcc }
- - { pyver: 3.8, builder: setup.py, os: macos-latest, cc:
clang }
+ - { pyver: "3.10", builder: setup.py, os: macos-latest,
cc: clang }
- - { pyver: 3.8, builder: setup.py, os: windows-latest, cc:
cl }
+ - { pyver: "3.10", builder: setup.py, os: windows-2019,
cc: cl }
steps:
- name: Checkout source code
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/.github/workflows/win-ci.py
new/html5-parser-0.4.11/.github/workflows/win-ci.py
--- old/html5-parser-0.4.10/.github/workflows/win-ci.py 2021-09-22
09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/.github/workflows/win-ci.py 2023-04-12
07:07:46.000000000 +0200
@@ -16,10 +16,10 @@
import tarfile
import time
-ZLIB = "http://zlib.net/zlib-{}.tar.xz".format("1.2.11")
+ZLIB = "http://zlib.net/zlib-{}.tar.xz".format("1.2.13")
LIBXML2 = "ftp://xmlsoft.org/libxml2/libxml2-{}.tar.gz".format('2.9.4')
LIBXSLT = "ftp://xmlsoft.org/libxml2/libxslt-{}.tar.gz".format('1.1.28')
-LXML =
"https://files.pythonhosted.org/packages/c5/2f/a0d8aa3eee6d53d5723d89e1fc32eee11e76801b424e30b55c7aa6302b01/lxml-4.6.1.tar.gz"
# noqa
+LXML =
"https://files.pythonhosted.org/packages/06/5a/e11cad7b79f2cf3dd2ff8f81fa8ca667e7591d3d8451768589996b65dec1/lxml-4.9.2.tar.gz"
# noqa
SW = os.path.abspath('sw')
PYTHON = os.path.abspath(sys.executable)
os.environ['SW'] = SW
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/MANIFEST.in
new/html5-parser-0.4.11/MANIFEST.in
--- old/html5-parser-0.4.10/MANIFEST.in 2021-09-22 09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/MANIFEST.in 2023-04-12 07:07:46.000000000 +0200
@@ -1,5 +1,5 @@
exclude *.py
-include setup.py build.py run_tests.py gen*.py win-ci.py
+include setup.py unix_build.py run_tests.py gen*.py win-ci.py
include LICENSE README.rst
include gumbo/*.c gumbo/*.h gumbo/*.py gumbo/*.rl
include src/*.c src/*.h
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/build.py
new/html5-parser-0.4.11/build.py
--- old/html5-parser-0.4.10/build.py 2021-09-22 09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/build.py 1970-01-01 01:00:00.000000000 +0100
@@ -1,300 +0,0 @@
-#!/usr/bin/env python
-# vim:fileencoding=utf-8
-# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
-
-from __future__ import (absolute_import, division, print_function,
unicode_literals)
-
-import argparse
-import errno
-import glob
-import os
-import re
-import shlex
-import shutil
-import subprocess
-import sys
-import sysconfig
-from collections import namedtuple
-from copy import deepcopy
-from itertools import chain
-
-self_path = os.path.abspath(__file__)
-base = os.path.dirname(self_path)
-build_dir = os.path.join(base, 'build', 'custom')
-freeze_dir = os.path.join(base, 'build', 'html5_parser')
-_plat = sys.platform.lower()
-isosx = 'darwin' in _plat
-iswindows = hasattr(sys, 'getwindowsversion')
-is_ci = os.environ.get('CI') == 'true'
-Env = namedtuple('Env', 'cc cflags ldflags linker debug cc_name cc_ver')
-PKGCONFIG = os.environ.get('PKGCONFIG_EXE', 'pkg-config')
-with open(os.path.join(base, 'src/python-wrapper.c'), 'rb') as f:
- raw = f.read().decode('utf-8')
-version = tuple(
- map(
- int, (
- re.search(r'^#define MAJOR (\d+)', raw,
flags=re.MULTILINE).group(1), re.search(
- r'^#define MINOR (\d+)', raw, flags=re.MULTILINE).group(1),
re.search(
- r'^#define PATCH (\d+)', raw,
flags=re.MULTILINE).group(1), )))
-
-
-def safe_makedirs(path):
- try:
- os.makedirs(path)
- except EnvironmentError as err:
- if err.errno != errno.EEXIST:
- raise
-
-
-def add_python_flags(env, return_libs=False):
- env.cflags.extend('-I' + sysconfig.get_path(x) for x in 'include
platinclude'.split())
- libs = []
- libs += sysconfig.get_config_var('LIBS').split()
- libs += sysconfig.get_config_var('SYSLIBS').split()
- fw = sysconfig.get_config_var('PYTHONFRAMEWORK')
- if fw:
- for var in 'data include stdlib'.split():
- val = sysconfig.get_path(var)
- if val and '/{}.framework'.format(fw) in val:
- fdir = val[:val.index('/{}.framework'.format(fw))]
- if os.path.isdir(os.path.join(fdir,
'{}.framework'.format(fw))):
- framework_dir = fdir
- break
- else:
- raise SystemExit('Failed to find Python framework')
- libs.append(os.path.join(framework_dir,
sysconfig.get_config_var('LDLIBRARY')))
- else:
- libs += ['-L' + sysconfig.get_config_var('LIBDIR')]
- libs += ['-lpython' + sysconfig.get_config_var('VERSION') +
getattr(sys, 'abiflags', '')]
- libs += sysconfig.get_config_var('LINKFORSHARED').split()
- env.ldflags.extend(libs)
- return libs if return_libs else env
-
-
-def pkg_config(pkg, *args):
- try:
- val = subprocess.check_output([PKGCONFIG, pkg] +
list(args)).decode('utf-8')
- except EnvironmentError as err:
- if err.errno == errno.ENOENT:
- raise SystemExit('pkg-config is required to build html5-parser')
- raise
- return list(filter(None, map(str, shlex.split(val))))
-
-
-def env_var(which, default='', split=os.pathsep):
- val = str(os.environ.get(which, default))
- if not split:
- return val
- return list(filter(None, val.split(split)))
-
-
-def include_dirs():
- if 'LIBXML_INCLUDE_DIRS' in os.environ:
- return env_var('LIBXML_INCLUDE_DIRS')
- return [x[2:] for x in pkg_config('libxml-2.0', '--cflags-only-I')]
-
-
-def libraries():
- if iswindows:
- return env_var('LIBXML_LIBS', 'libxml2')
- if 'LIBXML_LIBS' in os.environ:
- return env_var('LIBXML_LIBS')
- return [x[2:] for x in pkg_config('libxml-2.0', '--libs-only-l')]
-
-
-def library_dirs():
- if 'LIBXML_LIB_DIRS' in os.environ:
- return env_var('LIBXML_LIB_DIRS')
- return [x[2:] for x in pkg_config('libxml-2.0', '--libs-only-L')]
-
-
-def cc_version():
- cc = os.environ.get('CC', 'gcc')
- raw = subprocess.check_output([cc, '-dM', '-E', '-'],
stdin=open(os.devnull, 'rb'))
- m = re.search(br'^#define __clang__ 1', raw, flags=re.M)
- cc_name = 'gcc' if m is None else 'clang'
- ver = int(re.search(br'#define __GNUC__ (\d+)', raw,
flags=re.M).group(1)), int(
- re.search(br'#define __GNUC_MINOR__ (\d+)', raw, flags=re.M).group(1))
- return cc, ver, cc_name
-
-
-def get_sanitize_args(cc, ccver):
- sanitize_args = set()
- if cc == 'gcc' and ccver < (4, 8):
- return sanitize_args
- sanitize_args.add('-fno-omit-frame-pointer')
- sanitize_args.add('-fsanitize=address')
- if (cc == 'gcc' and ccver >= (5, 0)) or (cc == 'clang' and not isosx):
- # clang on macOS does not support -fsanitize=undefined
- sanitize_args.add('-fsanitize=undefined')
- # if cc == 'gcc' or (cc == 'clang' and ccver >= (4, 2)):
- # sanitize_args.add('-fno-sanitize-recover=all')
- return sanitize_args
-
-
-def init_env(debug=False, sanitize=False, native_optimizations=False,
add_python=True):
- native_optimizations = (native_optimizations and not sanitize and not
debug)
- cc, ccver, cc_name = cc_version()
- stack_protector = '-fstack-protector'
- if ccver >= (4, 9) and cc_name == 'gcc':
- stack_protector += '-strong'
- missing_braces = ''
- if ccver < (5, 2) and cc_name == 'gcc':
- missing_braces = '-Wno-missing-braces'
- optimize = '-ggdb' if debug or sanitize else '-O3'
- sanitize_args = get_sanitize_args(cc_name, ccver) if sanitize else set()
- cflags = os.environ.get(
- 'OVERRIDE_CFLAGS', (
- '-Wextra -Wno-missing-field-initializers -Wall -std=c99
-fvisibility=hidden'
- ' -pedantic-errors -Werror {} {} -D{}DEBUG -fwrapv {} {} -pipe
{}').format(
- optimize, ' '.join(sanitize_args), ('' if debug else 'N'),
stack_protector,
- missing_braces, '-march=native' if native_optimizations else
''))
- libxml_cflags = pkg_config('libxml-2.0', '--cflags')
- cflags = shlex.split(cflags) + libxml_cflags +
shlex.split(sysconfig.get_config_var('CCSHARED'))
- ldflags = os.environ.get(
- 'OVERRIDE_LDFLAGS', '-Wall -shared ' + ' '.join(sanitize_args) + (''
if debug else ' -O3'))
- libxml_ldflags = pkg_config('libxml-2.0', '--libs')
- ldflags = shlex.split(ldflags) + libxml_ldflags
- cflags += shlex.split(os.environ.get('CFLAGS', ''))
- ldflags += shlex.split(os.environ.get('LDFLAGS', ''))
- cflags.append('-pthread')
- ans = Env(cc, cflags, ldflags, cc, debug, cc_name, ccver)
- return add_python_flags(ans) if add_python else ans
-
-
-def run_tool(cmd):
- if hasattr(cmd, 'lower'):
- cmd = shlex.split(cmd)
- print(' '.join(cmd))
- p = subprocess.Popen(cmd)
- ret = p.wait()
- if ret != 0:
- raise SystemExit(ret)
-
-
-def newer(dest, *sources):
- try:
- dtime = os.path.getmtime(dest)
- except EnvironmentError:
- return True
- for s in chain(sources, (self_path, )):
- if os.path.getmtime(s) >= dtime:
- return True
- return False
-
-
-def find_c_files(src_dir):
- ans, headers = [], []
- for x in sorted(os.listdir(src_dir)):
- ext = os.path.splitext(x)[1]
- if ext == '.c' and not x.endswith('-check.c'):
- ans.append(os.path.join(src_dir, x))
- elif ext == '.h':
- headers.append(os.path.join(src_dir, x))
- ans.sort(key=os.path.getmtime, reverse=True)
- return tuple(ans), tuple(headers)
-
-
-def build_obj(src, env, headers):
- suffix = '-debug' if env.debug else ''
- obj = os.path.join(build_dir, os.path.basename(src).rpartition('.')[0] +
suffix + '.o')
- if newer(obj, src, *headers):
- cflags = list(env.cflags)
- if src.endswith('char_ref.c'):
- cflags.append('-Wno-unused-const-variable')
- cmd = [env.cc] + cflags + ['-c', src] + ['-o', obj]
- run_tool(cmd)
- return obj
-
-
-TEST_EXE = os.path.join(build_dir, 'test')
-MEMLEAK_EXE = os.path.join(build_dir, 'mem-leak-check')
-if is_ci:
- TEST_EXE = os.path.join(os.path.dirname(os.path.abspath(sys.executable)),
'test-html5-parser')
-SRC_DIRS = 'src gumbo'.split()
-MOD_EXT = '.so'
-
-
-def link(objects, env):
- dest = os.path.join(build_dir, 'html_parser' + MOD_EXT)
- o = ['-o', dest]
- cmd = [env.linker] + objects + o + env.ldflags
- if newer(dest, *objects):
- run_tool(cmd)
- return dest
-
-
-def build(args, build_leak_check=False):
- debug_objects = []
- debug_env = init_env(debug=True, sanitize=True)
- for sdir in SRC_DIRS:
- sources, headers = find_c_files(sdir)
- if sdir == 'src':
- headers += ('gumbo/gumbo.h', )
- debug_objects.extend(build_obj(c, debug_env, headers) for c in sources)
- link(debug_objects, debug_env)
- ldflags = add_python_flags(deepcopy(debug_env), return_libs=True)
- if newer(TEST_EXE, *debug_objects):
- cmd = ([debug_env.cc] + debug_env.cflags + ['test.c'] + ['-o',
TEST_EXE] + ldflags)
- run_tool(cmd)
- if build_leak_check and newer(MEMLEAK_EXE, 'mem-leak-check.c',
*debug_objects):
- cmd = ([debug_env.cc] + debug_env.cflags + ['mem-leak-check.c'] + [
- '-o', MEMLEAK_EXE] + debug_objects + debug_env.ldflags)
- cmd = [x for x in cmd if x not in {'-fPIC', '-pthread', '-shared'}]
- run_tool(cmd)
- for mod in glob.glob(os.path.join(build_dir, '*' + MOD_EXT)):
- shutil.copy2(mod, freeze_dir)
- for mod in glob.glob(os.path.join('src', 'html5_parser', '*.py')):
- shutil.copy2(mod, freeze_dir)
-
-
-TEST_COMMAND = ['run_tests.py']
-
-
-def add_python_path(env, path):
- pp = env.get('PYTHONPATH', '')
- to_join = filter(None, [os.path.abspath(path), pp])
- env['PYTHONPATH'] = os.pathsep.join(to_join)
- return env
-
-
-def option_parser():
- p = argparse.ArgumentParser()
- p.add_argument(
- 'action',
- nargs='?',
- default='test',
- choices='build test try leak'.split(),
- help='Action to perform (default is build)')
- p.add_argument('rest', nargs='*')
- return p
-
-
-def main():
- args = option_parser().parse_args()
- os.chdir(base)
- safe_makedirs(build_dir), safe_makedirs(freeze_dir)
- if args.action == 'build':
- build(args)
- elif args.action == 'test':
- build(args)
- os.environ['ASAN_OPTIONS'] = 'leak_check_at_exit=0'
- add_python_path(os.environ, os.path.dirname(freeze_dir))
- print('\nrunning tests...')
- os.execlp(TEST_EXE, TEST_EXE, 'run_tests.py', *args.rest)
- elif args.action == 'try':
- build(args)
- os.environ['ASAN_OPTIONS'] = 'leak_check_at_exit=0'
- add_python_path(os.environ, os.path.dirname(freeze_dir))
- os.execlp(
- TEST_EXE, TEST_EXE, '-c', 'from html5_parser import *; ' +
args.rest[0], *args.rest[1:])
- elif args.action == 'leak':
- build(args, build_leak_check=True)
- os.environ['MEMLEAK_EXE'] = os.path.abspath(MEMLEAK_EXE)
- os.environ['ASAN_OPTIONS'] = 'leak_check_at_exit=0'
- add_python_path(os.environ, os.path.dirname(freeze_dir))
- os.execlp(TEST_EXE, TEST_EXE, 'run_tests.py')
-
-
-if __name__ == '__main__':
- main()
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/docs/index.rst
new/html5-parser-0.4.11/docs/index.rst
--- old/html5-parser-0.4.10/docs/index.rst 2021-09-22 09:00:47.000000000
+0200
+++ new/html5-parser-0.4.11/docs/index.rst 2023-04-12 07:07:46.000000000
+0200
@@ -1,7 +1,7 @@
html5-parser
================
-|pypi| |unix_build| |windows_build|
+|pypi| |unix_build|
A fast implementation of the `HTML 5 parsing spec
<https://www.w3.org/TR/html5/syntax.html#parsing>`_ for Python. Parsing is done
@@ -11,7 +11,7 @@
times that can be **a thirtieth** of the html5lib parse times. That is a
speedup of **30x**. This differs, for instance, from the gumbo python bindings,
where the initial parsing is done in C but the transformation into the final
-tree is done in python.
+tree is done in python.
Installation
@@ -65,7 +65,7 @@
To use html5-parser in your code, after installing it simply do:
.. code-block:: python
-
+
from html5_parser import parse
from lxml.etree import tostring
root = parse(some_html)
@@ -83,7 +83,7 @@
html5-parser has the ability to parse XHTML documents as well. It will
preserve namespace information even for namespaces not defined in the HTML 5
-spec. You can ask it to treat the input html as possibly XHTML by using the
+spec. You can ask it to treat the input html as possibly XHTML by using the
``maybe_xhtml`` parameter to the :func:`html5_parser.parse` function. For
example:
.. code-block:: html
@@ -158,7 +158,7 @@
===============================================================================
html5lib |lxml |yes |35
|
soup+html5lib |BeautifulSoup |yes |8
|
- soup+lxml.html |BeautifulSoup |no |2
|
+ soup+lxml.html |BeautifulSoup |no |2
|
There is further potential for speedup. Currently the gumbo subsystem uses
@@ -189,7 +189,7 @@
<html:p>xxx<ns0:svg
xmlns:ns0="http://www.w3.org/2000/svg"><ns0:image
xmlns:ns1="http://www.w3.org/1999/xlink" ns1:href="xxx"/></ns0:svg></html:p>
<html:p>yyy</html:p>
</html:body>
- </html:html>
+ </html:html>
With **html5-parser**:
@@ -221,7 +221,7 @@
<https://github.com/google/gumbo-parser>`__ which has undergone a Google
security review and been tested on 2.5 billion pages from the Google cache. In
addition, html5-parser passes (almost) all the tests from the html5lib test
-suite.
+suite.
Finally, html5-parser is compiled with ``-pedantic-errors -Wall -Werror`` and
the test suite, consisting of thousands of tests, is run using the address and
@@ -232,10 +232,6 @@
:target: https://pypi.python.org/pypi/html5-parser
:alt: Latest version released on PyPi
-.. |unix_build| image:: https://api.travis-ci.org/kovidgoyal/html5-parser.svg
- :target: http://travis-ci.org/kovidgoyal/html5-parser
- :alt: Build status of the master branch on Unix
-
-.. |windows_build| image::
https://ci.appveyor.com/api/projects/status/github/kovidgoyal/html5-parser?svg=true
- :target: https://ci.appveyor.com/project/kovidgoyal/html5-parser
- :alt: Build status of the master branch on Windows
+.. |unix_build| image::
https://github.com/kovidgoyal/html5-parser/workflows/CI/badge.svg
+ :target:
https://github.com/kovidgoyal/html5-parser/actions?query=workflow%3ACI%22
+ :alt: Build status of the master branch
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/gumbo/parser.c
new/html5-parser-0.4.11/gumbo/parser.c
--- old/html5-parser-0.4.10/gumbo/parser.c 2021-09-22 09:00:47.000000000
+0200
+++ new/html5-parser-0.4.11/gumbo/parser.c 2023-04-12 07:07:46.000000000
+0200
@@ -185,7 +185,6 @@
{"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
{"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
{"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
- {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML},
{"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
{"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
{"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
@@ -4304,6 +4303,12 @@
parser, token);
}
+static bool
+current_node_is_html_or_integration_point(GumboParser *parser) {
+ GumboNode *current_node = get_current_node(parser);
+ return current_node && (is_mathml_integration_point(current_node) ||
is_html_integration_point(current_node) ||
current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
+}
+
//
http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign
static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
switch (token->type) {
@@ -4349,18 +4354,8 @@
) {
/* Parse error */
parser_add_parse_error(parser, token);
-
- GumboNode *current_node;
- while ((current_node = get_current_node(parser)) && !(
- is_mathml_integration_point(current_node) ||
- is_html_integration_point(current_node) ||
- current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML
- )) {
- if (!pop_current_node(parser)) break;
- }
-
- parser->_parser_state->_reprocess_current_token = true;
- return false;
+ while(!current_node_is_html_or_integration_point(parser) &&
pop_current_node(parser)) {}
+ return handle_html_content(parser, token);
}
if (token->type == GUMBO_TOKEN_START_TAG) {
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/gumbo/util.h
new/html5-parser-0.4.11/gumbo/util.h
--- old/html5-parser-0.4.10/gumbo/util.h 2021-09-22 09:00:47.000000000
+0200
+++ new/html5-parser-0.4.11/gumbo/util.h 2023-04-12 07:07:46.000000000
+0200
@@ -71,9 +71,10 @@
return (c | 0x20) >= 'a' && (c | 0x20) <= 'z';
}
-#ifdef GUMBO_DEBUG
+#if defined(GUMBO_DEBUG)
// Debug wrapper for printf, to make it easier to turn off debugging info when
// required.
+#include <stdio.h>
#define gumbo_debug(...) fprintf(stderr, __VA_ARGS__)
#else
#define gumbo_debug(...)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/publish.py
new/html5-parser-0.4.11/publish.py
--- old/html5-parser-0.4.10/publish.py 2021-09-22 09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/publish.py 2023-04-12 07:07:46.000000000 +0200
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
@@ -15,7 +15,7 @@
sys.path.insert(0, '.')
if True:
- from build import version
+ from unix_build import version
del sys.path[0]
VERSION = '{}.{}.{}'.format(*version)
@@ -41,7 +41,7 @@
def build_release():
for rem in 'dist build'.split():
os.path.exists(rem) and shutil.rmtree(rem)
- run(sys.executable, 'setup.py', '-q', 'sdist')
+ run(sys.executable, '-m', 'build', '-s')
def sign_release():
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/setup.cfg
new/html5-parser-0.4.11/setup.cfg
--- old/html5-parser-0.4.10/setup.cfg 2021-09-22 09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/setup.cfg 2023-04-12 07:07:46.000000000 +0200
@@ -1,3 +1,35 @@
+[metadata]
+name = html5-parser
+version = 0.4.11
+author = Kovid Goyal
+author_email = [email protected]
+description = Fast C based HTML 5 parsing for python
+license =Apache 2.0
+url = https://html5-parser.readthedocs.io
+platforms = any
+classifiers =
+ Development Status :: 5 - Production/Stable
+ Intended Audience :: Developers
+ License :: OSI Approved :: Apache Software License
+ Natural Language :: English
+ Operating System :: OS Independent
+ Programming Language :: Python
+ Topic :: Text Processing
+ Topic :: Text Processing :: Markup
+ Topic :: Text Processing :: Markup :: HTML
+ Topic :: Text Processing :: Markup :: XML
+
+[options]
+package_dir =
+ =src
+packages = html5_parser
+install_requires =
+ chardet
+ lxml>=3.8.0
+
+[options.extras_require]
+soup = beautifulsoup4
+
[flake8]
max-line-length = 100
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/setup.py
new/html5-parser-0.4.11/setup.py
--- old/html5-parser-0.4.10/setup.py 2021-09-22 09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/setup.py 2023-04-12 07:07:46.000000000 +0200
@@ -4,18 +4,18 @@
import os
import sys
-from distutils.command.build import build as Build
from itertools import chain
-from setuptools import Extension, setup
+from setuptools import Extension, setup, Command
self_path = os.path.abspath(__file__)
base = os.path.dirname(self_path)
sys.path.insert(0, base)
if True:
- from build import (
- SRC_DIRS, find_c_files, include_dirs, libraries, library_dirs,
version, iswindows,
- TEST_COMMAND, add_python_path)
+ from unix_build import (
+ SRC_DIRS, TEST_COMMAND, add_python_path, find_c_files, include_dirs,
iswindows, libraries,
+ library_dirs, version
+ )
del sys.path[0]
src_files = tuple(chain(*map(lambda x: find_c_files(x)[0], SRC_DIRS)))
@@ -24,54 +24,40 @@
cargs.extend('-std=c99 -fvisibility=hidden'.split())
-class Test(Build):
+class Test(Command):
description = "run unit tests after in-place build"
+ user_options = []
+ user_options = [
+ ('test-name=', None, 'Specify the test to run.'),
+ ]
+ sub_commands = [
+ ('build', None),
+ ]
+
+ def initialize_options(self):
+ self.test_name = ''
+
+ def finalize_options(self):
+ pass
def run(self):
- Build.run(self)
- if self.dry_run:
- self.announce('skipping "test" (dry run)')
- return
+ for cmd_name in self.get_sub_commands():
+ self.run_command(cmd_name)
import subprocess
- env = add_python_path(os.environ.copy(), self.build_lib)
+ build = self.get_finalized_command('build')
+ env = add_python_path(os.environ.copy(), build.build_lib)
print('\nrunning tests...')
sys.stdout.flush()
- ret = subprocess.Popen([sys.executable] + TEST_COMMAND, env=env).wait()
+ cmd = [sys.executable] + TEST_COMMAND
+ if self.test_name:
+ cmd.append(self.test_name)
+ ret = subprocess.Popen(cmd, env=env).wait()
if ret != 0:
raise SystemExit(ret)
-CLASSIFIERS = """\
-Development Status :: 5 - Production/Stable
-Intended Audience :: Developers
-License :: OSI Approved :: Apache Software License
-Natural Language :: English
-Operating System :: OS Independent
-Programming Language :: Python
-Topic :: Text Processing
-Topic :: Text Processing :: Markup
-Topic :: Text Processing :: Markup :: HTML
-Topic :: Text Processing :: Markup :: XML
-"""
-
setup(
- name='html5-parser',
- version='{}.{}.{}'.format(*version),
- author='Kovid Goyal',
- author_email='[email protected]',
- description='Fast C based HTML 5 parsing for python',
- license='Apache 2.0',
- url='https://html5-parser.readthedocs.io',
- download_url=(
- "https://pypi.python.org/packages/source/m/html5-parser/"
- "html5-parser-{}.{}.{}.tar.gz".format(*version)),
- classifiers=[c for c in CLASSIFIERS.split("\n") if c],
- platforms=['any'],
- install_requires=['chardet', 'lxml>=3.8.0'],
- extras_require={'soup': 'beautifulsoup4'},
- packages=['html5_parser'],
- package_dir={'': 'src'},
cmdclass={'test': Test},
ext_modules=[
Extension(
@@ -80,4 +66,9 @@
libraries=libraries(),
library_dirs=library_dirs(),
extra_compile_args=cargs,
+ define_macros=[
+ ('MAJOR', str(version.major)),
+ ('MINOR', str(version.minor)),
+ ('PATCH', str(version.patch))
+ ],
sources=list(map(str, src_files)))])
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/src/html5_parser/soup.py
new/html5-parser-0.4.11/src/html5_parser/soup.py
--- old/html5-parser-0.4.10/src/html5_parser/soup.py 2021-09-22
09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/src/html5_parser/soup.py 2023-04-12
07:07:46.000000000 +0200
@@ -127,7 +127,7 @@
def parse(utf8_data, stack_size=16 * 1024, keep_doctype=False,
return_root=True):
- from . import html_parser
+ from html5_parser import html_parser
bs, soup, new_tag, Comment, append, NavigableString = init_soup()
if not isinstance(utf8_data, bytes):
utf8_data = utf8_data.encode('utf-8')
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/src/python-wrapper.c
new/html5-parser-0.4.11/src/python-wrapper.c
--- old/html5-parser-0.4.10/src/python-wrapper.c 2021-09-22
09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/src/python-wrapper.c 2023-04-12
07:07:46.000000000 +0200
@@ -13,10 +13,6 @@
#include "as-libxml.h"
#include "as-python-tree.h"
-#define MAJOR 0
-#define MINOR 4
-#define PATCH 10
-
static char *NAME = "libxml2:xmlDoc";
static char *DESTRUCTOR = "destructor:xmlFreeDoc";
@@ -96,12 +92,6 @@
return NULL;
}
}
- if (fragment_namespace != GUMBO_NAMESPACE_HTML) {
- // causes infinite loops in gumbo, enable the non html fragment
context tests
- // in html5lib_adapter.py to trigger
- PyErr_SetString(PyExc_KeyError, "Fragment parsing with non-HTML
namespaces is not supported");
- return NULL;
- }
doc = parse_with_options(buffer, (size_t)sz, &opts, context,
fragment_namespace);
if (!doc) return NULL;
return encapsulate(doc);
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/test/html5lib_adapter.py
new/html5-parser-0.4.11/test/html5lib_adapter.py
--- old/html5-parser-0.4.10/test/html5lib_adapter.py 2021-09-22
09:00:47.000000000 +0200
+++ new/html5-parser-0.4.11/test/html5lib_adapter.py 2023-04-12
07:07:46.000000000 +0200
@@ -89,13 +89,16 @@
add(level, '<', ns, name, '>')
return ns + name
- def serialize_attr(name, val, level):
+ def serialize_attr_name(name):
ns = ''
if name.startswith('{'):
ns, name = name[1:].rpartition('}')[::2]
ns = NAMESPACE_PREFIXES.get(ns, ns)
+ return ns + name
+
+ def serialize_attr(name, val, level):
level += 2
- add(level, ns, name, '=', '"', val, '"')
+ add(level, serialize_attr_name(name), '=', '"', val, '"')
def serialize_text(text, level=0):
add((level + 2) if level else 1, '"', text, '"')
@@ -105,7 +108,7 @@
def serialize_node(node, level=1):
name = serialize_tag(node.tag, level)
- for attr in sorted(node.keys()):
+ for attr in sorted(node.keys(), key=serialize_attr_name):
serialize_attr(attr, node.get(attr), level)
if name == 'template':
level += 2
@@ -128,6 +131,8 @@
serialize_comment(node)
else:
serialize_node(node)
+ if node.tail:
+ serialize_text(node.tail)
else:
for c in root.itersiblings(preceding=True):
serialize_comment(c)
@@ -176,6 +181,10 @@
return (
'gumbo and html5lib differ on <menuitem> parsing'
' and I cannot be bothered to figure out who is right')
+ if 'search-element' in test_name:
+ return (
+ 'No idea what the <search> element is. In any case the tests
only differ in'
+ ' indentation, so skipping')
noscript = re.search(r'^\| +<noscript>$', expected, flags=re.MULTILINE)
if noscript is not None:
return '<noscript> is always parsed with scripting off by gumbo'
@@ -184,8 +193,6 @@
for line in errors:
if 'expected-doctype-name-but' in line or 'unknown-doctype' in
line:
return 'gumbo auto-corrects malformed doctypes'
- if fragment_context and ':' in fragment_context:
- return 'Fragment parsing with non HTML contexts not supported'
def implementation(self, fragment_context, html, expected, errors,
test_name):
if fragment_context:
@@ -195,15 +202,16 @@
raise unittest.SkipTest(bad)
root = parse(
- html, namespace_elements=True, sanitize_names=False,
fragment_context=fragment_context)
+ html, namespace_elements=True, sanitize_names=False,
+ fragment_context=fragment_context)
output = serialize_construction_output(root,
fragment_context=fragment_context)
+ from lxml.etree import tostring
- # html5lib doesn't yet support the template tag, but it appears in the
- # tests with the expectation that the template contents will be under
the
- # word 'contents', so we need to reformat that string a bit.
- # expected = reformatTemplateContents(expected)
-
- error_msg = '\n'.join(['\n\nInput:', html, '\nExpected:', expected,
'\nReceived:', output])
+ error_msg = '\n'.join([
+ '\n\nTest name:', test_name, '\nInput:', html, '\nExpected:',
expected,
+ '\nReceived:', output,
+ '\nOutput tree:', tostring(root, encoding='unicode'),
+ ])
self.ae(expected, output, error_msg + '\n')
# TODO: Check error messages, when there's full error support.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.10/unix_build.py
new/html5-parser-0.4.11/unix_build.py
--- old/html5-parser-0.4.10/unix_build.py 1970-01-01 01:00:00.000000000
+0100
+++ new/html5-parser-0.4.11/unix_build.py 2023-04-12 07:07:46.000000000
+0200
@@ -0,0 +1,305 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import (absolute_import, division, print_function, unicode_literals)
+
+import argparse
+import errno
+import glob
+import os
+import re
+import shlex
+import shutil
+import subprocess
+import sys
+import sysconfig
+from collections import namedtuple
+from copy import deepcopy
+from itertools import chain
+try:
+ import configparser
+except ImportError:
+ import ConfigParser as configparser
+
+self_path = os.path.abspath(__file__)
+base = os.path.dirname(self_path)
+build_dir = os.path.join(base, 'build', 'custom')
+freeze_dir = os.path.join(base, 'build', 'html5_parser')
+_plat = sys.platform.lower()
+isosx = 'darwin' in _plat
+iswindows = hasattr(sys, 'getwindowsversion')
+is_ci = os.environ.get('CI') == 'true'
+Env = namedtuple('Env', 'cc cflags ldflags linker debug cc_name cc_ver')
+PKGCONFIG = os.environ.get('PKGCONFIG_EXE', 'pkg-config')
+cfg = configparser.ConfigParser()
+cfg.read(os.path.join(base, 'setup.cfg'))
+version = namedtuple('Version', 'major minor patch')(
+ *map(int, cfg.get('metadata', 'version').split('.')))
+
+
+def safe_makedirs(path):
+ try:
+ os.makedirs(path)
+ except EnvironmentError as err:
+ if err.errno != errno.EEXIST:
+ raise
+
+
+def add_python_flags(env, return_libs=False):
+ env.cflags.extend('-I' + sysconfig.get_path(x) for x in 'include platinclude'.split())
+ libs = []
+ libs += sysconfig.get_config_var('LIBS').split()
+ libs += sysconfig.get_config_var('SYSLIBS').split()
+ fw = sysconfig.get_config_var('PYTHONFRAMEWORK')
+ if fw:
+ for var in 'data include stdlib'.split():
+ val = sysconfig.get_path(var)
+ if val and '/{}.framework'.format(fw) in val:
+ fdir = val[:val.index('/{}.framework'.format(fw))]
+ if os.path.isdir(os.path.join(fdir, '{}.framework'.format(fw))):
+ framework_dir = fdir
+ break
+ else:
+ raise SystemExit('Failed to find Python framework')
+ libs.append(os.path.join(framework_dir, sysconfig.get_config_var('LDLIBRARY')))
+ else:
+ libs += ['-L' + sysconfig.get_config_var('LIBDIR')]
+ libs += ['-lpython' + sysconfig.get_config_var('VERSION') + getattr(sys, 'abiflags', '')]
+ libs += sysconfig.get_config_var('LINKFORSHARED').split()
+ env.ldflags.extend(libs)
+ return libs if return_libs else env
+
+
+def pkg_config(pkg, *args):
+ try:
+ val = subprocess.check_output([PKGCONFIG, pkg] + list(args)).decode('utf-8')
+ except EnvironmentError as err:
+ if err.errno == errno.ENOENT:
+ raise SystemExit('pkg-config is required to build html5-parser')
+ raise
+ return list(filter(None, map(str, shlex.split(val))))
+
+
+def env_var(which, default='', split=os.pathsep):
+ val = str(os.environ.get(which, default))
+ if not split:
+ return val
+ return list(filter(None, val.split(split)))
+
+
+def include_dirs():
+ if 'LIBXML_INCLUDE_DIRS' in os.environ:
+ return env_var('LIBXML_INCLUDE_DIRS')
+ return [x[2:] for x in pkg_config('libxml-2.0', '--cflags-only-I')]
+
+
+def libraries():
+ if iswindows:
+ return env_var('LIBXML_LIBS', 'libxml2')
+ if 'LIBXML_LIBS' in os.environ:
+ return env_var('LIBXML_LIBS')
+ return [x[2:] for x in pkg_config('libxml-2.0', '--libs-only-l')]
+
+
+def library_dirs():
+ if 'LIBXML_LIB_DIRS' in os.environ:
+ return env_var('LIBXML_LIB_DIRS')
+ return [x[2:] for x in pkg_config('libxml-2.0', '--libs-only-L')]
+
+
+def cc_version():
+ cc = os.environ.get('CC', 'gcc')
+ raw = subprocess.check_output([cc, '-dM', '-E', '-'], stdin=open(os.devnull, 'rb'))
+ m = re.search(br'^#define __clang__ 1', raw, flags=re.M)
+ cc_name = 'gcc' if m is None else 'clang'
+ ver = int(re.search(br'#define __GNUC__ (\d+)', raw, flags=re.M).group(1)), int(
+ re.search(br'#define __GNUC_MINOR__ (\d+)', raw, flags=re.M).group(1))
+ return cc, ver, cc_name
+
+
+def get_sanitize_args(cc, ccver):
+ sanitize_args = set()
+ if cc == 'gcc' and ccver < (4, 8):
+ return sanitize_args
+ sanitize_args.add('-fno-omit-frame-pointer')
+ sanitize_args.add('-fsanitize=address')
+ if (cc == 'gcc' and ccver >= (5, 0)) or (cc == 'clang' and not isosx):
+ # clang on macOS does not support -fsanitize=undefined
+ sanitize_args.add('-fsanitize=undefined')
+ # if cc == 'gcc' or (cc == 'clang' and ccver >= (4, 2)):
+ # sanitize_args.add('-fno-sanitize-recover=all')
+ return sanitize_args
+
+
+def init_env(debug=False, sanitize=False, native_optimizations=False, add_python=True):
+ native_optimizations = (native_optimizations and not sanitize and not debug)
+ cc, ccver, cc_name = cc_version()
+ stack_protector = '-fstack-protector'
+ if ccver >= (4, 9) and cc_name == 'gcc':
+ stack_protector += '-strong'
+ missing_braces = ''
+ if ccver < (5, 2) and cc_name == 'gcc':
+ missing_braces = '-Wno-missing-braces'
+ optimize = '-ggdb' if debug or sanitize else '-O3'
+ sanitize_args = get_sanitize_args(cc_name, ccver) if sanitize else set()
+ cflags = os.environ.get(
+ 'OVERRIDE_CFLAGS', (
+ '-Wextra -Wno-missing-field-initializers -Wall -std=c99 -fvisibility=hidden'
+ ' -pedantic-errors -Werror {} {} -D{}DEBUG -fwrapv {} {} -pipe {}').format(
+ optimize, ' '.join(sanitize_args), ('' if debug else 'N'), stack_protector,
+ missing_braces, '-march=native' if native_optimizations else ''))
+ libxml_cflags = pkg_config('libxml-2.0', '--cflags')
+ cflags = shlex.split(cflags) + libxml_cflags + shlex.split(sysconfig.get_config_var('CCSHARED'))
+ ldflags = os.environ.get(
+ 'OVERRIDE_LDFLAGS', '-Wall -shared ' + ' '.join(sanitize_args) + ('' if debug else ' -O3'))
+ libxml_ldflags = pkg_config('libxml-2.0', '--libs')
+ ldflags = shlex.split(ldflags) + libxml_ldflags
+ cflags += shlex.split(os.environ.get('CFLAGS', ''))
+ ldflags += shlex.split(os.environ.get('LDFLAGS', ''))
+ cflags.append('-pthread')
+ cflags.extend((
+ '-DMAJOR=' + str(version.major),
+ '-DMINOR=' + str(version.minor),
+ '-DPATCH=' + str(version.patch),
+ ))
+ ans = Env(cc, cflags, ldflags, cc, debug, cc_name, ccver)
+ return add_python_flags(ans) if add_python else ans
+
+
+def run_tool(cmd):
+ if hasattr(cmd, 'lower'):
+ cmd = shlex.split(cmd)
+ print(' '.join(cmd))
+ p = subprocess.Popen(cmd)
+ ret = p.wait()
+ if ret != 0:
+ raise SystemExit(ret)
+
+
+def newer(dest, *sources):
+ try:
+ dtime = os.path.getmtime(dest)
+ except EnvironmentError:
+ return True
+ for s in chain(sources, (self_path, )):
+ if os.path.getmtime(s) >= dtime:
+ return True
+ return False
+
+
+def find_c_files(src_dir):
+ ans, headers = [], []
+ for x in sorted(os.listdir(src_dir)):
+ ext = os.path.splitext(x)[1]
+ if ext == '.c' and not x.endswith('-check.c'):
+ ans.append(os.path.join(src_dir, x))
+ elif ext == '.h':
+ headers.append(os.path.join(src_dir, x))
+ ans.sort(key=os.path.getmtime, reverse=True)
+ return tuple(ans), tuple(headers)
+
+
+def build_obj(src, env, headers):
+ suffix = '-debug' if env.debug else ''
+ obj = os.path.join(build_dir, os.path.basename(src).rpartition('.')[0] + suffix + '.o')
+ if newer(obj, src, *headers):
+ cflags = list(env.cflags)
+ if src.endswith('char_ref.c'):
+ cflags.append('-Wno-unused-const-variable')
+ cmd = [env.cc] + cflags + ['-c', src] + ['-o', obj]
+ run_tool(cmd)
+ return obj
+
+
+TEST_EXE = os.path.join(build_dir, 'test')
+MEMLEAK_EXE = os.path.join(build_dir, 'mem-leak-check')
+if is_ci:
+ TEST_EXE = os.path.join(os.path.dirname(os.path.abspath(sys.executable)), 'test-html5-parser')
+SRC_DIRS = 'src gumbo'.split()
+MOD_EXT = '.so'
+
+
+def link(objects, env):
+ dest = os.path.join(build_dir, 'html_parser' + MOD_EXT)
+ o = ['-o', dest]
+ cmd = [env.linker] + objects + o + env.ldflags
+ if newer(dest, *objects):
+ run_tool(cmd)
+ return dest
+
+
+def build(args, build_leak_check=False):
+ debug_objects = []
+ debug_env = init_env(debug=True, sanitize=True)
+ for sdir in SRC_DIRS:
+ sources, headers = find_c_files(sdir)
+ if sdir == 'src':
+ headers += ('gumbo/gumbo.h', )
+ debug_objects.extend(build_obj(c, debug_env, headers) for c in sources)
+ link(debug_objects, debug_env)
+ ldflags = add_python_flags(deepcopy(debug_env), return_libs=True)
+ if newer(TEST_EXE, *debug_objects):
+ cmd = ([debug_env.cc] + debug_env.cflags + ['test.c'] + ['-o', TEST_EXE] + ldflags)
+ run_tool(cmd)
+ if build_leak_check and newer(MEMLEAK_EXE, 'mem-leak-check.c', *debug_objects):
+ cmd = ([debug_env.cc] + debug_env.cflags + ['mem-leak-check.c'] + [
+ '-o', MEMLEAK_EXE] + debug_objects + debug_env.ldflags)
+ cmd = [x for x in cmd if x not in {'-fPIC', '-pthread', '-shared'}]
+ run_tool(cmd)
+ for mod in glob.glob(os.path.join(build_dir, '*' + MOD_EXT)):
+ shutil.copy2(mod, freeze_dir)
+ for mod in glob.glob(os.path.join('src', 'html5_parser', '*.py')):
+ shutil.copy2(mod, freeze_dir)
+
+
+TEST_COMMAND = ['run_tests.py']
+
+
+def add_python_path(env, path):
+ pp = env.get('PYTHONPATH', '')
+ to_join = filter(None, [os.path.abspath(path), pp])
+ env['PYTHONPATH'] = os.pathsep.join(to_join)
+ return env
+
+
+def option_parser():
+ p = argparse.ArgumentParser()
+ p.add_argument(
+ 'action',
+ nargs='?',
+ default='test',
+ choices='build test try leak'.split(),
+ help='Action to perform (default is build)')
+ p.add_argument('rest', nargs='*')
+ return p
+
+
+def main():
+ args = option_parser().parse_args()
+ os.chdir(base)
+ safe_makedirs(build_dir), safe_makedirs(freeze_dir)
+ if args.action == 'build':
+ build(args)
+ elif args.action == 'test':
+ build(args)
+ os.environ['ASAN_OPTIONS'] = 'leak_check_at_exit=0'
+ add_python_path(os.environ, os.path.dirname(freeze_dir))
+ print('\nrunning tests...')
+ os.execlp(TEST_EXE, TEST_EXE, 'run_tests.py', *args.rest)
+ elif args.action == 'try':
+ build(args)
+ os.environ['ASAN_OPTIONS'] = 'leak_check_at_exit=0'
+ add_python_path(os.environ, os.path.dirname(freeze_dir))
+ os.execlp(
+ TEST_EXE, TEST_EXE, '-c', 'from html5_parser import *; ' + args.rest[0], *args.rest[1:])
+ elif args.action == 'leak':
+ build(args, build_leak_check=True)
+ os.environ['MEMLEAK_EXE'] = os.path.abspath(MEMLEAK_EXE)
+ os.environ['ASAN_OPTIONS'] = 'leak_check_at_exit=0'
+ add_python_path(os.environ, os.path.dirname(freeze_dir))
+ os.execlp(TEST_EXE, TEST_EXE, 'run_tests.py')
+
+
+if __name__ == '__main__':
+ main()