Script 'mail_helper' called by obssrc
Hello community,
here is the log from the commit of package python-html5-parser for
openSUSE:Factory checked in at 2021-10-15 23:04:10
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-html5-parser (Old)
and /work/SRC/openSUSE:Factory/.python-html5-parser.new.1890 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-html5-parser"
Fri Oct 15 23:04:10 2021 rev:11 rq:925386 version:0.4.10
Changes:
--------
--- /work/SRC/openSUSE:Factory/python-html5-parser/python-html5-parser.changes 2021-06-02 22:12:28.168119085 +0200
+++ /work/SRC/openSUSE:Factory/.python-html5-parser.new.1890/python-html5-parser.changes 2021-10-15 23:04:45.358131358 +0200
@@ -1,0 +2,8 @@
+Fri Oct 15 08:27:05 UTC 2021 - ecsos <[email protected]>
+
+- Update to 0.4.10
+ No changelog from upstream.
+ See instead here:
+ https://github.com/kovidgoyal/html5-parser/compare/v0.4.9...v0.4.10?diff=unified&name=v0.4.10
+
+-------------------------------------------------------------------
Old:
----
python-html5-parser-0.4.9.tar.gz
New:
----
python-html5-parser-0.4.10.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ python-html5-parser.spec ++++++
--- /var/tmp/diff_new_pack.TAqKae/_old 2021-10-15 23:04:45.798131672 +0200
+++ /var/tmp/diff_new_pack.TAqKae/_new 2021-10-15 23:04:45.802131674 +0200
@@ -18,7 +18,7 @@
%{?!python_module:%define python_module() python-%{**} python3-%{**}}
Name: python-html5-parser
-Version: 0.4.9
+Version: 0.4.10
Release: 0
Summary: C based HTML 5 parsing for Python
License: Apache-2.0
@@ -26,6 +26,7 @@
URL: https://github.com/kovidgoyal/html5-parser
Source: https://github.com/kovidgoyal/html5-parser/archive/v%{version}/%{name}-%{version}.tar.gz
BuildRequires: %{python_module beautifulsoup4}
+BuildRequires: %{python_module chardet}
BuildRequires: %{python_module devel}
BuildRequires: %{python_module lxml >= 3.8.0}
BuildRequires: %{python_module setuptools}
++++++ python-html5-parser-0.4.9.tar.gz -> python-html5-parser-0.4.10.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/html5-parser-0.4.9/.appveyor.yml new/html5-parser-0.4.10/.appveyor.yml
--- old/html5-parser-0.4.9/.appveyor.yml 2019-11-03 04:13:38.000000000 +0100
+++ new/html5-parser-0.4.10/.appveyor.yml 1970-01-01 01:00:00.000000000 +0100
@@ -1,27 +0,0 @@
-os: Visual Studio 2015
-
-platform:
- - x64
- - x86
-
-cache:
- - sw -> win-ci.py
-
-environment:
- matrix:
- - PY: 36
-
-
-build_script:
- - ps: |
- If ($env:Platform -Match "x86") {
- $env:VCVARS_PLATFORM="x86"
- } Else {
- $env:VCVARS_PLATFORM="amd64"
- }
- - call "%VS140COMNTOOLS%\..\..\VC\vcvarsall.bat" %VCVARS_PLATFORM%
- - C:/Python36-x64/python.exe win-ci.py install_deps
- - git clone --depth 1 "https://github.com/html5lib/html5lib-tests.git"
test/html5lib-tests
-
-test_script:
- - C:/Python36-x64/python.exe win-ci.py test
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/.github/workflows/ci.py
new/html5-parser-0.4.10/.github/workflows/ci.py
--- old/html5-parser-0.4.9/.github/workflows/ci.py 1970-01-01
01:00:00.000000000 +0100
+++ new/html5-parser-0.4.10/.github/workflows/ci.py 2021-09-22
09:00:47.000000000 +0200
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import absolute_import, division, print_function,
unicode_literals
+
+import os
+import shlex
+import subprocess
+import sys
+
+
+is_macos = 'darwin' in sys.platform.lower()
+
+
+def run(*a):
+ if len(a) == 1:
+ a = shlex.split(a[0])
+ ret = subprocess.Popen(a).wait()
+ if ret != 0:
+ print('Running:', a, 'failed', file=sys.stderr)
+ raise SystemExit(ret)
+
+
+def install_deps():
+ if is_macos:
+ pass
+ else:
+ run('sudo apt-get update')
+ run('sudo apt-get install -y libxml2-dev libxslt-dev')
+ deps = 'chardet lxml beautifulsoup4'.split()
+ if sys.version_info.major == 2:
+ deps.append('BeautifulSoup')
+ run(sys.executable, '-m', 'pip', 'install', '--no-binary', 'lxml', *deps)
+ run(sys.executable, '-c', 'from lxml import etree; print(etree)')
+
+
+def main():
+ which = sys.argv[-1]
+ if hasattr(sys, 'getwindowsversion'):
+ run(sys.executable, os.path.join(os.path.dirname(__file__),
'win-ci.py'), which)
+ return
+ if which == 'install':
+ install_deps()
+ elif which == 'test':
+ builder = os.environ['BUILDER']
+ run(sys.executable, builder, 'test')
+ if builder == 'build.py':
+ run(sys.executable, builder, 'leak')
+ else:
+ raise SystemExit('Unknown action:', which)
+
+
+if __name__ == '__main__':
+ main()
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/.github/workflows/ci.yml
new/html5-parser-0.4.10/.github/workflows/ci.yml
--- old/html5-parser-0.4.9/.github/workflows/ci.yml 1970-01-01
01:00:00.000000000 +0100
+++ new/html5-parser-0.4.10/.github/workflows/ci.yml 2021-09-22
09:00:47.000000000 +0200
@@ -0,0 +1,49 @@
+name: CI
+on: [push, pull_request]
+env:
+ CI: 'true'
+ LC_ALL: en_US.UTF-8
+ LANG: en_US.UTF-8
+
+jobs:
+ test:
+ name: Test on ${{ matrix.os }} (python=${{ matrix.pyver }} cc=${{
matrix.cc }} builder=${{ matrix.builder }})
+ runs-on: ${{ matrix.os }}
+ env:
+ CC: ${{ matrix.cc }}
+ BUILDER: ${{ matrix.builder }}
+ strategy:
+ matrix:
+ include:
+ - { pyver: 2.7, builder: build.py, os: ubuntu-latest, cc:
gcc }
+ - { pyver: 2.7, builder: build.py, os: ubuntu-latest, cc:
clang }
+ - { pyver: 3.6, builder: build.py, os: ubuntu-latest, cc:
gcc }
+ - { pyver: 3.6, builder: build.py, os: ubuntu-latest, cc:
clang }
+ - { pyver: 3.8, builder: setup.py, os: ubuntu-latest, cc:
gcc }
+
+ - { pyver: 3.8, builder: setup.py, os: macos-latest, cc:
clang }
+
+ - { pyver: 3.8, builder: setup.py, os: windows-latest, cc:
cl }
+
+ steps:
+ - name: Checkout source code
+ uses: actions/checkout@master
+ with:
+ fetch-depth: 10
+
+ - name: Set up Python ${{ matrix.pyver }}
+ uses: actions/setup-python@master
+ with:
+ python-version: ${{ matrix.pyver }}
+
+ - name: Install dependencies
+ run:
+ python .github/workflows/ci.py install
+
+ - name: Download html5lib tests
+ run:
+ git clone --depth 1
https://github.com/html5lib/html5lib-tests.git test/html5lib-tests
+
+ - name: Run tests
+ run:
+ python .github/workflows/ci.py test
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/.github/workflows/win-ci.py
new/html5-parser-0.4.10/.github/workflows/win-ci.py
--- old/html5-parser-0.4.9/.github/workflows/win-ci.py 1970-01-01
01:00:00.000000000 +0100
+++ new/html5-parser-0.4.10/.github/workflows/win-ci.py 2021-09-22
09:00:47.000000000 +0200
@@ -0,0 +1,307 @@
+#!/usr/bin/env python3
+# vim:fileencoding=utf-8
+# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
+
+from __future__ import print_function
+
+import errno
+import glob
+import io
+import os
+import pipes
+import shlex
+import shutil
+import subprocess
+import sys
+import tarfile
+import time
+
+ZLIB = "http://zlib.net/zlib-{}.tar.xz".format("1.2.11")
+LIBXML2 = "ftp://xmlsoft.org/libxml2/libxml2-{}.tar.gz".format('2.9.4')
+LIBXSLT = "ftp://xmlsoft.org/libxml2/libxslt-{}.tar.gz".format('1.1.28')
+LXML =
"https://files.pythonhosted.org/packages/c5/2f/a0d8aa3eee6d53d5723d89e1fc32eee11e76801b424e30b55c7aa6302b01/lxml-4.6.1.tar.gz"
# noqa
+SW = os.path.abspath('sw')
+PYTHON = os.path.abspath(sys.executable)
+os.environ['SW'] = SW
+os.environ['PYTHONPATH'] = os.path.join(SW, r'python\Lib\site-packages')
+plat = 'amd64' if sys.maxsize > 2**32 else 'x86'
+
+
+def printf(*a, **k):
+ print(*a, **k)
+ sys.stdout.flush()
+
+
+def walk(path='.'):
+ for dirpath, dirnames, filenames in os.walk(path):
+ for f in filenames:
+ yield os.path.join(dirpath, f)
+
+
+def download_file(url):
+ for i in range(5):
+ try:
+ printf('Downloading', url)
+ try:
+ return subprocess.check_output(['curl.exe', '-fSL', url])
+ except FileNotFoundError:
+ try:
+ from urllib.request import urlopen
+ except ImportError:
+ from urllib import urlopen
+ return urlopen(url).read()
+ except subprocess.CalledProcessError:
+ time.sleep(1)
+ raise SystemExit('Failed to download: {}'.format(url))
+
+
+def split(x):
+ x = x.replace('\\', '\\\\')
+ return shlex.split(x)
+
+
+def run(*args, env=None, cwd=None):
+ if len(args) == 1 and isinstance(args[0], type('')):
+ cmd = split(args[0])
+ else:
+ cmd = args
+ printf(' '.join(pipes.quote(x) for x in cmd))
+ sys.stdout.flush()
+ if env:
+ printf('Using modified env:', env)
+ e = os.environ.copy()
+ e.update(env)
+ env = e
+ try:
+ p = subprocess.Popen(cmd, cwd=cwd, env=env)
+ except EnvironmentError as err:
+ if err.errno == errno.ENOENT:
+ raise SystemExit('Could not find the program: %s' % cmd[0])
+ raise
+ if p.wait() != 0:
+ raise SystemExit(p.returncode)
+
+
+def distutils_vcvars():
+ from distutils.msvc9compiler import find_vcvarsall, get_build_version
+ return find_vcvarsall(get_build_version())
+
+
+def remove_dups(variable):
+ old_list = variable.split(os.pathsep)
+ new_list = []
+ for i in old_list:
+ if i not in new_list:
+ new_list.append(i)
+ return os.pathsep.join(new_list)
+
+
+def query_process(cmd):
+ if plat == 'amd64' and 'PROGRAMFILES(x86)' not in os.environ:
+ os.environ['PROGRAMFILES(x86)'] = os.environ['PROGRAMFILES'] + ' (x86)'
+ result = {}
+ popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ try:
+ stdout, stderr = popen.communicate()
+ if popen.wait() != 0:
+ raise RuntimeError(stderr.decode("mbcs"))
+
+ stdout = stdout.decode("mbcs")
+ for line in stdout.splitlines():
+ if '=' not in line:
+ continue
+ line = line.strip()
+ key, value = line.split('=', 1)
+ key = key.lower()
+ if key == 'path':
+ if value.endswith(os.pathsep):
+ value = value[:-1]
+ value = remove_dups(value)
+ result[key] = value
+
+ finally:
+ popen.stdout.close()
+ popen.stderr.close()
+ return result
+
+
+def query_vcvarsall():
+ vcvarsall = distutils_vcvars()
+ return query_process('"%s" %s & set' % (vcvarsall, plat))
+
+
+def download_and_extract(url):
+ raw = io.BytesIO(download_file(url))
+ with tarfile.open(fileobj=raw, mode='r:*') as f:
+ f.extractall()
+ for x in os.listdir('.'):
+ if os.path.isdir(x):
+ os.chdir(x)
+ return
+
+
+def ensure_dir(path):
+ try:
+ os.makedirs(path)
+ except EnvironmentError as err:
+ if err.errno != errno.EEXIST:
+ raise
+
+
+def replace_in_file(path, old, new, missing_ok=False):
+ if isinstance(old, type('')):
+ old = old.encode('utf-8')
+ if isinstance(new, type('')):
+ new = new.encode('utf-8')
+ with open(path, 'r+b') as f:
+ raw = f.read()
+ if isinstance(old, bytes):
+ nraw = raw.replace(old, new)
+ else:
+ nraw = old.sub(new, raw)
+ if raw == nraw and not missing_ok:
+ raise ValueError('Failed (pattern not found) to patch: ' + path)
+ f.seek(0), f.truncate()
+ f.write(nraw)
+
+
+def copy_headers(pattern, destdir='include'):
+ dest = os.path.join(SW, destdir)
+ ensure_dir(dest)
+ files = glob.glob(pattern)
+ for f in files:
+ dst = os.path.join(dest, os.path.basename(f))
+ if os.path.isdir(f):
+ shutil.copytree(f, dst)
+ else:
+ shutil.copy2(f, dst)
+
+
+def install_binaries(pattern, destdir='lib', fname_map=os.path.basename):
+ dest = os.path.join(SW, destdir)
+ ensure_dir(dest)
+ files = glob.glob(pattern)
+ files.sort(key=len, reverse=True)
+ if not files:
+ raise ValueError('The pattern %s did not match any actual files' %
pattern)
+ for f in files:
+ dst = os.path.join(dest, fname_map(f))
+ shutil.copy(f, dst)
+ os.chmod(dst, 0o755)
+ if os.path.exists(f + '.manifest'):
+ shutil.copy(f + '.manifest', dst + '.manifest')
+
+
+def install_tree(src, dest_parent='include', ignore=None):
+ dest_parent = os.path.join(SW, dest_parent)
+ dst = os.path.join(dest_parent, os.path.basename(src))
+ if os.path.exists(dst):
+ shutil.rmtree(dst)
+ shutil.copytree(src, dst, symlinks=True, ignore=ignore)
+ return dst
+
+
+def pure_python():
+ run(PYTHON, '-m', 'pip', 'install', 'chardet', 'bs4', '--prefix',
os.path.join(SW, 'python'))
+ run(PYTHON, '-c', 'import bs4; print(bs4)')
+
+
+def zlib():
+ run('nmake -f win32/Makefile.msc')
+ install_binaries('zlib1.dll*', 'bin')
+ install_binaries('zlib.lib'), install_binaries('zdll.*')
+ copy_headers('zconf.h'), copy_headers('zlib.h')
+
+
+def libxml2():
+ run(
+ *(
+ 'cscript.exe configure.js include={0}/include lib={0}/lib
prefix={0} zlib=yes iconv=no'.
+ format(SW.replace(os.sep, '/')).split()),
+ cwd='win32')
+ run('nmake /f Makefile.msvc', cwd='win32')
+ install_tree('include/libxml', 'include/libxml2')
+ for f in walk('.'):
+ if f.endswith('.dll'):
+ install_binaries(f, 'bin')
+ elif f.endswith('.lib'):
+ install_binaries(f)
+
+
+def libxslt():
+ run(
+ *(
+ 'cscript.exe configure.js include={0}/include
include={0}/include/libxml2 lib={0}/lib '
+ 'prefix={0} zlib=yes iconv=no'.format(SW.replace(os.sep,
'/')).split()),
+ cwd='win32')
+ replace_in_file('libxslt/win32config.h', '#define snprintf _snprintf', '')
+ for f in walk('.'):
+ if os.path.basename(f).startswith('Makefile'):
+ replace_in_file(f, '/OPT:NOWIN98', '', missing_ok=True)
+ run('nmake /f Makefile.msvc', cwd='win32')
+ install_tree('libxslt', 'include')
+ install_tree('libexslt', 'include')
+ for f in walk('.'):
+ if f.endswith('.dll'):
+ install_binaries(f, 'bin')
+ elif f.endswith('.lib'):
+ install_binaries(f)
+
+
+def lxml():
+ replace_in_file('setupinfo.py', ", 'iconv'", '')
+ run(
+ PYTHON,
+ *(
+ 'setup.py build_ext -I {0}/include;{0}/include/libxml2 -L
{0}/lib'.format(
+ SW.replace(os.sep, '/')).split()))
+ run(PYTHON, 'setup.py', 'install', '--prefix', os.path.join(SW, 'python'))
+ package = glob.glob(os.path.join(SW, 'python', 'lib', 'site-packages',
'lxml-*.egg', 'lxml'))[0]
+ os.rename(package, os.path.join(SW, 'python', 'lib', 'site-packages',
'lxml'))
+
+
+def install_deps():
+ env = query_vcvarsall()
+ os.environ.update(env)
+ print(PYTHON)
+ for x in 'build lib bin include python/Lib/site-packages'.split():
+ ensure_dir(os.path.join(SW, x))
+ os.chdir(os.path.join(SW, 'build'))
+ base = os.getcwd()
+ pure_python()
+ for name in 'zlib libxml2 libxslt lxml'.split():
+ os.chdir(base)
+ if os.path.exists(name):
+ continue
+ os.mkdir(name), os.chdir(name)
+ try:
+ download_and_extract(globals()[name.upper()])
+ globals()[name]()
+ except Exception:
+ os.chdir(base)
+ shutil.rmtree(name)
+ raise
+
+
+def build():
+ env = query_vcvarsall()
+ os.environ.update(env)
+ os.environ.update(dict(
+ LIBXML_INCLUDE_DIRS=r'{0}\include;{0}\include\libxml2'.format(SW),
+ LIBXML_LIB_DIRS=r'{0}\lib'.format(SW),
+ HTML5_PARSER_DLL_DIR=os.path.join(SW, 'bin'),
+ ))
+ print('Using PYTHONPATH:', os.environ['PYTHONPATH'])
+ run(PYTHON, 'setup.py', 'test')
+
+
+def main():
+ if sys.argv[-1] == 'install':
+ install_deps()
+ else:
+ build()
+
+
+if __name__ == '__main__':
+ main()
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/.travis.yml
new/html5-parser-0.4.10/.travis.yml
--- old/html5-parser-0.4.9/.travis.yml 2019-11-03 04:13:38.000000000 +0100
+++ new/html5-parser-0.4.10/.travis.yml 1970-01-01 01:00:00.000000000 +0100
@@ -1,75 +0,0 @@
-env:
- global:
- - PYTHONHASHSEED=random
-
-matrix:
- include:
- - os: linux
- language: python
- python: 2.7
- env: BUILDER=build.py CC=gcc PYTHON=python
- group: beta
- dist: trusty
- sudo: false
- addons:
- apt:
- packages:
- - libxml2-dev
- - os: linux
- language: python
- python: 2.7
- env: BUILDER=build.py CC=clang PYTHON=python
LSAN_OPTIONS=verbosity=1:log_threads=1
- group: beta
- dist: trusty
- # See https://github.com/travis-ci/travis-ci/issues/9033
- sudo: required
- addons:
- apt:
- packages:
- - libxml2-dev
- - os: linux
- language: python
- python: 2.7
- env: BUILDER=setup.py PYTHON=python
- group: beta
- dist: trusty
- sudo: false
- addons:
- apt:
- packages:
- - libxml2-dev
- - os: linux
- language: python
- python: 3.6
- env: BUILDER=setup.py PYTHON=python
- group: beta
- dist: trusty
- sudo: false
- addons:
- apt:
- packages:
- - libxml2-dev
- - os: osx
- language: generic
- env: BUILDER=setup.py PYTHON=python3
-
-install: |
- set -e
- if [[ "$TRAVIS_OS_NAME" == 'osx' ]]; then
- brew update;
- brew upgrade python;
- python3 --version
- pip3 install --no-binary lxml chardet lxml beautifulsoup4
- else
- PLIB=$(ldd `which python` | grep libpython | cut -d ' ' -f 3)
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:`dirname $PLIB`
- pip install --no-binary lxml chardet lxml beautifulsoup4
- if [[ $TRAVIS_PYTHON_VERSION == 2.* ]]; then pip install
BeautifulSoup; fi
- fi
- $PYTHON -c "from lxml import etree; print(etree)"
- git clone --depth 1 "https://github.com/html5lib/html5lib-tests.git"
test/html5lib-tests
- set +e
-
-script:
- - $PYTHON $BUILDER test
- - if [[ $BUILDER == "build.py" ]]; then $PYTHON $BUILDER leak; fi
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/README.rst
new/html5-parser-0.4.10/README.rst
--- old/html5-parser-0.4.9/README.rst 2019-11-03 04:13:38.000000000 +0100
+++ new/html5-parser-0.4.10/README.rst 2021-09-22 09:00:47.000000000 +0200
@@ -1,7 +1,7 @@
html5-parser
================
-|pypi| |unix_build| |windows_build| |docs|
+|pypi| |build| |docs|
A *fast*, standards compliant, C based, HTML 5 parser for python. Over
**thirty**
times as fast as pure python based parsers, such as html5lib.
@@ -12,13 +12,9 @@
:target: https://pypi.python.org/pypi/html5-parser
:alt: Latest version released on PyPi
-.. |unix_build| image:: https://api.travis-ci.org/kovidgoyal/html5-parser.svg
- :target: http://travis-ci.org/kovidgoyal/html5-parser
- :alt: Build status of the master branch on Unix
-
-.. |windows_build| image::
https://ci.appveyor.com/api/projects/status/github/kovidgoyal/html5-parser?svg=true
- :target: https://ci.appveyor.com/project/kovidgoyal/html5-parser
- :alt: Build status of the master branch on Windows
+.. |build| image::
https://github.com/kovidgoyal/html5-parser/workflows/CI/badge.svg
+ :target:
https://github.com/kovidgoyal/html5-parser/actions?query=workflow%3ACI"
+ :alt: Build status of the master branch
.. |docs| image::
https://readthedocs.org/projects/html5-parser/badge/?version=latest
:target: https://html5-parser.readthedocs.io/en/latest/
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/build.py
new/html5-parser-0.4.10/build.py
--- old/html5-parser-0.4.9/build.py 2019-11-03 04:13:38.000000000 +0100
+++ new/html5-parser-0.4.10/build.py 2021-09-22 09:00:47.000000000 +0200
@@ -25,7 +25,7 @@
_plat = sys.platform.lower()
isosx = 'darwin' in _plat
iswindows = hasattr(sys, 'getwindowsversion')
-is_travis = os.environ.get('TRAVIS') == 'true'
+is_ci = os.environ.get('CI') == 'true'
Env = namedtuple('Env', 'cc cflags ldflags linker debug cc_name cc_ver')
PKGCONFIG = os.environ.get('PKGCONFIG_EXE', 'pkg-config')
with open(os.path.join(base, 'src/python-wrapper.c'), 'rb') as f:
@@ -209,7 +209,7 @@
TEST_EXE = os.path.join(build_dir, 'test')
MEMLEAK_EXE = os.path.join(build_dir, 'mem-leak-check')
-if is_travis:
+if is_ci:
TEST_EXE = os.path.join(os.path.dirname(os.path.abspath(sys.executable)),
'test-html5-parser')
SRC_DIRS = 'src gumbo'.split()
MOD_EXT = '.so'
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/gumbo/error.c
new/html5-parser-0.4.10/gumbo/error.c
--- old/html5-parser-0.4.9/gumbo/error.c 2019-11-03 04:13:38.000000000
+0100
+++ new/html5-parser-0.4.10/gumbo/error.c 2021-09-22 09:00:47.000000000
+0200
@@ -78,8 +78,8 @@
if (i) {
print_message(output, ", ");
}
- GumboTag tag = (GumboTag) error->tag_stack.data[i];
- print_message(output, gumbo_normalized_tagname(tag));
+ uintptr_t tag = (uintptr_t) error->tag_stack.data[i];
+ print_message(output, gumbo_normalized_tagname((GumboTag)tag));
}
gumbo_string_buffer_append_codepoint('.', output);
}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/gumbo/parser.c
new/html5-parser-0.4.10/gumbo/parser.c
--- old/html5-parser-0.4.9/gumbo/parser.c 2019-11-03 04:13:38.000000000
+0100
+++ new/html5-parser-0.4.10/gumbo/parser.c 2021-09-22 09:00:47.000000000
+0200
@@ -645,7 +645,7 @@
if (template_insertion_modes->length == 0) {
return GUMBO_INSERTION_MODE_INITIAL;
}
- return (GumboInsertionMode)
+ return (GumboInsertionMode)(uintptr_t)
template_insertion_modes->data[(template_insertion_modes->length - 1)];
}
@@ -4344,27 +4344,23 @@
(tag_is(token, kStartTag, GUMBO_TAG_FONT) &&
(token_has_attribute(token, "color") ||
token_has_attribute(token, "face") ||
- token_has_attribute(token, "size")))) {
+ token_has_attribute(token, "size"))) ||
+ (tag_in(token, kEndTag, (gumbo_tagset){TAG(P), TAG(BR)}))
+ ) {
/* Parse error */
parser_add_parse_error(parser, token);
- /*
- * Fragment case: If the parser was originally created for the HTML
- * fragment parsing algorithm, then act as described in the "any other
- * start tag" entry below.
- */
- if (!is_fragment_parser(parser)) {
- do {
- pop_current_node(parser);
- } while (!(is_mathml_integration_point(get_current_node(parser)) ||
- is_html_integration_point(get_current_node(parser)) ||
- get_current_node(parser)->v.element.tag_namespace ==
- GUMBO_NAMESPACE_HTML));
- parser->_parser_state->_reprocess_current_token = true;
- return false;
+ GumboNode *current_node;
+ while ((current_node = get_current_node(parser)) && !(
+ is_mathml_integration_point(current_node) ||
+ is_html_integration_point(current_node) ||
+ current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML
+ )) {
+ if (!pop_current_node(parser)) break;
}
- assert(token->type == GUMBO_TOKEN_START_TAG);
+ parser->_parser_state->_reprocess_current_token = true;
+ return false;
}
if (token->type == GUMBO_TOKEN_START_TAG) {
@@ -4647,7 +4643,7 @@
// we exclude the <html> tag as it causes crashes in the
as-lxml
// module, see
https://github.com/kovidgoyal/html5-parser/issues/17
// I dont have the time to track down the root cause,
probably something
- // related to resuing the same string segments for the tag
name and the
+ // related to reusing the same string segments for the tag
name and the
// special cloning/modification that happens to HTML tags.
Since HTML tags
// are treated specially anyway, there is no harm in
excluding them.
TAG(HTML)})) {
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/run_tests.py
new/html5-parser-0.4.10/run_tests.py
--- old/html5-parser-0.4.9/run_tests.py 2019-11-03 04:13:38.000000000 +0100
+++ new/html5-parser-0.4.10/run_tests.py 2021-09-22 09:00:47.000000000
+0200
@@ -10,6 +10,12 @@
import sys
import unittest
+if 'HTML5_PARSER_DLL_DIR' in os.environ:
+ sys.save_dll_dir = os.add_dll_directory(os.environ['HTML5_PARSER_DLL_DIR'])
+ print('Added DLL directory', sys.save_dll_dir, 'with contents:',
+ os.listdir(os.environ['HTML5_PARSER_DLL_DIR']))
+ print('Current sys.path:', sys.path)
+
self_path = os.path.abspath(__file__)
base = os.path.dirname(self_path)
html5lib_tests_path = os.path.join(base, 'test', 'html5lib-tests')
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/src/as-libxml.c
new/html5-parser-0.4.10/src/as-libxml.c
--- old/html5-parser-0.4.9/src/as-libxml.c 2019-11-03 04:13:38.000000000
+0100
+++ new/html5-parser-0.4.10/src/as-libxml.c 2021-09-22 09:00:47.000000000
+0200
@@ -215,7 +215,7 @@
if (UNLIKELY(elem->tag >= GUMBO_TAG_UNKNOWN)) {
gumbo_tag_from_original_text(&(elem->original_tag));
- uint8_t tag_sz = MIN(sizeof(buf) - 1, elem->original_tag.length);
+ uint8_t tag_sz = (uint8_t)(MIN(sizeof(buf) - 1,
elem->original_tag.length));
memcpy(buf, elem->original_tag.data, tag_sz);
tag = buf;
if (pd->maybe_xhtml) {
@@ -223,7 +223,7 @@
nsprefix = check_for_namespace_prefix(&temp, &tag_sz);
tag = temp;
}
- tag_sz = pd->sanitize_names ? sanitize_name((char*)tag) : strlen(tag);
+ tag_sz = (uint8_t)(pd->sanitize_names ? sanitize_name((char*)tag) :
strlen(tag));
tag_name = xmlDictLookup(doc->dict, BAD_CAST tag, tag_sz);
} else if (UNLIKELY(elem->tag_namespace == GUMBO_NAMESPACE_SVG)) {
gumbo_tag_from_original_text(&(elem->original_tag));
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/src/html5_parser/__init__.py
new/html5-parser-0.4.10/src/html5_parser/__init__.py
--- old/html5-parser-0.4.9/src/html5_parser/__init__.py 2019-11-03
04:13:38.000000000 +0100
+++ new/html5-parser-0.4.10/src/html5_parser/__init__.py 2021-09-22
09:00:47.000000000 +0200
@@ -115,7 +115,7 @@
return {'lxml.etree': 'lxml', 'etree': 'stdlib_etree'}.get(x, x)
-NAMESPACE_SUPPORTING_BUILDERS = frozenset('lxml stdlib_etree dom'.split())
+NAMESPACE_SUPPORTING_BUILDERS = frozenset('lxml stdlib_etree dom
lxml_html'.split())
def parse(
@@ -129,7 +129,8 @@
return_root=True,
line_number_attr=None,
sanitize_names=True,
- stack_size=16 * 1024
+ stack_size=16 * 1024,
+ fragment_context=None,
):
'''
Parse the specified :attr:`html` and return the parsed representation.
@@ -145,7 +146,9 @@
:param treebuilder:
The type of tree to return. Note that only the lxml treebuilder is
fast, as all
other treebuilders are implemented in python, not C. Supported values
are:
- * `lxml <http://lxml.de>`_ -- the default, and fastest
+ * `lxml <https://lxml.de>`_ -- the default, and fastest
+ * `lxml_html <https://lxml.de>`_ -- tree of lxml.html.HtmlElement,
same speed as lxml
+ (new in *0.4.10*)
* etree (the python stdlib :mod:`xml.etree.ElementTree`)
* dom (the python stdlib :mod:`xml.dom.minidom`)
* `soup <https://www.crummy.com/software/BeautifulSoup>`_ --
BeautifulSoup,
@@ -161,7 +164,8 @@
suitable for XHTML. In particular handles self-closed CDATA elements.
So a ``<title/>`` or ``<style/>`` in the HTML will not completely break
parsing. Also preserves namespaced tags and attributes even for
namespaces
- not supported by HTML 5 (this works only with the ``lxml``
treebuilder).
+ not supported by HTML 5 (this works only with the ``lxml`` and
``lxml_html``
+ treebuilders).
Note that setting this also implicitly sets ``namespace_elements``.
:param return_root: If True, return the root node of the document,
otherwise
@@ -181,6 +185,10 @@
default is sufficient to avoid memory allocations for all but the
largest documents.
+ :param fragment_context: the tag name under which to parse the HTML when
the html
+ is a fragment. Common choices are ``div`` or ``body``. To use SVG or
MATHML tags
+ prefix the tag name with ``svg:`` or ``math:`` respectively. Note that
currently
+ using a non-HTML fragment_context is not supported. New in *0.4.10*.
'''
data = as_utf8(html or b'', transport_encoding, fallback_encoding)
treebuilder = normalize_treebuilder(treebuilder)
@@ -190,6 +198,15 @@
data, return_root=return_root, keep_doctype=keep_doctype,
stack_size=stack_size)
if treebuilder not in NAMESPACE_SUPPORTING_BUILDERS:
namespace_elements = False
+ fragment_namespace = html_parser.GUMBO_NAMESPACE_HTML
+ if fragment_context:
+ fragment_context = fragment_context.lower()
+ if ':' in fragment_context:
+ ns, fragment_context = fragment_context.split(':', 1)
+ fragment_namespace = {
+ 'svg': html_parser.GUMBO_NAMESPACE_SVG, 'math':
html_parser.GUMBO_NAMESPACE_MATHML,
+ 'html': html_parser.GUMBO_NAMESPACE_HTML
+ }[ns]
capsule = html_parser.parse(
data,
@@ -198,10 +215,17 @@
maybe_xhtml=maybe_xhtml,
line_number_attr=line_number_attr,
sanitize_names=sanitize_names,
- stack_size=stack_size)
-
- ans = etree.adopt_external_document(capsule)
- if treebuilder == 'lxml':
+ stack_size=stack_size,
+ fragment_context=fragment_context,
+ fragment_namespace=fragment_namespace,
+ )
+
+ interpreter = None
+ if treebuilder == 'lxml_html':
+ from lxml.html import HTMLParser
+ interpreter = HTMLParser()
+ ans = etree.adopt_external_document(capsule, parser=interpreter)
+ if treebuilder in ('lxml', 'lxml_html'):
return ans.getroot() if return_root else ans
m = importlib.import_module('html5_parser.' + treebuilder)
return m.adapt(ans, return_root=return_root)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/src/python-wrapper.c
new/html5-parser-0.4.10/src/python-wrapper.c
--- old/html5-parser-0.4.9/src/python-wrapper.c 2019-11-03 04:13:38.000000000
+0100
+++ new/html5-parser-0.4.10/src/python-wrapper.c 2021-09-22
09:00:47.000000000 +0200
@@ -15,7 +15,7 @@
#define MAJOR 0
#define MINOR 4
-#define PATCH 9
+#define PATCH 10
static char *NAME = "libxml2:xmlDoc";
static char *DESTRUCTOR = "destructor:xmlFreeDoc";
@@ -35,12 +35,12 @@
return doc;
}
-static inline libxml_doc*
-parse_with_options(const char* buffer, size_t buffer_length, Options *opts) {
+static libxml_doc*
+parse_with_options(const char* buffer, size_t buffer_length, Options *opts,
const GumboTag context, GumboNamespaceEnum context_namespace) {
GumboOutput *output = NULL;
libxml_doc* doc = NULL;
Py_BEGIN_ALLOW_THREADS;
- output = gumbo_parse_with_options(&(opts->gumbo_opts), buffer,
buffer_length);
+ output = gumbo_parse_fragment(&(opts->gumbo_opts), buffer, buffer_length,
context, context_namespace);
Py_END_ALLOW_THREADS;
if (output == NULL) PyErr_NoMemory();
else {
@@ -76,18 +76,33 @@
Options opts = {0};
opts.stack_size = 16 * 1024;
PyObject *kd = Py_True, *mx = Py_False, *ne = Py_False, *sn = Py_True;
+ char *fragment_context = NULL; Py_ssize_t fragment_context_sz = 0;
opts.gumbo_opts = kGumboDefaultOptions;
opts.gumbo_opts.max_errors = 0; // We discard errors since we are not
reporting them anyway
+ GumboNamespaceEnum fragment_namespace = GUMBO_NAMESPACE_HTML;
- static char *kwlist[] = {"data", "namespace_elements", "keep_doctype",
"maybe_xhtml", "line_number_attr", "sanitize_names", "stack_size", NULL};
+ static char *kwlist[] = {"data", "namespace_elements", "keep_doctype",
"maybe_xhtml", "line_number_attr", "sanitize_names", "stack_size",
"fragment_context", "fragment_namespace", NULL};
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "s#|OOOzOI", kwlist, &buffer,
&sz, &ne, &kd, &mx, &(opts.line_number_attr), &sn, &(opts.stack_size))) return
NULL;
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "s#|OOOzOIz#i", kwlist,
&buffer, &sz, &ne, &kd, &mx, &(opts.line_number_attr), &sn, &(opts.stack_size),
&fragment_context, &fragment_context_sz, &fragment_namespace)) return NULL;
opts.namespace_elements = PyObject_IsTrue(ne);
opts.keep_doctype = PyObject_IsTrue(kd);
opts.sanitize_names = PyObject_IsTrue(sn);
opts.gumbo_opts.use_xhtml_rules = PyObject_IsTrue(mx);
-
- doc = parse_with_options(buffer, (size_t)sz, &opts);
+ GumboTag context = GUMBO_TAG_LAST;
+ if (fragment_context && fragment_context_sz > 0) {
+ context = gumbo_tagn_enum(fragment_context, fragment_context_sz);
+ if (context == GUMBO_TAG_UNKNOWN) {
+ PyErr_Format(PyExc_KeyError, "Unknown fragment_context tag name:
%s", fragment_context);
+ return NULL;
+ }
+ }
+ if (fragment_namespace != GUMBO_NAMESPACE_HTML) {
+ // causes infinite loops in gumbo, enable the non html fragment
context tests
+ // in html5lib_adapter.py to trigger
+ PyErr_SetString(PyExc_KeyError, "Fragment parsing with non-HTML
namespaces is not supported");
+ return NULL;
+ }
+ doc = parse_with_options(buffer, (size_t)sz, &opts, context,
fragment_namespace);
if (!doc) return NULL;
return encapsulate(doc);
}
@@ -187,6 +202,9 @@
if (PyModule_AddIntMacro(m, MAJOR) != 0) INITERROR;
if (PyModule_AddIntMacro(m, MINOR) != 0) INITERROR;
if (PyModule_AddIntMacro(m, PATCH) != 0) INITERROR;
+ if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_HTML) != 0) INITERROR;
+ if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_SVG) != 0) INITERROR;
+ if (PyModule_AddIntMacro(m, GUMBO_NAMESPACE_MATHML) != 0) INITERROR;
if (PyModule_AddIntConstant(m, "LIBXML_VERSION", get_libxml_version()) !=
0) INITERROR;
known_tag_names = PyTuple_New(GUMBO_TAG_UNKNOWN);
if (known_tag_names == NULL) INITERROR;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/test/basic.py
new/html5-parser-0.4.10/test/basic.py
--- old/html5-parser-0.4.9/test/basic.py 2019-11-03 04:13:38.000000000
+0100
+++ new/html5-parser-0.4.10/test/basic.py 2021-09-22 09:00:47.000000000
+0200
@@ -94,3 +94,12 @@
self.ae(root[1][0].sourceline, 4)
self.ae(root[1][0][0].sourceline, 4)
self.ae(root[1][0][0].get('ln'), '4')
+
+ def test_lxml_html(self):
+ root = parse('<html><head><body><p><span>', treebuilder='lxml_html')
+ from lxml.html import HtmlElement
+ self.assertIsInstance(root, HtmlElement)
+
+ def test_fragment(self):
+ root = parse('<span>a</span>', fragment_context='div')
+ self.ae(root[0].tag, 'span')
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/test/html5lib_adapter.py
new/html5-parser-0.4.10/test/html5lib_adapter.py
--- old/html5-parser-0.4.9/test/html5lib_adapter.py 2019-11-03
04:13:38.000000000 +0100
+++ new/html5-parser-0.4.10/test/html5lib_adapter.py 2021-09-22
09:00:47.000000000 +0200
@@ -65,10 +65,10 @@
return {k: n(v) for k, v in data.items()}
-def serialize_construction_output(root):
+def serialize_construction_output(root, fragment_context):
tree = root.getroottree()
lines = []
- if tree.docinfo.doctype:
+ if tree.docinfo.doctype and not fragment_context:
di = tree.docinfo
if di.public_id or di.system_url:
d = '<!DOCTYPE {} "{}" "{}">'.format(di.root_name, di.public_id,
di.system_url)
@@ -97,12 +97,11 @@
level += 2
add(level, ns, name, '=', '"', val, '"')
- def serialize_text(text, level):
- level += 2
- add(level, '"', text, '"')
+ def serialize_text(text, level=0):
+ add((level + 2) if level else 1, '"', text, '"')
def serialize_comment(node, level=1):
- add(level, '<!-- ', node.text, ' -->')
+ add(level, '<!-- ', node.text or '', ' -->')
def serialize_node(node, level=1):
name = serialize_tag(node.tag, level)
@@ -121,11 +120,20 @@
if child.tail:
serialize_text(child.tail, level)
- for c in root.itersiblings(preceding=True):
- serialize_comment(c)
- serialize_node(root)
- for c in root.itersiblings():
- serialize_comment(c)
+ if fragment_context:
+ if root.text:
+ serialize_text(root.text)
+ for node in root.iterchildren():
+ if isinstance(node, _Comment):
+ serialize_comment(node)
+ else:
+ serialize_node(node)
+ else:
+ for c in root.itersiblings(preceding=True):
+ serialize_comment(c)
+ serialize_node(root)
+ for c in root.itersiblings():
+ serialize_comment(c)
output = '\n'.join(lines)
# gumbo does not fix single carriage returns generated by entities and it
# does not lowercase unknown tags
@@ -159,7 +167,7 @@
class ConstructionTests(BaseTest):
@classmethod
- def check_test(cls, inner_html, html, expected, errors, test_name):
+ def check_test(cls, fragment_context, html, expected, errors, test_name):
if test_name == 'isindex' or html == '<!doctype html><isindex type="hidden">':
return (
'gumbo and html5lib differ on <isindex> parsing'
@@ -176,17 +184,19 @@
for line in errors:
if 'expected-doctype-name-but' in line or 'unknown-doctype' in
line:
return 'gumbo auto-corrects malformed doctypes'
- if inner_html:
- return 'TODO: Implement fragment parsing'
+ if fragment_context and ':' in fragment_context:
+ return 'Fragment parsing with non HTML contexts not supported'
- def implementation(self, inner_html, html, expected, errors, test_name):
- html = inner_html or html
- bad = self.check_test(inner_html, html, expected, errors, test_name)
+ def implementation(self, fragment_context, html, expected, errors,
test_name):
+ if fragment_context:
+ fragment_context = fragment_context.replace(' ', ':')
+ bad = self.check_test(fragment_context, html, expected, errors,
test_name)
if bad is not None:
raise unittest.SkipTest(bad)
- root = parse(html, namespace_elements=True, sanitize_names=False)
- output = serialize_construction_output(root)
+ root = parse(
+ html, namespace_elements=True, sanitize_names=False,
fragment_context=fragment_context)
+ output = serialize_construction_output(root,
fragment_context=fragment_context)
# html5lib doesn't yet support the template tag, but it appears in the
# tests with the expectation that the template contents will be under
the
@@ -200,7 +210,7 @@
class EncodingTests(BaseTest):
- def implementation(self, inner_html, html, expected, errors, test_name):
+ def implementation(self, fragment_context, html, expected, errors,
test_name):
if '<!-- Starts with UTF-8 BOM -->' in html:
raw = b'\xef\xbb\xbf' + html[3:].encode('ascii')
self.assertIs(check_bom(raw), codecs.BOM_UTF8)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/html5-parser-0.4.9/win-ci.py
new/html5-parser-0.4.10/win-ci.py
--- old/html5-parser-0.4.9/win-ci.py 2019-11-03 04:13:38.000000000 +0100
+++ new/html5-parser-0.4.10/win-ci.py 1970-01-01 01:00:00.000000000 +0100
@@ -1,255 +0,0 @@
-#!/usr/bin/env python3
-# vim:fileencoding=utf-8
-# License: Apache 2.0 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
-
-from __future__ import print_function
-
-import errno
-import glob
-import io
-import os
-import pipes
-import shlex
-import shutil
-import subprocess
-import sys
-import tarfile
-import time
-
-ZLIB = "http://zlib.net/zlib-{}.tar.xz".format("1.2.11")
-LIBXML2 = "ftp://xmlsoft.org/libxml2/libxml2-{}.tar.gz".format('2.9.4')
-LIBXSLT = "ftp://xmlsoft.org/libxml2/libxslt-{}.tar.gz".format('1.1.28')
-LXML = "https://pypi.python.org/packages/20/b3/9f245de14b7696e2d2a386c0b09032a2ff6625270761d6543827e667d8de/lxml-3.8.0.tar.gz"  # noqa
-SW = os.path.abspath('sw')
-if 'PY' in os.environ and 'Platform' in os.environ:
- PYTHON =
os.path.expandvars('C:\\Python%PY%-%Platform%\\python.exe').replace('-x86', '')
-else:
- PYTHON = sys.executable
-os.environ['SW'] = SW
-os.environ['PYTHONPATH'] =
os.path.expandvars('%SW%\\python\\Lib\\site-packages;%PYTHONPATH%')
-
-
-def printf(*a, **k):
- print(*a, **k)
- sys.stdout.flush()
-
-
-def walk(path='.'):
- for dirpath, dirnames, filenames in os.walk(path):
- for f in filenames:
- yield os.path.join(dirpath, f)
-
-
-def download_file(url):
- for i in range(5):
- try:
- printf('Downloading', url)
- try:
- return subprocess.check_output(['curl.exe', '-fSL', url])
- except FileNotFoundError:
- try:
- from urllib.request import urlopen
- except ImportError:
- from urllib import urlopen
- return urlopen(url).read()
- except subprocess.CalledProcessError:
- time.sleep(1)
- raise SystemExit('Failed to download: {}'.format(url))
-
-
-def split(x):
- x = x.replace('\\', '\\\\')
- return shlex.split(x)
-
-
-def run(*args, env=None, cwd=None):
- if len(args) == 1 and isinstance(args[0], type('')):
- cmd = split(args[0])
- else:
- cmd = args
- printf(' '.join(pipes.quote(x) for x in cmd))
- sys.stdout.flush()
- if env:
- printf('Using modified env:', env)
- e = os.environ.copy()
- e.update(env)
- env = e
- try:
- p = subprocess.Popen(cmd, cwd=cwd, env=env)
- except EnvironmentError as err:
- if err.errno == errno.ENOENT:
- raise SystemExit('Could not find the program: %s' % cmd[0])
- raise
- if p.wait() != 0:
- raise SystemExit(p.returncode)
-
-
-def download_and_extract(url):
- raw = io.BytesIO(download_file(url))
- with tarfile.open(fileobj=raw, mode='r:*') as f:
- f.extractall()
- for x in os.listdir('.'):
- if os.path.isdir(x):
- os.chdir(x)
- return
-
-
-def ensure_dir(path):
- try:
- os.makedirs(path)
- except EnvironmentError as err:
- if err.errno != errno.EEXIST:
- raise
-
-
-def replace_in_file(path, old, new, missing_ok=False):
- if isinstance(old, type('')):
- old = old.encode('utf-8')
- if isinstance(new, type('')):
- new = new.encode('utf-8')
- with open(path, 'r+b') as f:
- raw = f.read()
- if isinstance(old, bytes):
- nraw = raw.replace(old, new)
- else:
- nraw = old.sub(new, raw)
- if raw == nraw and not missing_ok:
- raise ValueError('Failed (pattern not found) to patch: ' + path)
- f.seek(0), f.truncate()
- f.write(nraw)
-
-
-def copy_headers(pattern, destdir='include'):
- dest = os.path.join(SW, destdir)
- ensure_dir(dest)
- files = glob.glob(pattern)
- for f in files:
- dst = os.path.join(dest, os.path.basename(f))
- if os.path.isdir(f):
- shutil.copytree(f, dst)
- else:
- shutil.copy2(f, dst)
-
-
-def install_binaries(pattern, destdir='lib', fname_map=os.path.basename):
- dest = os.path.join(SW, destdir)
- ensure_dir(dest)
- files = glob.glob(pattern)
- files.sort(key=len, reverse=True)
- if not files:
- raise ValueError('The pattern %s did not match any actual files' %
pattern)
- for f in files:
- dst = os.path.join(dest, fname_map(f))
- shutil.copy(f, dst)
- os.chmod(dst, 0o755)
- if os.path.exists(f + '.manifest'):
- shutil.copy(f + '.manifest', dst + '.manifest')
-
-
-def install_tree(src, dest_parent='include', ignore=None):
- dest_parent = os.path.join(SW, dest_parent)
- dst = os.path.join(dest_parent, os.path.basename(src))
- if os.path.exists(dst):
- shutil.rmtree(dst)
- shutil.copytree(src, dst, symlinks=True, ignore=ignore)
- return dst
-
-
-def pure_python():
- run(PYTHON, '-m', 'pip', 'install', 'chardet', 'bs4', '--prefix',
os.path.join(SW, 'python'))
- run(PYTHON, '-c', 'import bs4; print(bs4)')
-
-
-def zlib():
- run('nmake -f win32/Makefile.msc')
- install_binaries('zlib1.dll*', 'bin')
- install_binaries('zlib.lib'), install_binaries('zdll.*')
- copy_headers('zconf.h'), copy_headers('zlib.h')
-
-
-def libxml2():
- run(
- *(
- 'cscript.exe configure.js include={0}/include lib={0}/lib prefix={0} zlib=yes iconv=no'.
- format(SW.replace(os.sep, '/')).split()),
- cwd='win32')
- run('nmake /f Makefile.msvc', cwd='win32')
- install_tree('include/libxml', 'include/libxml2')
- for f in walk('.'):
- if f.endswith('.dll'):
- install_binaries(f, 'bin')
- elif f.endswith('.lib'):
- install_binaries(f)
-
-
-def libxslt():
- run(
- *(
- 'cscript.exe configure.js include={0}/include include={0}/include/libxml2 lib={0}/lib '
- 'prefix={0} zlib=yes iconv=no'.format(SW.replace(os.sep,
'/')).split()),
- cwd='win32')
- replace_in_file('libxslt/win32config.h', '#define snprintf _snprintf', '')
- for f in walk('.'):
- if os.path.basename(f).startswith('Makefile'):
- replace_in_file(f, '/OPT:NOWIN98', '', missing_ok=True)
- run('nmake /f Makefile.msvc', cwd='win32')
- install_tree('libxslt', 'include')
- install_tree('libexslt', 'include')
- for f in walk('.'):
- if f.endswith('.dll'):
- install_binaries(f, 'bin')
- elif f.endswith('.lib'):
- install_binaries(f)
-
-
-def lxml():
- replace_in_file('setupinfo.py', ", 'iconv'", '')
- run(
- PYTHON,
- *(
- 'setup.py build_ext -I {0}/include;{0}/include/libxml2 -L {0}/lib'.format(
- SW.replace(os.sep, '/')).split()))
- run(PYTHON, 'setup.py', 'install', '--prefix', os.path.join(SW, 'python'))
-
-
-def install_deps():
- print(PYTHON)
- for x in 'build lib bin include python/Lib/site-packages'.split():
- ensure_dir(os.path.join(SW, x))
- os.chdir(os.path.join(SW, 'build'))
- base = os.getcwd()
- pure_python()
- for name in 'zlib libxml2 libxslt lxml'.split():
- os.chdir(base)
- if os.path.exists(name):
- continue
- os.mkdir(name), os.chdir(name)
- try:
- download_and_extract(globals()[name.upper()])
- globals()[name]()
- except:
- os.chdir(base)
- shutil.rmtree(name)
- raise
-
-
-def build():
- p = os.environ['PATH']
- p = os.path.join(SW, 'bin') + os.pathsep + p
- env = dict(
- LIBXML_INCLUDE_DIRS=r'{0}\include;{0}\include\libxml2'.format(SW),
- LIBXML_LIB_DIRS=r'{0}\lib'.format(SW),
- PATH=p
- )
- run(PYTHON, 'setup.py', 'test', env=env)
-
-
-def main():
- if sys.argv[-1] == 'install_deps':
- install_deps()
- else:
- build()
-
-
-if __name__ == '__main__':
- main()