Package: release.debian.org
Severity: normal
Tags: trixie
X-Debbugs-Cc: [email protected], [email protected]
Control: affects -1 + src:lxml-html-clean
User: [email protected]
Usertags: pu

This update fixes two security issues:

  - CVE-2026-28348: CSS @import Filter Bypass via Unicode Escapes
  - CVE-2026-28350: <base> tag injection through default Cleaner
                    configuration

The only code changes in the new upstream releases are the CVE fixes
in clean.py; everything else is test/CI/documentation changes
(including test cases for the CVEs).
diffstat for lxml-html-clean-0.4.2 lxml-html-clean-0.4.4

 .github/workflows/main.yml |    4 
 CHANGES.rst                |   25 +++++
 README.md                  |    2 
 debian/changelog           |   25 +++++
 debian/control             |    2 
 docs/usage.rst             |    5 +
 lxml_html_clean/clean.py   |   27 ++++++
 setup.cfg                  |    5 -
 tests/test_clean.py        |  198 ++++++++++++++++++++++++++++++++++++++++++++-
 tests/test_clean.txt       |    2 
 tox.ini                    |    2 
 11 files changed, 284 insertions(+), 13 deletions(-)

diff -Nru lxml-html-clean-0.4.2/CHANGES.rst lxml-html-clean-0.4.4/CHANGES.rst
--- lxml-html-clean-0.4.2/CHANGES.rst   2025-04-09 14:14:25.000000000 +0300
+++ lxml-html-clean-0.4.4/CHANGES.rst   2026-02-27 11:32:37.000000000 +0200
@@ -6,6 +6,31 @@
 Unreleased
 ==========
 
+0.4.4 (2026-02-26)
+==================
+
+Bugs fixed
+----------
+
+* Fixed a bug where Unicode escapes in CSS were not properly decoded
+  before security checks. This prevents attackers from bypassing filters
+  using escape sequences.
+* Fixed a security issue where ``<base>`` tags could be used for URL
+  hijacking attacks. The ``<base>`` tag is now automatically removed
+  whenever the ``<head>`` tag is removed (via ``page_structure=True``
+  or manual configuration), as ``<base>`` must be inside ``<head>``
+  according to HTML specifications.
+
+0.4.3 (2025-10-02)
+==================
+
+Maintenance
+-----------
+
+* Tests updated to work correctly with new lxml and libxml2 releases.
+* Python 3.6 and 3.7 are no longer tested.
+* Improved documentation about CSS removal behavior.
+
 0.4.2 (2025-04-09)
 ==================
 
diff -Nru lxml-html-clean-0.4.2/debian/changelog 
lxml-html-clean-0.4.4/debian/changelog
--- lxml-html-clean-0.4.2/debian/changelog      2025-04-23 08:47:04.000000000 
+0300
+++ lxml-html-clean-0.4.4/debian/changelog      2026-06-18 20:48:59.000000000 
+0300
@@ -1,3 +1,28 @@
+lxml-html-clean (0.4.4-1~deb13u1) trixie; urgency=medium
+
+  * Non-maintainer upload.
+  * Rebuild for trixie.
+    - CVE-2026-28348: CSS @import Filter Bypass via Unicode Escapes
+    - CVE-2026-28350: <base> tag injection through default Cleaner
+                      configuration
+
+
+ -- Adrian Bunk <[email protected]>  Thu, 18 Jun 2026 20:48:59 +0300
+
+lxml-html-clean (0.4.4-1) unstable; urgency=medium
+
+  * New upstream version.
+  * Bump standards version.
+
+ -- Matthias Klose <[email protected]>  Fri, 06 Mar 2026 08:37:31 +0100
+
+lxml-html-clean (0.4.3-1) unstable; urgency=medium
+
+  * New upstream version. Closes: #1114193.
+  * Bump standards version.
+
+ -- Matthias Klose <[email protected]>  Sun, 05 Oct 2025 11:15:18 +0200
+
 lxml-html-clean (0.4.2-1) unstable; urgency=medium
 
   * New upstream version.
diff -Nru lxml-html-clean-0.4.2/debian/control 
lxml-html-clean-0.4.4/debian/control
--- lxml-html-clean-0.4.2/debian/control        2024-08-02 03:41:19.000000000 
+0300
+++ lxml-html-clean-0.4.4/debian/control        2026-03-06 09:37:31.000000000 
+0200
@@ -9,7 +9,7 @@
  python3-setuptools,
  python3-all,
  python3-lxml,
-Standards-Version: 4.7.0
+Standards-Version: 4.7.3
 Homepage: https://github.com/fedora-python/lxml_html_clean
 
 Package: python3-lxml-html-clean
diff -Nru lxml-html-clean-0.4.2/docs/usage.rst 
lxml-html-clean-0.4.4/docs/usage.rst
--- lxml-html-clean-0.4.2/docs/usage.rst        2025-04-09 14:14:25.000000000 
+0300
+++ lxml-html-clean-0.4.4/docs/usage.rst        2026-02-27 11:32:37.000000000 
+0200
@@ -109,6 +109,11 @@
       </body>
     </html>
 
+To control the removal of CSS styles, set the ``style`` and/or ``inline_style``
+keyword arguments to ``True`` when creating a ``Cleaner`` instance.
+If neither option is enabled, only ``@import`` rules are automatically removed
+from CSS content.
+
 You can also whitelist some otherwise dangerous content with
 ``Cleaner(host_whitelist=['www.youtube.com'])``, which would allow
 embedded media from YouTube, while still filtering out embedded media
diff -Nru lxml-html-clean-0.4.2/.github/workflows/main.yml 
lxml-html-clean-0.4.4/.github/workflows/main.yml
--- lxml-html-clean-0.4.2/.github/workflows/main.yml    2025-04-09 
14:14:25.000000000 +0300
+++ lxml-html-clean-0.4.4/.github/workflows/main.yml    2026-02-27 
11:32:37.000000000 +0200
@@ -19,17 +19,15 @@
       uses: fedora-python/tox-github-action@main
       with:
         tox_env: ${{ matrix.tox_env }}
-        dnf_install: gcc libxml2-devel libxslt-devel
     strategy:
       matrix:
         tox_env:
-          - py36
-          - py38
           - py39
           - py310
           - py311
           - py312
           - py313
+          - py314
           - mypy
 
     # Use GitHub's Linux Docker host
diff -Nru lxml-html-clean-0.4.2/lxml_html_clean/clean.py 
lxml-html-clean-0.4.4/lxml_html_clean/clean.py
--- lxml-html-clean-0.4.2/lxml_html_clean/clean.py      2025-04-09 
14:14:25.000000000 +0300
+++ lxml-html-clean-0.4.4/lxml_html_clean/clean.py      2026-02-27 
11:32:37.000000000 +0200
@@ -422,6 +422,12 @@
         if self.annoying_tags:
             remove_tags.update(('blink', 'marquee'))
 
+        # Remove <base> tags whenever <head> is being removed.
+        # According to HTML spec, <base> must be in <head>, but browsers
+        # may interpret it even when misplaced, allowing URL hijacking attacks.
+        if 'head' in kill_tags or 'head' in remove_tags:
+            kill_tags.add('base')
+
         _remove = deque()
         _kill = deque()
         for el in doc.iter():
@@ -578,6 +584,26 @@
     _comments_re = re.compile(r'/\*.*?\*/', re.S)
     _find_comments = _comments_re.finditer
     _substitute_comments = _comments_re.sub
+    _css_unicode_escape_re = re.compile(r'\\([0-9a-fA-F]{1,6})\s?')
+
+    def _decode_css_unicode_escapes(self, style):
+        """
+        Decode CSS Unicode escape sequences like \\69 or \\000069 to their
+        actual character values. This prevents bypassing security checks
+        using CSS escape sequences.
+
+        CSS escape syntax: backslash followed by 1-6 hex digits,
+        optionally followed by a whitespace character.
+        """
+        def replace_escape(match):
+            hex_value = match.group(1)
+            try:
+                return chr(int(hex_value, 16))
+            except (ValueError, OverflowError):
+                # Invalid unicode codepoint, keep original
+                return match.group(0)
+
+        return self._css_unicode_escape_re.sub(replace_escape, style)
 
     def _has_sneaky_javascript(self, style):
         """
@@ -591,6 +617,7 @@
         more sneaky attempts.
         """
         style = self._substitute_comments('', style)
+        style = self._decode_css_unicode_escapes(style)
         style = style.replace('\\', '')
         style = _substitute_whitespace('', style)
         style = style.lower()
diff -Nru lxml-html-clean-0.4.2/README.md lxml-html-clean-0.4.4/README.md
--- lxml-html-clean-0.4.2/README.md     2025-04-09 14:14:25.000000000 +0300
+++ lxml-html-clean-0.4.4/README.md     2026-02-27 11:32:37.000000000 +0200
@@ -4,7 +4,7 @@
 
 This project was initially a part of [lxml](https://github.com/lxml/lxml). 
Because HTML cleaner is designed as blocklist-based, many reports about 
possible security vulnerabilities were filed for lxml and that make the project 
problematic for security-sensitive environments. Therefore we decided to 
extract the problematic part to a separate project.
 
-**Important**: the HTML Cleaner in ``lxml_html_clean`` is **not** considered 
appropriate **for security sensitive environments**. See e.g. 
[bleach](https://pypi.org/project/bleach/) for an alternative.
+**Important**: the HTML Cleaner in ``lxml_html_clean`` is **not** considered 
appropriate **for security sensitive environments**. See e.g. 
[nh3](https://pypi.org/project/nh3/) for an alternative.
 
 This project uses functions from Python's `urllib.parse` for URL parsing which 
**do not validate inputs**. For more information on potential security risks, 
refer to the [URL parsing 
security](https://docs.python.org/3/library/urllib.parse.html#url-parsing-security)
 documentation. A maliciously crafted URL could potentially bypass the allowed 
hosts check in `Cleaner`.
 
diff -Nru lxml-html-clean-0.4.2/setup.cfg lxml-html-clean-0.4.4/setup.cfg
--- lxml-html-clean-0.4.2/setup.cfg     2025-04-09 14:14:25.000000000 +0300
+++ lxml-html-clean-0.4.4/setup.cfg     2026-02-27 11:32:37.000000000 +0200
@@ -1,6 +1,6 @@
 [metadata]
 name = lxml_html_clean
-version = 0.4.2
+version = 0.4.4
 description = HTML cleaner from lxml project
 long_description = file:README.md
 long_description_content_type = text/markdown
@@ -13,14 +13,13 @@
 license_files = LICENSE.txt
 classifiers =
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.6
-    Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
     Programming Language :: Python :: 3.12
     Programming Language :: Python :: 3.13
+    Programming Language :: Python :: 3.14
 
 [options]
 packages =
diff -Nru lxml-html-clean-0.4.2/tests/test_clean.py 
lxml-html-clean-0.4.4/tests/test_clean.py
--- lxml-html-clean-0.4.2/tests/test_clean.py   2025-04-09 14:14:25.000000000 
+0300
+++ lxml-html-clean-0.4.4/tests/test_clean.py   2026-02-27 11:32:37.000000000 
+0200
@@ -331,20 +331,20 @@
 
     def test_host_whitelist_valid(self):
         # Frame with valid hostname in src is allowed.
-        html = '<div><iframe src="https://example.com/page";></div>'
+        html = '<div><iframe src="https://example.com/page";></iframe></div>'
         expected = '<div><iframe 
src="https://example.com/page";></iframe></div>'
         cleaner = Cleaner(frames=False, host_whitelist=["example.com"])
         self.assertEqual(expected, cleaner.clean_html(html))
 
     def test_host_whitelist_invalid(self):
-        html = '<div><iframe src="https://evil.com/page";></div>'
+        html = '<div><iframe src="https://evil.com/page";></iframe></div>'
         expected = '<div></div>'
         cleaner = Cleaner(frames=False, host_whitelist=["example.com"])
         self.assertEqual(expected, cleaner.clean_html(html))
 
     def test_host_whitelist_sneaky_userinfo(self):
         # Regression test: Don't be fooled by hostname and colon in userinfo.
-        html = '<div><iframe src="https://example.com:@evil.com/page";></div>'
+        html = '<div><iframe 
src="https://example.com:@evil.com/page";></iframe></div>'
         expected = '<div></div>'
         cleaner = Cleaner(frames=False, host_whitelist=["example.com"])
         self.assertEqual(expected, cleaner.clean_html(html))
@@ -393,3 +393,195 @@
             self.assertEqual(len(w), 0)
         self.assertNotIn("google.com", result)
         self.assertNotIn("example.com", result)
+
+    def test_base_tag_removed_with_page_structure(self):
+        # Test that <base> tags are removed when page_structure=True (default)
+        # This prevents URL hijacking attacks where <base> redirects all 
relative URLs
+
+        test_cases = [
+            # <base> in proper location (inside <head>)
+            '<html><head><base href="http://evil.com/";></head><body><a 
href="page.html">link</a></body></html>',
+            # <base> outside <head>
+            '<div><base href="http://evil.com/";><a 
href="page.html">link</a></div>',
+            # Multiple <base> tags
+            '<base href="http://evil.com/";><div><base 
href="http://evil2.com/";></div>',
+            # <base> with target attribute
+            '<base target="_blank"><div>content</div>',
+            # <base> at various positions
+            '<html><base href="http://evil.com/";><body>test</body></html>',
+        ]
+
+        for html in test_cases:
+            with self.subTest(html=html):
+                cleaned = clean_html(html)
+                # Verify <base> tag is completely removed
+                self.assertNotIn('base', cleaned.lower())
+                self.assertNotIn('evil.com', cleaned)
+                self.assertNotIn('evil2.com', cleaned)
+
+    def test_base_tag_kept_when_page_structure_false(self):
+        # When page_structure=False and head is not removed, <base> should be 
kept
+        cleaner = Cleaner(page_structure=False)
+        html = '<html><head><base 
href="http://example.com/";></head><body>test</body></html>'
+        cleaned = cleaner.clean_html(html)
+        self.assertIn('<base href="http://example.com/";>', cleaned)
+
+    def test_base_tag_removed_when_head_in_remove_tags(self):
+        # Even with page_structure=False, <base> should be removed if head is 
manually removed
+        cleaner = Cleaner(page_structure=False, remove_tags=['head'])
+        html = '<html><head><base 
href="http://evil.com/";></head><body>test</body></html>'
+        cleaned = cleaner.clean_html(html)
+        self.assertNotIn('base', cleaned.lower())
+        self.assertNotIn('evil.com', cleaned)
+
+    def test_base_tag_removed_when_head_in_kill_tags(self):
+        # Even with page_structure=False, <base> should be removed if head is 
in kill_tags
+        cleaner = Cleaner(page_structure=False, kill_tags=['head'])
+        html = '<html><head><base 
href="http://evil.com/";></head><body>test</body></html>'
+        cleaned = cleaner.clean_html(html)
+        self.assertNotIn('base', cleaned.lower())
+        self.assertNotIn('evil.com', cleaned)
+
+    def test_unicode_escape_in_style(self):
+        # Test that CSS Unicode escapes are properly decoded before security 
checks
+        # This prevents attackers from bypassing filters using escape sequences
+        # CSS escape syntax: \HHHHHH where H is a hex digit (1-6 digits)
+
+        # Test inline style attributes (requires safe_attrs_only=False)
+        cleaner = Cleaner(safe_attrs_only=False)
+        inline_style_cases = [
+            # \6a\61\76\61\73\63\72\69\70\74 = "javascript"
+            ('<div style="background: 
url(\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1))">test</div>', 
'<div>test</div>'),
+            # \69 = 'i', so \69mport = "import"
+            ('<div style="@\\69mport url(evil.css)">test</div>', 
'<div>test</div>'),
+            # \69 with space after = 'i', space consumed as part of escape
+            ('<div style="@\\69 mport url(evil.css)">test</div>', 
'<div>test</div>'),
+            # \65\78\70\72\65\73\73\69\6f\6e = "expression"
+            ('<div 
style="\\65\\78\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))">test</div>', 
'<div>test</div>'),
+        ]
+
+        for html, expected in inline_style_cases:
+            with self.subTest(html=html):
+                cleaned = cleaner.clean_html(html)
+                self.assertEqual(expected, cleaned)
+
+        # Test <style> tag content (uses default clean_html)
+        style_tag_cases = [
+            # Unicode-escaped "javascript:" in url()
+            
'<style>url(\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1))</style>',
+            # Unicode-escaped "javascript:" without url()
+            '<style>\\6a\\61\\76\\61\\73\\63\\72\\69\\70\\74:alert(1)</style>',
+            # Unicode-escaped "expression"
+            
'<style>\\65\\78\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))</style>',
+            # Unicode-escaped @import with 'i'
+            '<style>@\\69mport url(evil.css)</style>',
+            # Unicode-escaped "data:" scheme
+            
'<style>url(\\64\\61\\74\\61:image/svg+xml;base64,PHN2ZyBvbmxvYWQ9YWxlcnQoMSk+)</style>',
+            # Space after escape is consumed: \69 mport = "import"
+            '<style>@\\69 mport url(evil.css)</style>',
+            # 6-digit escape: \000069 = 'i'
+            '<style>@\\000069mport url(evil.css)</style>',
+            # 6-digit escape with space
+            '<style>@\\000069 mport url(evil.css)</style>',
+        ]
+
+        for html in style_tag_cases:
+            with self.subTest(html=html):
+                cleaned = clean_html(html)
+                self.assertEqual('<div><style>/* deleted */</style></div>', 
cleaned)
+
+    def test_unicode_escape_mixed_with_comments(self):
+        # Unicode escapes mixed with CSS comments should still be caught
+        test_cases = [
+            # \69 = 'i' with comment before
+            '<style>@/*comment*/\\69mport url(evil.css)</style>',
+            # \69 = 'i' with comment after
+            '<style>@\\69mport/*comment*/ url(evil.css)</style>',
+            # Multiple escapes with comments
+            
'<style>\\65\\78/*comment*/\\70\\72\\65\\73\\73\\69\\6f\\6e(alert(1))</style>',
+        ]
+
+        for html in test_cases:
+            with self.subTest(html=html):
+                cleaned = clean_html(html)
+                self.assertEqual('<div><style>/* deleted */</style></div>', 
cleaned)
+
+    def test_unicode_escape_case_insensitive(self):
+        # CSS hex escapes should work with both uppercase and lowercase hex 
digits
+        # \69 = 'i', \6D = 'm', etc.
+        test_cases = [
+            # @import with uppercase hex digits: \69\6D\70\6F\72\74
+            '<style>@\\69\\6D\\70\\6F\\72\\74 url(evil.css)</style>',
+            # @import with some uppercase
+            '<style>@\\69\\6D\\70\\6f\\72\\74 url(evil.css)</style>',
+        ]
+
+        for html in test_cases:
+            with self.subTest(html=html):
+                cleaned = clean_html(html)
+                self.assertEqual('<div><style>/* deleted */</style></div>', 
cleaned)
+
+    def test_unicode_escape_various_schemes(self):
+        # Test Unicode escapes for various malicious schemes
+        test_cases = [
+            # \76\62\73\63\72\69\70\74 = "vbscript"
+            '<style>url(\\76\\62\\73\\63\\72\\69\\70\\74:alert(1))</style>',
+            # \6a\73\63\72\69\70\74 = "jscript"
+            '<style>url(\\6a\\73\\63\\72\\69\\70\\74:alert(1))</style>',
+            # \6c\69\76\65\73\63\72\69\70\74 = "livescript"
+            
'<style>url(\\6c\\69\\76\\65\\73\\63\\72\\69\\70\\74:alert(1))</style>',
+            # \6d\6f\63\68\61 = "mocha"
+            '<style>url(\\6d\\6f\\63\\68\\61:alert(1))</style>',
+        ]
+
+        for html in test_cases:
+            with self.subTest(html=html):
+                cleaned = clean_html(html)
+                self.assertEqual('<div><style>/* deleted */</style></div>', 
cleaned)
+
+    def test_unicode_escape_with_whitespace_variations(self):
+        # Test different whitespace characters after Unicode escapes
+        cleaner = Cleaner(safe_attrs_only=False)
+        test_cases = [
+            # Tab after escape
+            ('<div style="@\\69\tmport url(evil.css)">test</div>', 
'<div>test</div>'),
+            # Newline after escape (note: actual newline, not \n)
+            ('<div style="@\\69\nmport url(evil.css)">test</div>', 
'<div>test</div>'),
+            # Form feed after escape
+            ('<div style="@\\69\fmport url(evil.css)">test</div>', 
'<div>test</div>'),
+        ]
+
+        for html, expected in test_cases:
+            with self.subTest(html=html):
+                cleaned = cleaner.clean_html(html)
+                self.assertEqual(expected, cleaned)
+
+    def test_backslash_removal_after_unicode_decode(self):
+        # After decoding Unicode escapes, remaining backslashes are removed
+        # This ensures double-obfuscation (unicode + backslashes) is caught
+        test_cases = [
+            # Step 1: \69 → 'i', Step 2: remove \, Result: @import
+            '<style>@\\69\\m\\p\\o\\r\\t url(evil.css)</style>',
+            # Multiple unicode escapes with backslashes mixed in
+            '<style>@\\69\\6d\\p\\6f\\r\\t url(evil.css)</style>',
+        ]
+
+        for html in test_cases:
+            with self.subTest(html=html):
+                cleaned = clean_html(html)
+                self.assertEqual('<div><style>/* deleted */</style></div>', 
cleaned)
+
+    def test_backslash_obfuscation_without_unicode(self):
+        # Test that patterns using ONLY backslash obfuscation (no unicode) are 
caught
+        # Step 1: No unicode escapes, Step 2: remove \, Result: malicious 
pattern
+        test_cases = [
+            # @\i\m\p\o\r\t → @import (caught by '@import' check)
+            '<style>@\\i\\m\\p\\o\\r\\t url(evil.css)</style>',
+            # Can also test combinations that create javascript schemes
+            '<style>@\\import url(evil.css)</style>',
+        ]
+
+        for html in test_cases:
+            with self.subTest(html=html):
+                cleaned = clean_html(html)
+                self.assertEqual('<div><style>/* deleted */</style></div>', 
cleaned)
diff -Nru lxml-html-clean-0.4.2/tests/test_clean.txt 
lxml-html-clean-0.4.4/tests/test_clean.txt
--- lxml-html-clean-0.4.2/tests/test_clean.txt  2025-04-09 14:14:25.000000000 
+0300
+++ lxml-html-clean-0.4.4/tests/test_clean.txt  2026-02-27 11:32:37.000000000 
+0200
@@ -84,7 +84,7 @@
   <body onload="evil_function()">
     <!-- I am interpreted for EVIL! -->
     <a href="javascript:evil_function()">a link</a>
-    <a href="javascrip%20t%20:evil_function()">a control char link</a>
+    <a href="j%01a%02v%03a%04s%05c%06r%07i%0Ep%20t%20:evil_function()">a 
control char link</a>
     <a 
href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
     <a href="#" onclick="evil_function()">another link</a>
     <p onclick="evil_function()">a paragraph</p>
diff -Nru lxml-html-clean-0.4.2/tox.ini lxml-html-clean-0.4.4/tox.ini
--- lxml-html-clean-0.4.2/tox.ini       2025-04-09 14:14:25.000000000 +0300
+++ lxml-html-clean-0.4.4/tox.ini       2026-02-27 11:32:37.000000000 +0200
@@ -1,5 +1,5 @@
 [tox]
-envlist = py36,py38,py39,py310,py311,py312,py313,mypy
+envlist = py39,py310,py311,py312,py313,py314,mypy
 skipsdist = True
 
 [testenv]

Reply via email to