Control: tag -1 + patch

There's an upstream PR that migrates the Python soup_adapter to bs4 (BeautifulSoup 4):
https://github.com/google/gumbo-parser/pull/368

Patch attached.

SR

-- 
Stefano Rivera
  http://tumbleweed.org.za/
  +1 415 683 3272
diff -Nru gumbo-parser-0.10.1+dfsg/debian/control gumbo-parser-0.10.1+dfsg/debian/control
--- gumbo-parser-0.10.1+dfsg/debian/control     2015-12-30 18:44:19.000000000 +0000
+++ gumbo-parser-0.10.1+dfsg/debian/control     2018-08-27 15:48:16.000000000 +0100
@@ -45,7 +45,7 @@
 Architecture: all
 Depends: ${misc:Depends}, ${shlibs:Depends}, ${python:Depends},
  libgumbo1 (>= ${source:Version})
-Recommends: python-beautifulsoup, python-html5lib
+Recommends: python-bs4, python-html5lib
 Description: pure-C HTML5 parser Python bindings
  Gumbo is an implementation of the [HTML5 parsing algorithm implemented
  as a pure C99 library with no outside dependencies.  It's designed to serve
@@ -59,7 +59,7 @@
 Architecture: all
 Depends: ${misc:Depends}, ${shlibs:Depends}, ${python3:Depends},
  libgumbo1 (>= ${source:Version})
-Recommends: python3-html5lib
+Recommends: python3-bs4, python3-html5lib
 Description: pure-C HTML5 parser Python 3 bindings
  Gumbo is an implementation of the [HTML5 parsing algorithm implemented
  as a pure C99 library with no outside dependencies.  It's designed to serve
diff -Nru gumbo-parser-0.10.1+dfsg/debian/patches/03-bs4.patch gumbo-parser-0.10.1+dfsg/debian/patches/03-bs4.patch
--- gumbo-parser-0.10.1+dfsg/debian/patches/03-bs4.patch        1970-01-01 01:00:00.000000000 +0100
+++ gumbo-parser-0.10.1+dfsg/debian/patches/03-bs4.patch        2018-08-27 15:48:16.000000000 +0100
@@ -0,0 +1,178 @@
+From 29e1abb337af2a15ac4b38fb1c28d1b55ed08d54 Mon Sep 17 00:00:00 2001
+From: Roman Miroshnychenko <roma...@yandex.ua>
+Date: Tue, 19 Jul 2016 18:25:52 +0300
+Subject: [PATCH] Updates soup_adapter to use BeautifulSoup 4
+
+Also fixes the indentation according to PEP-8
+---
+ python/gumbo/soup_adapter.py | 123 +++++++++++++++++------------------
+ 1 file changed, 61 insertions(+), 62 deletions(-)
+
+diff --git a/python/gumbo/soup_adapter.py b/python/gumbo/soup_adapter.py
+index b18748f..6a247dd 100644
+--- a/python/gumbo/soup_adapter.py
++++ b/python/gumbo/soup_adapter.py
+@@ -13,66 +13,65 @@
+ # limitations under the License.
+ #
+ 
+-"""Adapter between Gumbo and BeautifulSoup.
++"""Adapter between Gumbo and BeautifulSoup 4.
+ 
+-This parses an HTML document and gives back a BeautifulSoup object, which you
+-can then manipulate like a normal BeautifulSoup parse tree.
++This parses an HTML document and gives back a BeautifulSoup 4 object, which you
++can then manipulate like a normal BeautifulSoup 4 parse tree.
+ """
+ 
+ __author__ = 'jdt...@google.com (Jonathan Tang)'
+ 
+-import BeautifulSoup
+-
++import bs4 as BeautifulSoup
+ import gumboc
+ 
+ 
+ def _utf8(text):
+-  return text.decode('utf-8', 'replace')
++    return text.decode('utf-8', 'replace')
+ 
+ 
+ def _add_source_info(obj, original_text, start_pos, end_pos):
+-  obj.original = str(original_text)
+-  obj.line = start_pos.line
+-  obj.col = start_pos.column
+-  obj.offset = start_pos.offset
+-  if end_pos:
+-    obj.end_line = end_pos.line
+-    obj.end_col = end_pos.column
+-    obj.end_offset = end_pos.offset
++    obj.original = str(original_text)
++    obj.line = start_pos.line
++    obj.col = start_pos.column
++    obj.offset = start_pos.offset
++    if end_pos:
++        obj.end_line = end_pos.line
++        obj.end_col = end_pos.column
++        obj.end_offset = end_pos.offset
+ 
+ 
+ def _convert_attrs(attrs):
+-  # TODO(jdtang): Ideally attributes would pass along their positions as well,
+-  # but I can't extend the built in str objects with new attributes.  Maybe work
+-  # around this with a subclass in some way...
+-  return [(_utf8(attr.name), _utf8(attr.value)) for attr in attrs]
++    # TODO(jdtang): Ideally attributes would pass along their positions as well,
++    # but I can't extend the built in str objects with new attributes.  Maybe work
++    # around this with a subclass in some way...
++    return [(_utf8(attr.name), _utf8(attr.value)) for attr in attrs]
+ 
+ 
+ def _add_document(soup, element):
+-  # Currently ignored, since there's no real place for this in the BeautifulSoup
+-  # API.
+-  pass
++    # Currently ignored, since there's no real place for this in the BeautifulSoup
++    # API.
++    pass
+ 
+ 
+ def _add_element(soup, element):
+-  # TODO(jdtang): Expose next/previous in gumbo so they can be passed along to
+-  # BeautifulSoup.
+-  tag = BeautifulSoup.Tag(
+-      soup, _utf8(element.tag_name), _convert_attrs(element.attributes))
+-  for child in element.children:
+-    tag.append(_add_node(soup, child))
+-  _add_source_info(
+-      tag, element.original_tag, element.start_pos, element.end_pos)
+-  tag.original_end_tag = str(element.original_end_tag)
+-  return tag
++    # TODO(jdtang): Expose next/previous in gumbo so they can be passed along to
++    # BeautifulSoup.
++    tag = BeautifulSoup.Tag(
++        name=_utf8(element.tag_name), attrs=_convert_attrs(element.attributes))
++    for child in element.children:
++        tag.append(_add_node(soup, child))
++    _add_source_info(
++        tag, element.original_tag, element.start_pos, element.end_pos)
++    tag.original_end_tag = str(element.original_end_tag)
++    return tag
+ 
+ 
+ def _add_text(cls):
+-  def add_text_internal(soup, element):
+-    text = cls(_utf8(element.text))
+-    _add_source_info(text, element.original_text, element.start_pos, None)
+-    return text
+-  return add_text_internal
++    def add_text_internal(soup, element):
++        text = cls(_utf8(element.text))
++        _add_source_info(text, element.original_text, element.start_pos, None)
++        return text
++    return add_text_internal
+ 
+ 
+ _HANDLERS = [
+@@ -87,35 +86,35 @@ def add_text_internal(soup, element):
+ 
+ 
+ def _add_node(soup, node):
+-  return _HANDLERS[node.type.value](soup, node.contents)
++    return _HANDLERS[node.type.value](soup, node.contents)
+ 
+ 
+ def _add_next_prev_pointers(soup):
+-  def _traverse(node):
+-    # .findAll requires the .next pointer, which is what we're trying to add
+-    # when we call this, and so we manually supply a generator to yield the
+-    # nodes in DOM order.
+-    yield node
+-    try:
+-      for child in node.contents:
+-        for descendant in _traverse(child):
+-          yield descendant
+-    except AttributeError:
+-      # Not an element.
+-      return
+-
+-  nodes = sorted(_traverse(soup), key=lambda node: node.offset)
+-  if nodes:
+-    nodes[0].previous = None
+-    nodes[-1].next = None
+-  for i, node in enumerate(nodes[1:-1], 1):
+-    nodes[i-1].next = node
+-    node.previous = nodes[i-1]
++    def _traverse(node):
++        # .findAll requires the .next pointer, which is what we're trying to add
++        # when we call this, and so we manually supply a generator to yield the
++        # nodes in DOM order.
++        yield node
++        try:
++          for child in node.contents:
++            for descendant in _traverse(child):
++                yield descendant
++        except AttributeError:
++            # Not an element.
++            return
++
++    nodes = sorted(_traverse(soup), key=lambda node: node.offset)
++    if nodes:
++        nodes[0].previous_element = None
++        nodes[-1].next_element = None
++    for i, node in enumerate(nodes[1:-1], 1):
++        nodes[i-1].next_element = node
++        node.previous_element = nodes[i-1]
+ 
+ 
+ def parse(text, **kwargs):
+-  with gumboc.parse(text, **kwargs) as output:
+-    soup = BeautifulSoup.BeautifulSoup()
+-    soup.append(_add_node(soup, output.contents.root.contents))
+-    _add_next_prev_pointers(soup)
+-    return soup
++    with gumboc.parse(text, **kwargs) as output:
++        soup = BeautifulSoup.BeautifulSoup(features='html.parser')
++        soup.append(_add_node(soup, output.contents.root.contents))
++        _add_next_prev_pointers(soup)
++        return soup
diff -Nru gumbo-parser-0.10.1+dfsg/debian/patches/series gumbo-parser-0.10.1+dfsg/debian/patches/series
--- gumbo-parser-0.10.1+dfsg/debian/patches/series      2015-12-30 18:22:04.000000000 +0000
+++ gumbo-parser-0.10.1+dfsg/debian/patches/series      2018-08-27 15:48:16.000000000 +0100
@@ -1,3 +1,4 @@
 00-tests.patch
 01-name-of-lib.patch
 02-circular-dependency.patch
+03-bs4.patch

Reply via email to