This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x-TIKA-4419-v2 in repository https://gitbox.apache.org/repos/asf/tika.git
commit ea533aa000fe695c41504a67842cd4de5eacf776 Author: tallison <[email protected]> AuthorDate: Tue May 20 08:17:38 2025 -0400 TIKA-4419 -- experiment with custom list of self-closeable-tags.txt --- .../org/apache/tika/parser/html/JSoupParser.java | 29 +++++++++++- .../tika/parser/html/self-closeable-tags.txt | 52 ++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java index c34a496db..bedbce788 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java @@ -16,8 +16,10 @@ */ package org.apache.tika.parser.html; +import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Arrays; @@ -34,6 +36,9 @@ import org.jsoup.nodes.DataNode; import org.jsoup.nodes.Document; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; +import org.jsoup.parser.Parser; +import org.jsoup.parser.Tag; +import org.jsoup.parser.TagSet; import org.jsoup.select.NodeFilter; import org.jsoup.select.NodeTraversor; import org.xml.sax.ContentHandler; @@ -70,6 +75,26 @@ public class JSoupParser extends AbstractEncodingDetectorParser { private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<MediaType>(Arrays.asList(MediaType.text("html"), XHTML, WAP_XHTML, X_ASP))); + private static final TagSet SELF_CLOSEABLE_TAGS = TagSet.Html(); + + static { + try (BufferedReader reader = new BufferedReader(new InputStreamReader( + JSoupParser.class.getResourceAsStream("self-closeable-tags.txt"), StandardCharsets.UTF_8))) { + String line = reader.readLine(); + while (line != null) { + if (line.startsWith("#") || line.trim().isEmpty()) { + line = reader.readLine(); + continue; + } + Tag t = SELF_CLOSEABLE_TAGS.valueOf(line.trim(), Parser.NamespaceHtml); + t.set(Tag.SelfClose); + line = reader.readLine(); + } + } catch (IOException e) { + throw new RuntimeException("Can't find self-closeable-tags.txt"); + } + } + @Field private boolean extractScripts = false; @@ -128,7 +153,7 @@ public class JSoupParser extends AbstractEncodingDetectorParser { HtmlMapper mapper = context.get(HtmlMapper.class, new DefaultHtmlMapper()); //do better with baseUri? - Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream), charset.name(), ""); + Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream), charset.name(), "", Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS)); document.quirksMode(Document.QuirksMode.quirks); ContentHandler xhtml = new XHTMLDowngradeHandler( new HtmlHandler(mapper, handler, metadata, context, extractScripts)); @@ -147,7 +172,7 @@ public class JSoupParser extends AbstractEncodingDetectorParser { HtmlMapper mapper = context.get(HtmlMapper.class, new DefaultHtmlMapper()); //do better with baseUri? - Document document = Jsoup.parse(html); + Document document = Jsoup.parse(html, Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS)); document.quirksMode(Document.QuirksMode.quirks); ContentHandler xhtml = new XHTMLDowngradeHandler( new HtmlHandler(mapper, handler, metadata, context, extractScripts)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/self-closeable-tags.txt b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/self-closeable-tags.txt new file mode 100644 index 000000000..ff44a4385 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html/self-closeable-tags.txt @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See TIKA-4419 +br +img +meta +input +link +path +a +hr +image +col +area +param +base +td +div +p +source +option +embed +span +tr +td +script +wbr +li +button +iframe +b +i +ul +form +textarea +table +select +strong +title \ No newline at end of file
