This is an automated email from the ASF dual-hosted git repository.
claude pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/creadur-rat.git
The following commit(s) were added to refs/heads/master by this push:
new 20f6e4df RAT-494: Fix NPE on unsupported character set (#546)
20f6e4df is described below
commit 20f6e4df94d20d09037503684322a8311c506891
Author: Claude Warren <[email protected]>
AuthorDate: Thu Oct 9 09:22:44 2025 +0200
RAT-494: Fix NPE on unsupported character set (#546)
* Fix NPE on unsupported character set
Change now redefines any document with an unsupported character set as an
UNKNOWN document.
Unable to create a test to verify.
---------
Co-authored-by: P. Ottlinger <[email protected]>
Co-authored-by: P. Ottlinger <[email protected]>
---
.../java/org/apache/rat/DeprecationReporter.java | 2 +-
.../java/org/apache/rat/analysis/TikaProcessor.java | 20 +++++++++++++-------
.../src/main/java/org/apache/rat/api/Document.java | 7 ++++++-
src/changes/changes.xml | 3 +++
4 files changed, 23 insertions(+), 9 deletions(-)
diff --git
a/apache-rat-core/src/main/java/org/apache/rat/DeprecationReporter.java
b/apache-rat-core/src/main/java/org/apache/rat/DeprecationReporter.java
index 1d307b69..2a1dd63a 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/DeprecationReporter.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/DeprecationReporter.java
@@ -94,7 +94,7 @@ public final class DeprecationReporter {
*/
public static void logDeprecated(final Class<?> clazz) {
if (clazz.getAnnotation(Deprecated.class) != null) {
- String name = format("Deprecated class used: %s ", clazz);
+ String name = format("class used: %s ", clazz);
Info info = clazz.getAnnotation(Info.class);
if (info == null) {
DefaultLog.getInstance().warn(formatEntry(name, "", false,
""));
diff --git
a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
index 9cad66cf..693af5bf 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
@@ -141,9 +141,13 @@ public final class TikaProcessor {
.setDocumentType(fromMediaType(mediaType));
}
if (Document.Type.STANDARD ==
document.getMetaData().getDocumentType()) {
- document.getMetaData().setCharset(detectCharset(stream,
document.getName()));
- if (NoteGuesser.isNote(document)) {
-
document.getMetaData().setDocumentType(Document.Type.NOTICE);
+ try {
+ document.getMetaData().setCharset(detectCharset(stream,
document.getName()));
+ if (NoteGuesser.isNote(document)) {
+
document.getMetaData().setDocumentType(Document.Type.NOTICE);
+ }
+ } catch (UnsupportedCharsetException e) {
+
document.getMetaData().setDocumentType(Document.Type.UNKNOWN);
}
}
return result;
@@ -155,11 +159,12 @@ public final class TikaProcessor {
/**
* Determine the character set for the input stream. Input stream must
implement {@code mark}.
* @param stream the stream to check.
- * @param documentName the name of the document being read.
+ * @param documentName the name of the document being processed.
* @return the detected character set or {@code null} if not detectable.
* @throws IOException on IO error.
+ * @throws UnsupportedCharsetException on unsupported charset.
*/
- private static Charset detectCharset(final InputStream stream, final
DocumentName documentName) throws IOException {
+ private static Charset detectCharset(final InputStream stream, final
DocumentName documentName) throws IOException, UnsupportedCharsetException {
CharsetDetector encodingDetector = new CharsetDetector();
encodingDetector.setText(stream);
CharsetMatch charsetMatch = encodingDetector.detect();
@@ -167,8 +172,9 @@ public final class TikaProcessor {
try {
return Charset.forName(charsetMatch.getName());
} catch (UnsupportedCharsetException e) {
- DefaultLog.getInstance().warn(String.format("Unsupported
character set '%s' in file '%s'. Will use system default encoding.",
- charsetMatch.getName(), documentName));
+ DefaultLog.getInstance().warn(String.format("Unsupported
character set '%s' in file '%s'",
+ charsetMatch.getName(), documentName));
+ throw e;
}
}
return null;
diff --git a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
index 0dbfa10c..a29babf7 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
@@ -109,7 +109,12 @@ public abstract class Document implements
Comparable<Document> {
* @throws IOException if this document cannot be read.
*/
public Reader reader() throws IOException {
- return new
CharsetDetector().getReader(TikaProcessor.markSupportedInputStream(inputStream()),
getMetaData().getCharset().name());
+ // RAT-494: Tika's CharsetDetector.getReader() may return null if the
reader cannot be constructed due to I/O or encoding errors
+ Reader result = new
CharsetDetector().getReader(TikaProcessor.markSupportedInputStream(inputStream()),
getMetaData().getCharset().name());
+ if (result == null) {
+ throw new IOException(String.format("Can not read document `%s`",
getName()));
+ }
+ return result;
}
/**
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 1c39e48d..b5ad5c49 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -72,6 +72,9 @@ The <action> type attribute can be one of:
</release>
-->
<release version="0.17-SNAPSHOT" date="xxxx-yy-zz" description="Current
SNAPSHOT - release to be done">
+ <action issue="RAT-494" type="fix" dev="claudenw" due-to="Tilman
Hausherr">
+ Fix NPE when the encoding found in a scanned document is not supported by
the currently used JDK. The Tika part of the bugfix can be found via TIKA-4505.
+ </action>
<action issue="RAT-489" type="add" dev="pottlinger">
Provide a central known issues section to the RAT homepage in order to
inform users more directly about already known challenges with the current RAT
version.
</action>