This is an automated email from the ASF dual-hosted git repository.

claude pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/creadur-rat.git


The following commit(s) were added to refs/heads/master by this push:
     new 20f6e4df RAT-494: Fix NPE on unsupported character set (#546)
20f6e4df is described below

commit 20f6e4df94d20d09037503684322a8311c506891
Author: Claude Warren <[email protected]>
AuthorDate: Thu Oct 9 09:22:44 2025 +0200

    RAT-494: Fix NPE on unsupported character set (#546)
    
    * Fix NPE on unsupported character set
    
    This change reclassifies any document with an unsupported character set as an 
UNKNOWN document.
    Unable to create a test to verify.
    
    ---------
    
    Co-authored-by: P. Ottlinger <[email protected]>
    Co-authored-by: P. Ottlinger <[email protected]>
---
 .../java/org/apache/rat/DeprecationReporter.java     |  2 +-
 .../java/org/apache/rat/analysis/TikaProcessor.java  | 20 +++++++++++++-------
 .../src/main/java/org/apache/rat/api/Document.java   |  7 ++++++-
 src/changes/changes.xml                              |  3 +++
 4 files changed, 23 insertions(+), 9 deletions(-)

diff --git 
a/apache-rat-core/src/main/java/org/apache/rat/DeprecationReporter.java 
b/apache-rat-core/src/main/java/org/apache/rat/DeprecationReporter.java
index 1d307b69..2a1dd63a 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/DeprecationReporter.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/DeprecationReporter.java
@@ -94,7 +94,7 @@ public final class DeprecationReporter {
      */
     public static void logDeprecated(final Class<?> clazz) {
         if (clazz.getAnnotation(Deprecated.class) != null) {
-            String name = format("Deprecated class used: %s ", clazz);
+            String name = format("class used: %s ", clazz);
             Info info = clazz.getAnnotation(Info.class);
             if (info == null) {
                 DefaultLog.getInstance().warn(formatEntry(name, "", false, 
""));
diff --git 
a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java 
b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
index 9cad66cf..693af5bf 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
@@ -141,9 +141,13 @@ public final class TikaProcessor {
                         .setDocumentType(fromMediaType(mediaType));
             }
             if (Document.Type.STANDARD == 
document.getMetaData().getDocumentType()) {
-                document.getMetaData().setCharset(detectCharset(stream, 
document.getName()));
-                if (NoteGuesser.isNote(document)) {
-                    
document.getMetaData().setDocumentType(Document.Type.NOTICE);
+                try {
+                    document.getMetaData().setCharset(detectCharset(stream, 
document.getName()));
+                    if (NoteGuesser.isNote(document)) {
+                        
document.getMetaData().setDocumentType(Document.Type.NOTICE);
+                    }
+                } catch (UnsupportedCharsetException e) {
+                    
document.getMetaData().setDocumentType(Document.Type.UNKNOWN);
                 }
             }
             return result;
@@ -155,11 +159,12 @@ public final class TikaProcessor {
     /**
      * Determine the character set for the input stream. Input stream must 
implement {@code mark}.
      * @param stream the stream to check.
-     * @param documentName the name of the document being read.
+     * @param documentName the name of the document being processed.
      * @return the detected character set or {@code null} if not detectable.
      * @throws IOException on IO error.
+     * @throws UnsupportedCharsetException on unsupported charset.
      */
-    private static Charset detectCharset(final InputStream stream, final 
DocumentName documentName) throws IOException {
+    private static Charset detectCharset(final InputStream stream, final 
DocumentName documentName) throws IOException, UnsupportedCharsetException {
         CharsetDetector encodingDetector = new CharsetDetector();
         encodingDetector.setText(stream);
         CharsetMatch charsetMatch = encodingDetector.detect();
@@ -167,8 +172,9 @@ public final class TikaProcessor {
             try {
                 return Charset.forName(charsetMatch.getName());
             } catch (UnsupportedCharsetException e) {
-                DefaultLog.getInstance().warn(String.format("Unsupported 
character set '%s' in file '%s'. Will use system default encoding.",
-                                charsetMatch.getName(), documentName));
+                DefaultLog.getInstance().warn(String.format("Unsupported 
character set '%s' in file '%s'",
+                        charsetMatch.getName(), documentName));
+                throw e;
             }
         }
         return null;
diff --git a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java 
b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
index 0dbfa10c..a29babf7 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
@@ -109,7 +109,12 @@ public abstract class Document implements 
Comparable<Document> {
      * @throws IOException if this document cannot be read.
      */
     public Reader reader() throws IOException {
-        return new 
CharsetDetector().getReader(TikaProcessor.markSupportedInputStream(inputStream()),
 getMetaData().getCharset().name());
+        // RAT-494: Tika's CharsetDetector.getReader() may return null if the 
reader cannot be constructed due to I/O or encoding errors
+        Reader result = new 
CharsetDetector().getReader(TikaProcessor.markSupportedInputStream(inputStream()),
 getMetaData().getCharset().name());
+        if (result == null) {
+            throw new IOException(String.format("Can not read document `%s`", 
getName()));
+        }
+        return result;
     }
 
     /**
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 1c39e48d..b5ad5c49 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -72,6 +72,9 @@ The <action> type attribute can be one of:
     </release>
     -->
     <release version="0.17-SNAPSHOT" date="xxxx-yy-zz" description="Current 
SNAPSHOT - release to be done">
+      <action issue="RAT-494" type="fix" dev="claudenw" due-to="Tilman 
Hausherr">
+        Fix NPE when the encoding found in a scanned document is not supported by 
the currently used JDK. The Tika part of the bugfix can be found via TIKA-4505.
+      </action>
       <action issue="RAT-489" type="add" dev="pottlinger">
         Provide a central known issues section to the RAT homepage in order to 
inform users more directly about already known challenges with the current RAT 
version.
       </action>

Reply via email to