This is an automated email from the ASF dual-hosted git repository.

claude pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/creadur-rat.git


The following commit(s) were added to refs/heads/master by this push:
     new 0a9559e1 RAT-81: Fixed encoding issue causing text files to not be 
read properly (#395)
0a9559e1 is described below

commit 0a9559e1d8726ba16c933ec560f7e5d42400360e
Author: Claude Warren <[email protected]>
AuthorDate: Mon Nov 18 17:21:02 2024 +0000

    RAT-81: Fixed encoding issue causing text files to not be read properly 
(#395)
    
    * Fixed encoding issue where text files not in UTF-8 encoding would not be 
read properly.
    
    Change adds charset to the metadata when it can be discovered.  If not, UTF-8 
is returned.
    
    Added integration test RAT-81 to show reading of UTF8 and IBM037 encoding 
works.
    
    * Minor fixes
    
    * RAT-81: Add changelog about encoding bugfix
    
    * added logging and removed dead code
    
    * fix for RAT-96
    
    Added mediaType and encoding attributes to XML output.
    Added updated DefaultAnalyserFactoryTests to account for change
    Added integration tests for RAT-147 and RAT-211 based on code in 
DefaultAnalyserFactoryTests
    Updated ReportTest to add dependencies and package jar to classpath for 
test.
    Fixed testing issues in AntUnit tests caused by addition of mediaType and 
encoding attributes.
    renamed reportTest directories to use a '_' rather than a '-' to account 
for java package names.
    
    * RAT-81: groovify the test code, minor fixes
    
    * RAT-81: Add mediaType and encoding to RAT report, minor fixes
    
    ---------
    
    Co-authored-by: P. Ottlinger <[email protected]>
    Co-authored-by: P. Ottlinger <[email protected]>
---
 apache-rat-core/pom.xml                            |   4 +
 .../src/it/java/org/apache/rat/ReportTest.java     |  11 ++-
 .../it/resources/ReportTest/RAT-246/src/.gitignore |   1 -
 .../ReportTest/RAT-246/src/dir1/FileToIgnore       |   0
 .../resources/ReportTest/RAT_147/commandLine.txt   |   4 +
 .../ReportTest/RAT_147/src/unix-newlines.txt.bin   |   8 ++
 .../RAT_147/src/windows-newlines.txt.bin           |   9 +++
 .../it/resources/ReportTest/RAT_147/verify.groovy  |  62 +++++++++++++++
 .../resources/ReportTest/RAT_211/commandLine.txt   |   6 ++
 .../src/leader-election-message-arrives.dia        | Bin 0 -> 5796 bytes
 .../resources/ReportTest/RAT_211/src/side_left.bmp | Bin 0 -> 345238 bytes
 .../it/resources/ReportTest/RAT_211/verify.groovy  |  56 ++++++++++++++
 .../{RAT-246 => RAT_246}/commandLine.txt           |   0
 .../ReportTest/{RAT-246 => RAT_246}/notes.md       |   0
 .../{RAT-246 => RAT_246}/src/dir1/FileToProcess    |   0
 .../ReportTest/{RAT-246 => RAT_246}/verify.groovy  |   6 +-
 .../{RAT-408 => RAT_408}/commandLine.txt           |   0
 .../{RAT-408 => RAT_408}/expected-message.txt      |   0
 .../src/karapace/anonymize_schemas/_init_.py       |   0
 .../it/resources/ReportTest/RAT_81/commandLine.txt |   2 +
 .../it/resources/ReportTest/RAT_81/src/IBM037.txt  |   1 +
 .../it/resources/ReportTest/RAT_81/src/UTF8.txt    |  20 +++++
 .../it/resources/ReportTest/RAT_81/verify.groovy   |  61 +++++++++++++++
 .../src/it/resources/ReportTest/readme.md          |   2 +
 .../org/apache/rat/analysis/TikaProcessor.java     |  46 ++++++++++-
 .../src/main/java/org/apache/rat/api/Document.java |  12 ++-
 .../src/main/java/org/apache/rat/api/MetaData.java |  19 +++++
 .../apache/rat/document/ArchiveEntryDocument.java  |   8 --
 .../java/org/apache/rat/document/FileDocument.java |   8 +-
 .../org/apache/rat/report/xml/XmlElements.java     |  62 ++++++++++-----
 .../main/resources/org/apache/rat/rat-report.xsd   |   2 +
 .../rat/analysis/DefaultAnalyserFactoryTest.java   |  84 ++++++++++-----------
 .../resources/antunit/report-normal-operation.xml  |   2 +-
 pom.xml                                            |   8 +-
 src/changes/changes.xml                            |   9 +++
 35 files changed, 419 insertions(+), 94 deletions(-)

diff --git a/apache-rat-core/pom.xml b/apache-rat-core/pom.xml
index 145cd4f7..8a500a5f 100644
--- a/apache-rat-core/pom.xml
+++ b/apache-rat-core/pom.xml
@@ -196,6 +196,10 @@
       <groupId>org.apache.tika</groupId>
       <artifactId>tika-core</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+    </dependency>
     <dependency>
       <!-- this dependency is actually used by integeration testing code -->
       <groupId>org.codehaus.groovy</groupId>
diff --git a/apache-rat-core/src/it/java/org/apache/rat/ReportTest.java 
b/apache-rat-core/src/it/java/org/apache/rat/ReportTest.java
index 6654ed58..45d74840 100644
--- a/apache-rat-core/src/it/java/org/apache/rat/ReportTest.java
+++ b/apache-rat-core/src/it/java/org/apache/rat/ReportTest.java
@@ -18,7 +18,6 @@
  */
 package org.apache.rat;
 
-import groovy.lang.Binding;
 import groovy.lang.GroovyShell;
 import java.io.File;
 import java.io.FileReader;
@@ -48,6 +47,7 @@ import org.apache.rat.report.RatReport;
 import org.apache.rat.utils.DefaultLog;
 import org.apache.rat.utils.Log;
 import org.apache.rat.walker.DirectoryWalker;
+import org.codehaus.groovy.control.CompilerConfiguration;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;
@@ -123,9 +123,12 @@ public class ReportTest {
         File groovyScript = new File(baseDir, "verify.groovy");
         if (groovyScript.exists()) {
             // call groovy expressions from Java code
-            Binding binding = new Binding();
-            GroovyShell shell = new GroovyShell(binding);
+            CompilerConfiguration compilerConfiguration = new 
CompilerConfiguration();
 
+            GroovyShell shell = new GroovyShell(compilerConfiguration);
+            for (String classPath : 
System.getProperty("java.class.path").split(File.pathSeparator)) {
+                shell.getClassLoader().addClasspath(classPath);
+            }
             Object value = shell.run(groovyScript, new 
String[]{outputFile.getAbsolutePath(), logFile.getAbsolutePath()});
             if (value != null) {
                 fail(String.format("%s",value));
@@ -194,7 +197,7 @@ public class ReportTest {
         @Override
         public void log(Level level, String msg) {
             if (isEnabled(level)) {
-                logFile.println(String.format("%s: %s", level, msg));
+                logFile.printf("%s: %s%n", level, msg);
             }
         }
 
diff --git a/apache-rat-core/src/it/resources/ReportTest/RAT-246/src/.gitignore 
b/apache-rat-core/src/it/resources/ReportTest/RAT-246/src/.gitignore
deleted file mode 100644
index 4b2162f1..00000000
--- a/apache-rat-core/src/it/resources/ReportTest/RAT-246/src/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-**/FileToIgnore
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT-246/src/dir1/FileToIgnore 
b/apache-rat-core/src/it/resources/ReportTest/RAT-246/src/dir1/FileToIgnore
deleted file mode 100644
index e69de29b..00000000
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT_147/commandLine.txt 
b/apache-rat-core/src/it/resources/ReportTest/RAT_147/commandLine.txt
new file mode 100644
index 00000000..91180535
--- /dev/null
+++ b/apache-rat-core/src/it/resources/ReportTest/RAT_147/commandLine.txt
@@ -0,0 +1,4 @@
+--counter-max
+UNAPPROVED:2
+--output-style
+xml
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT_147/src/unix-newlines.txt.bin 
b/apache-rat-core/src/it/resources/ReportTest/RAT_147/src/unix-newlines.txt.bin
new file mode 100644
index 00000000..2c498da3
--- /dev/null
+++ 
b/apache-rat-core/src/it/resources/ReportTest/RAT_147/src/unix-newlines.txt.bin
@@ -0,0 +1,8 @@
+sentence 1.
+sentence 2.
+
+
+sentence 3.
+
+sentence 4.
+
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT_147/src/windows-newlines.txt.bin
 
b/apache-rat-core/src/it/resources/ReportTest/RAT_147/src/windows-newlines.txt.bin
new file mode 100644
index 00000000..a0adb98f
--- /dev/null
+++ 
b/apache-rat-core/src/it/resources/ReportTest/RAT_147/src/windows-newlines.txt.bin
@@ -0,0 +1,9 @@
+sentence 1.
+sentence 2.
+
+
+sentence 3.
+
+sentence 4.
+
+
diff --git a/apache-rat-core/src/it/resources/ReportTest/RAT_147/verify.groovy 
b/apache-rat-core/src/it/resources/ReportTest/RAT_147/verify.groovy
new file mode 100644
index 00000000..e531e870
--- /dev/null
+++ b/apache-rat-core/src/it/resources/ReportTest/RAT_147/verify.groovy
@@ -0,0 +1,62 @@
+package ReportTest.RAT_246
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.rat.testhelpers.TextUtils
+import org.apache.rat.testhelpers.XmlUtils
+import org.w3c.dom.NodeList
+
+import javax.xml.xpath.XPath
+import javax.xml.xpath.XPathFactory
+
+import static org.junit.jupiter.api.Assertions.assertEquals
+
+doc = XmlUtils.toDom(new FileInputStream(args[0]))
+XPath xPath = XPathFactory.newInstance().newXPath()
+
+NodeList nodeList = XmlUtils.getNodeList(doc, xPath, 
"/rat-report/resource[@name='/unix-newlines.txt.bin']")
+assertEquals(1, nodeList.getLength())
+node = nodeList.item(0)
+attributes = node.getAttributes()
+assertEquals("UTF-8", attributes.getNamedItem("encoding").getNodeValue())
+assertEquals("text/plain", attributes.getNamedItem("mediaType").getNodeValue())
+assertEquals("STANDARD", attributes.getNamedItem("type").getNodeValue())
+nodeList = XmlUtils.getNodeList(node, xPath, "license")
+assertEquals(1, nodeList.getLength())
+node = nodeList.item(0)
+attributes = node.getAttributes()
+assertEquals("false", attributes.getNamedItem("approval").getNodeValue())
+
+
+nodeList = XmlUtils.getNodeList(doc, xPath, 
"/rat-report/resource[@name='/windows-newlines.txt.bin']")
+assertEquals(1, nodeList.getLength())
+node = nodeList.item(0)
+attributes = node.getAttributes()
+assertEquals("UTF-8", attributes.getNamedItem("encoding").getNodeValue())
+assertEquals("text/plain", attributes.getNamedItem("mediaType").getNodeValue())
+assertEquals("STANDARD", attributes.getNamedItem("type").getNodeValue())
+nodeList = XmlUtils.getNodeList(node, xPath, "license")
+assertEquals(1, nodeList.getLength())
+node = nodeList.item(0)
+attributes = node.getAttributes()
+assertEquals("false", attributes.getNamedItem("approval").getNodeValue())
+
+logOutput = new File(args[1])
+log = logOutput.text
+
+TextUtils.assertPatternNotInTarget("^ERROR:", log)
+TextUtils.assertPatternNotInTarget("^WARN:", log)
\ No newline at end of file
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT_211/commandLine.txt 
b/apache-rat-core/src/it/resources/ReportTest/RAT_211/commandLine.txt
new file mode 100644
index 00000000..119348ed
--- /dev/null
+++ b/apache-rat-core/src/it/resources/ReportTest/RAT_211/commandLine.txt
@@ -0,0 +1,6 @@
+--counter-min
+LICENSE_CATEGORIES:0
+LICENSE_NAMES:0
+STANDARDS:0
+--output-style
+xml
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT_211/src/leader-election-message-arrives.dia
 
b/apache-rat-core/src/it/resources/ReportTest/RAT_211/src/leader-election-message-arrives.dia
new file mode 100644
index 00000000..41fa5300
Binary files /dev/null and 
b/apache-rat-core/src/it/resources/ReportTest/RAT_211/src/leader-election-message-arrives.dia
 differ
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT_211/src/side_left.bmp 
b/apache-rat-core/src/it/resources/ReportTest/RAT_211/src/side_left.bmp
new file mode 100644
index 00000000..c1d3d69a
Binary files /dev/null and 
b/apache-rat-core/src/it/resources/ReportTest/RAT_211/src/side_left.bmp differ
diff --git a/apache-rat-core/src/it/resources/ReportTest/RAT_211/verify.groovy 
b/apache-rat-core/src/it/resources/ReportTest/RAT_211/verify.groovy
new file mode 100644
index 00000000..288fe4a6
--- /dev/null
+++ b/apache-rat-core/src/it/resources/ReportTest/RAT_211/verify.groovy
@@ -0,0 +1,56 @@
+package ReportTest.RAT_211
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.rat.testhelpers.TextUtils
+import org.apache.rat.testhelpers.XmlUtils
+import org.w3c.dom.NodeList
+
+import javax.xml.xpath.XPath
+import javax.xml.xpath.XPathFactory
+
+import static org.junit.jupiter.api.Assertions.assertEquals
+import static org.junit.jupiter.api.Assertions.assertNull
+
+doc = XmlUtils.toDom(new FileInputStream(args[0]))
+XPath xPath = XPathFactory.newInstance().newXPath()
+
+NodeList nodeList = XmlUtils.getNodeList(doc, xPath, 
"/rat-report/resource[@name='/leader-election-message-arrives.dia']")
+assertEquals(1, nodeList.getLength())
+node = nodeList.item(0)
+attributes = node.getAttributes()
+assertNull(attributes.getNamedItem("encoding"), "There should not be an 
encoding")
+assertEquals("application/gzip", 
attributes.getNamedItem("mediaType").getNodeValue())
+assertEquals("ARCHIVE", attributes.getNamedItem("type").getNodeValue())
+nodeList = XmlUtils.getNodeList(node, xPath, "license")
+assertEquals(0, nodeList.getLength())
+
+nodeList = XmlUtils.getNodeList(doc, xPath, 
"/rat-report/resource[@name='/side_left.bmp']")
+assertEquals(1, nodeList.getLength())
+node = nodeList.item(0)
+attributes = node.getAttributes()
+assertNull(attributes.getNamedItem("encoding"), "There should not be an 
encoding")
+assertEquals("image/bmp", attributes.getNamedItem("mediaType").getNodeValue())
+assertEquals("BINARY", attributes.getNamedItem("type").getNodeValue())
+nodeList = XmlUtils.getNodeList(node, xPath, "license")
+assertEquals(0, nodeList.getLength())
+
+logOutput = new File(args[1])
+log = logOutput.text
+
+TextUtils.assertPatternNotInTarget("^ERROR:", log)
+TextUtils.assertPatternNotInTarget("^WARN:", log)
\ No newline at end of file
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT-246/commandLine.txt 
b/apache-rat-core/src/it/resources/ReportTest/RAT_246/commandLine.txt
similarity index 100%
rename from apache-rat-core/src/it/resources/ReportTest/RAT-246/commandLine.txt
rename to apache-rat-core/src/it/resources/ReportTest/RAT_246/commandLine.txt
diff --git a/apache-rat-core/src/it/resources/ReportTest/RAT-246/notes.md 
b/apache-rat-core/src/it/resources/ReportTest/RAT_246/notes.md
similarity index 100%
rename from apache-rat-core/src/it/resources/ReportTest/RAT-246/notes.md
rename to apache-rat-core/src/it/resources/ReportTest/RAT_246/notes.md
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT-246/src/dir1/FileToProcess 
b/apache-rat-core/src/it/resources/ReportTest/RAT_246/src/dir1/FileToProcess
similarity index 100%
rename from 
apache-rat-core/src/it/resources/ReportTest/RAT-246/src/dir1/FileToProcess
rename to 
apache-rat-core/src/it/resources/ReportTest/RAT_246/src/dir1/FileToProcess
diff --git a/apache-rat-core/src/it/resources/ReportTest/RAT-246/verify.groovy 
b/apache-rat-core/src/it/resources/ReportTest/RAT_246/verify.groovy
similarity index 90%
rename from apache-rat-core/src/it/resources/ReportTest/RAT-246/verify.groovy
rename to apache-rat-core/src/it/resources/ReportTest/RAT_246/verify.groovy
index 5d089962..1be9253b 100644
--- a/apache-rat-core/src/it/resources/ReportTest/RAT-246/verify.groovy
+++ b/apache-rat-core/src/it/resources/ReportTest/RAT_246/verify.groovy
@@ -16,14 +16,14 @@ package ReportTest.RAT_246
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-output = new File(args[0]);
+output = new File(args[0])
 content = output.text
 
 assert !content.contains('/dir1/FileToIgnore')
 assert content.contains('/dir1/FileToProcess')
 
-logOutput = new File(args[1]);
+logOutput = new File(args[1])
 log = logOutput.text
 
 assert log.contains('INFO: Processing exclude file from GIT.')
-assert log.contains("INFO: Excluding GIT collection.")
+assert log.contains("INFO: Excluding GIT collection.")
\ No newline at end of file
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT-408/commandLine.txt 
b/apache-rat-core/src/it/resources/ReportTest/RAT_408/commandLine.txt
similarity index 100%
rename from apache-rat-core/src/it/resources/ReportTest/RAT-408/commandLine.txt
rename to apache-rat-core/src/it/resources/ReportTest/RAT_408/commandLine.txt
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT-408/expected-message.txt 
b/apache-rat-core/src/it/resources/ReportTest/RAT_408/expected-message.txt
similarity index 100%
rename from 
apache-rat-core/src/it/resources/ReportTest/RAT-408/expected-message.txt
rename to 
apache-rat-core/src/it/resources/ReportTest/RAT_408/expected-message.txt
diff --git 
a/apache-rat-core/src/it/resources/ReportTest/RAT-408/src/karapace/anonymize_schemas/_init_.py
 
b/apache-rat-core/src/it/resources/ReportTest/RAT_408/src/karapace/anonymize_schemas/_init_.py
similarity index 100%
rename from 
apache-rat-core/src/it/resources/ReportTest/RAT-408/src/karapace/anonymize_schemas/_init_.py
rename to 
apache-rat-core/src/it/resources/ReportTest/RAT_408/src/karapace/anonymize_schemas/_init_.py
diff --git a/apache-rat-core/src/it/resources/ReportTest/RAT_81/commandLine.txt 
b/apache-rat-core/src/it/resources/ReportTest/RAT_81/commandLine.txt
new file mode 100644
index 00000000..6981deb8
--- /dev/null
+++ b/apache-rat-core/src/it/resources/ReportTest/RAT_81/commandLine.txt
@@ -0,0 +1,2 @@
+--output-style
+xml
diff --git a/apache-rat-core/src/it/resources/ReportTest/RAT_81/src/IBM037.txt 
b/apache-rat-core/src/it/resources/ReportTest/RAT_81/src/IBM037.txt
new file mode 100644
index 00000000..fc5a8995
--- /dev/null
+++ b/apache-rat-core/src/it/resources/ReportTest/RAT_81/src/IBM037.txt
@@ -0,0 +1 @@
+a\@\@Ӊ������@��@���@������@▆�����@Ɩ��������@M���]@�����@���@@@\@\@��@����@�����������@�������@����������K@@ⅅ@���@������@����@\@\@�����������@����@����@����@���@����������@�����������@@@@@@@@\@\@���������@���������@���������K@@㈅@���@��������@����@����@@@\@\@��@���@�����@���@������@Ӊ�����k@兙����@�K�@M���@@@@@@@@@@@@\@\@Ӊ�����]^@���@���@���@���@����@����@������@��@����������@@@\@\@����@���@Ӊ�����K@@薤@���@������@�@�
 �����@��@���@Ӊ�����@��@@@\@\@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ [...]
\ No newline at end of file
diff --git a/apache-rat-core/src/it/resources/ReportTest/RAT_81/src/UTF8.txt 
b/apache-rat-core/src/it/resources/ReportTest/RAT_81/src/UTF8.txt
new file mode 100644
index 00000000..175b1890
--- /dev/null
+++ b/apache-rat-core/src/it/resources/ReportTest/RAT_81/src/UTF8.txt
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one   *
+ * or more contributor license agreements.  See the NOTICE file *
+ * distributed with this work for additional information        *
+ * regarding copyright ownership.  The ASF licenses this file   *
+ * to you under the Apache License, Version 2.0 (the            *
+ * "License"); you may not use this file except in compliance   *
+ * with the License.  You may obtain a copy of the License at   *
+ *                                                              *
+ *   http://www.apache.org/licenses/LICENSE-2.0                 *
+ *                                                              *
+ * Unless required by applicable law or agreed to in writing,   *
+ * software distributed under the License is distributed on an  *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
+ * KIND, either express or implied.  See the License for the    *
+ * specific language governing permissions and limitations      *
+ * under the License.                                           *
+ */
+
+This is a file that has the IBM037 encoding.  Hopefully it will not fail.
diff --git a/apache-rat-core/src/it/resources/ReportTest/RAT_81/verify.groovy 
b/apache-rat-core/src/it/resources/ReportTest/RAT_81/verify.groovy
new file mode 100644
index 00000000..a3a78c3a
--- /dev/null
+++ b/apache-rat-core/src/it/resources/ReportTest/RAT_81/verify.groovy
@@ -0,0 +1,61 @@
+package ReportTest.RAT_81
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.rat.testhelpers.TextUtils
+import org.apache.rat.testhelpers.XmlUtils
+import org.w3c.dom.NodeList
+
+import javax.xml.xpath.XPath
+import javax.xml.xpath.XPathFactory
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+doc = XmlUtils.toDom(new FileInputStream(args[0]))
+XPath xPath = XPathFactory.newInstance().newXPath()
+
+NodeList nodeList = XmlUtils.getNodeList(doc, xPath, 
"/rat-report/resource[@name='/IBM037.txt']")
+assertEquals(1, nodeList.getLength())
+node = nodeList.item(0)
+attributes = node.getAttributes()
+assertEquals("IBM500", attributes.getNamedItem("encoding").getNodeValue())
+assertEquals("text/plain", attributes.getNamedItem("mediaType").getNodeValue())
+assertEquals("STANDARD", attributes.getNamedItem("type").getNodeValue())
+nodeList = XmlUtils.getNodeList(node, xPath, "license")
+assertEquals(1, nodeList.getLength())
+node = nodeList.item(0)
+attributes = node.getAttributes()
+assertEquals("true", attributes.getNamedItem("approval").getNodeValue())
+
+nodeList = XmlUtils.getNodeList(doc, xPath, 
"/rat-report/resource[@name='/UTF8.txt']")
+assertEquals(1, nodeList.getLength())
+node = nodeList.item(0)
+attributes = node.getAttributes()
+assertEquals("ISO-8859-1", attributes.getNamedItem("encoding").getNodeValue())
+assertEquals("text/plain", attributes.getNamedItem("mediaType").getNodeValue())
+assertEquals("STANDARD", attributes.getNamedItem("type").getNodeValue())
+nodeList = XmlUtils.getNodeList(node, xPath, "license")
+assertEquals(1, nodeList.getLength())
+node = nodeList.item(0)
+attributes = node.getAttributes()
+assertEquals("true", attributes.getNamedItem("approval").getNodeValue())
+
+logOutput = new File(args[1])
+log = logOutput.text
+
+TextUtils.assertPatternNotInTarget("^ERROR:", log)
+TextUtils.assertPatternNotInTarget("^WARN:", log)
\ No newline at end of file
diff --git a/apache-rat-core/src/it/resources/ReportTest/readme.md 
b/apache-rat-core/src/it/resources/ReportTest/readme.md
index 6c9937e4..f46013fc 100644
--- a/apache-rat-core/src/it/resources/ReportTest/readme.md
+++ b/apache-rat-core/src/it/resources/ReportTest/readme.md
@@ -1 +1,3 @@
 Directories here are structured to be read by ReportTest.
+
+Directories should be named `RAT_###` to correspond with `RAT-###` jira 
tickets where the association exists. 
diff --git 
a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java 
b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
index 12658eb4..c88328bc 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
@@ -18,18 +18,25 @@
  */
 package org.apache.rat.analysis;
 
+import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.HashMap;
 import java.util.Map;
 
 import org.apache.rat.api.Document;
+import org.apache.rat.document.DocumentName;
 import org.apache.rat.document.RatDocumentAnalysisException;
 import org.apache.rat.document.guesser.NoteGuesser;
+import org.apache.rat.utils.DefaultLog;
 import org.apache.tika.Tika;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
 
 /**
  * A wrapping around the tika processor.
@@ -105,6 +112,15 @@ public final class TikaProcessor {
         return new HashMap<>(DOCUMENT_TYPE_MAP);
     }
 
+    /**
+     * Ensures that the input stream supports {@code mark}.
+     * @param stream the stream to test.
+     * @return a stream that supports {@code mark}.
+     */
+    public static InputStream markSupportedInputStream(final InputStream 
stream) {
+        return stream.markSupported() ? stream : new 
BufferedInputStream(stream);
+    }
+
     /**
      * Process the input document.
      * @param document the Document to process.
@@ -113,7 +129,7 @@ public final class TikaProcessor {
      */
     public static String process(final Document document) throws 
RatDocumentAnalysisException {
         Metadata metadata = new Metadata();
-        try (InputStream stream = document.inputStream()) {
+        try (InputStream stream = 
markSupportedInputStream(document.inputStream())) {
             metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, 
document.getName().getName());
             String result = TIKA.detect(stream, metadata);
             String[] parts = result.split("/");
@@ -122,6 +138,7 @@ public final class TikaProcessor {
             document.getMetaData()
                     .setDocumentType(fromMediaType(mediaType));
             if (Document.Type.STANDARD == 
document.getMetaData().getDocumentType()) {
+                document.getMetaData().setCharset(detectCharset(stream, 
document.getName()));
                 if (NoteGuesser.isNote(document)) {
                     
document.getMetaData().setDocumentType(Document.Type.NOTICE);
                 }
@@ -133,6 +150,33 @@ public final class TikaProcessor {
         }
     }
 
+    /**
+     * Determine the character set for the input stream. Input stream must 
implement {@code mark}.
+     * @param stream the stream to check.
+     * @param documentName the name of the document being read.
+     * @return the detected character set or {@code null} if not detectable.
+     * @throws IOException on IO error.
+     */
+    private static Charset detectCharset(final InputStream stream, final 
DocumentName documentName) throws IOException {
+        CharsetDetector encodingDetector = new CharsetDetector();
+        encodingDetector.setText(stream);
+        CharsetMatch charsetMatch = encodingDetector.detect();
+        if (charsetMatch != null) {
+            try {
+                return Charset.forName(charsetMatch.getName());
+            } catch (UnsupportedCharsetException e) {
+                DefaultLog.getInstance().warn(String.format("Unsupported 
character set '%s' in file '%s'.  Will use system default encoding.",
+                                charsetMatch.getName(), documentName));
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Gets the Document.Type based on the MediaType.
+     * @param mediaType the media type to check.
+     * @return The document type.
+     */
     public static Document.Type fromMediaType(final MediaType mediaType) {
         if ("text".equals(mediaType.getType())) {
             return Document.Type.STANDARD;
diff --git a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java 
b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
index 7628e6b4..f5fd2e94 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
@@ -23,8 +23,10 @@ import java.io.InputStream;
 import java.io.Reader;
 import java.util.SortedSet;
 
+import org.apache.rat.analysis.TikaProcessor;
 import org.apache.rat.document.DocumentName;
 import org.apache.rat.document.DocumentNameMatcher;
+import org.apache.tika.parser.txt.CharsetDetector;
 
 /**
  * The representation of a document being scanned.
@@ -46,7 +48,7 @@ public abstract class Document implements 
Comparable<Document> {
         /** A standard document. */
         STANDARD,
         /** An unknown document type. */
-        UNKNOWN;
+        UNKNOWN
     }
 
     /** The path matcher used by this document */
@@ -104,14 +106,16 @@ public abstract class Document implements 
Comparable<Document> {
     /**
      * Reads the contents of this document.
      * @return <code>Reader</code> not null
-     * @throws IOException if this document cannot be read
+     * @throws IOException if this document cannot be read.
      */
-    public abstract Reader reader() throws IOException;
+    public Reader reader() throws IOException {
+        return new 
CharsetDetector().getReader(TikaProcessor.markSupportedInputStream(inputStream()),
 getMetaData().getCharset().name());
+    }
 
     /**
      * Streams the document's contents.
      * @return a non-null input stream of the document.
-     * @throws IOException when stream could not be opened
+     * @throws IOException when stream could not be opened.
      */
     public abstract InputStream inputStream() throws IOException;
 
diff --git a/apache-rat-core/src/main/java/org/apache/rat/api/MetaData.java 
b/apache-rat-core/src/main/java/org/apache/rat/api/MetaData.java
index 806e0c0d..e73bb86e 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/api/MetaData.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/api/MetaData.java
@@ -18,6 +18,7 @@
  */
 package org.apache.rat.api;
 
+import java.nio.charset.Charset;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.SortedSet;
@@ -38,6 +39,8 @@ public class MetaData {
     private final SortedSet<ILicense> matchedLicenses;
     /** The list of License Family Categories that are approved */
     private final Set<String> approvedLicenses;
+    /** The charset for this document */
+    private Charset charset;
     /** The media type for this document */
     private MediaType mediaType;
     /** The document type for this document */
@@ -53,6 +56,22 @@ public class MetaData {
         this.approvedLicenses = new HashSet<>();
     }
 
+    /**
+     * Gets the charset for the document. If the charset was not set, the 
system default charset is returned.
+     * @return the charset for the document
+     */
+    public Charset getCharset() {
+        return charset == null ? Charset.defaultCharset() : charset;
+    }
+
+    /**
+     * Sets the charset for the document. If set to {@code null} the system 
default charset will be used.
+     * @param charset the charset to use.
+     */
+    public void setCharset(final Charset charset) {
+        this.charset = charset;
+    }
+
     /**
      * Gets the defined media type.
      * @return the media type.
diff --git 
a/apache-rat-core/src/main/java/org/apache/rat/document/ArchiveEntryDocument.java
 
b/apache-rat-core/src/main/java/org/apache/rat/document/ArchiveEntryDocument.java
index e25e44db..0553e433 100644
--- 
a/apache-rat-core/src/main/java/org/apache/rat/document/ArchiveEntryDocument.java
+++ 
b/apache-rat-core/src/main/java/org/apache/rat/document/ArchiveEntryDocument.java
@@ -21,9 +21,6 @@ package org.apache.rat.document;
 
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
 import java.util.Collections;
 import java.util.SortedSet;
 
@@ -62,9 +59,4 @@ public class ArchiveEntryDocument extends Document {
     public SortedSet<Document> listChildren() {
         return Collections.emptySortedSet();
     }
-
-    @Override
-    public Reader reader() {
-        return new InputStreamReader(new ByteArrayInputStream(contents), 
StandardCharsets.UTF_8);
-    }
 }
diff --git 
a/apache-rat-core/src/main/java/org/apache/rat/document/FileDocument.java 
b/apache-rat-core/src/main/java/org/apache/rat/document/FileDocument.java
index 37bbfe79..31a7deb3 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/document/FileDocument.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/document/FileDocument.java
@@ -19,10 +19,8 @@
 package org.apache.rat.document;
 
 import java.io.File;
-import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.Reader;
 import java.nio.file.Files;
 import java.util.Arrays;
 import java.util.Collections;
@@ -33,7 +31,7 @@ import org.apache.rat.api.Document;
 import org.apache.rat.config.exclusion.ExclusionUtils;
 
 /**
- * Document wrapping a File object
+ * Document wrapping a File object.
  */
 public class FileDocument extends Document {
 
@@ -69,10 +67,6 @@ public class FileDocument extends Document {
         return Collections.emptySortedSet();
     }
 
-    public Reader reader() throws IOException {
-        return new FileReader(file);
-    }
-
     public InputStream inputStream() throws IOException {
         return Files.newInputStream(file.toPath());
     }
diff --git 
a/apache-rat-core/src/main/java/org/apache/rat/report/xml/XmlElements.java 
b/apache-rat-core/src/main/java/org/apache/rat/report/xml/XmlElements.java
index 6b1ef68b..82ea5ea1 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/report/xml/XmlElements.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/report/xml/XmlElements.java
@@ -30,11 +30,22 @@ import org.apache.rat.api.MetaData;
 import org.apache.rat.api.RatException;
 import org.apache.rat.license.ILicense;
 import org.apache.rat.report.xml.writer.IXmlWriter;
+import org.apache.rat.utils.CasedString;
 
 /**
  * Creates the elements in the XML report.
  */
 public class XmlElements {
+    /**
+     * Converts an enum name from snake case to camel case.
+     * @param name the enum name to convert.
+     * @return a camel cased name.
+     */
+    private static String normalizeName(final String name) {
+        CasedString casedName = new CasedString(CasedString.StringCase.SNAKE, 
name.toLowerCase(Locale.ROOT));
+       return casedName.toCase(CasedString.StringCase.CAMEL);
+    }
+
     /**
      * The elements in the report.
      */
@@ -42,28 +53,28 @@ public class XmlElements {
         /** The start of the Rat report */
         RAT_REPORT("rat-report"),
         /** The version of Rat being run */
-        VERSION("version"),
+        VERSION(),
         /** A resource element */
-        RESOURCE("resource"),
+        RESOURCE(),
         /** A license element */
-        LICENSE("license"),
+        LICENSE(),
         /** A notes element */
-        NOTES("notes"),
+        NOTES(),
         /** A sample from the file */
-        SAMPLE("sample"),
+        SAMPLE(),
         /** A statistics element */
-        STATISTICS("statistics"),
+        STATISTICS(),
         /** A statistic entry */
-        STATISTIC("statistic"),
+        STATISTIC(),
         /** A license name entry */
-        LICENSE_NAME("licenseName"),
+        LICENSE_NAME(),
         /** A license category entry */
-        LICENSE_CATEGORY("licenseCategory"),
+        LICENSE_CATEGORY(),
         /** A document type entry */
-        DOCUMENT_TYPE("documentType");
+        DOCUMENT_TYPE();
 
         /** The XML name for the element */
-        private String elementName;
+        private final String elementName;
 
         /**
          * Constructor.
@@ -73,6 +84,10 @@ public class XmlElements {
             this.elementName = elementName;
         }
 
+        Elements() {
+            this.elementName = normalizeName(name());
+        }
+
         /**
          * Gets the XML element name.
          * @return the XML element name.
@@ -80,7 +95,7 @@ public class XmlElements {
         public String getElementName() {
             return elementName;
         }
-    };
+    }
 
     /**
      * The attributes of elements in the report.
@@ -98,17 +113,21 @@ public class XmlElements {
         APPROVAL,
         /** The family category */
         FAMILY,
-        /** The type */
+        /** The document type */
         TYPE,
-        /** The Id */
+        /** The id */
         ID,
         /** The name */
         NAME,
         /** A counter */
         COUNT,
         /** A description */
-        DESCRIPTION
-    };
+        DESCRIPTION,
+        /** The media type for a document */
+        MEDIA_TYPE,
+        /** The encoding for a text document */
+        ENCODING
+    }
 
     /** The XMLWriter that we write to */
     private final IXmlWriter writer;
@@ -187,9 +206,14 @@ public class XmlElements {
      */
     public XmlElements document(final Document document) throws RatException {
         final MetaData metaData = document.getMetaData();
-        return write(Elements.RESOURCE)
+        XmlElements result = write(Elements.RESOURCE)
                 .write(Attributes.NAME, document.getName().localized("/"))
-                .write(Attributes.TYPE, metaData.getDocumentType().toString());
+                .write(Attributes.TYPE, metaData.getDocumentType().toString())
+                .write(Attributes.MEDIA_TYPE, 
metaData.getMediaType().toString());
+        if (Document.Type.STANDARD == metaData.getDocumentType()) {
+            result = result.write(Attributes.ENCODING, 
metaData.getCharset().displayName());
+        }
+        return result;
     }
 
     /**
@@ -314,7 +338,7 @@ public class XmlElements {
      */
     public XmlElements write(final Attributes attribute, final String value) 
throws RatException {
         try {
-            writer.attribute(attribute.name().toLowerCase(Locale.ROOT), value);
+            writer.attribute(normalizeName(attribute.name()), value);
             return this;
         } catch (IOException e) {
             throw new RatException("Cannot open add attribute: " + attribute, 
e);
diff --git a/apache-rat-core/src/main/resources/org/apache/rat/rat-report.xsd 
b/apache-rat-core/src/main/resources/org/apache/rat/rat-report.xsd
index 5d002bc4..fdd530da 100644
--- a/apache-rat-core/src/main/resources/org/apache/rat/rat-report.xsd
+++ b/apache-rat-core/src/main/resources/org/apache/rat/rat-report.xsd
@@ -46,6 +46,8 @@
                         </xs:sequence>
                         <xs:attribute type="xs:string" name="name" 
use="required"/>
                         <xs:attribute type="xs:string" name="type" 
use="required"/>
+                        <xs:attribute type="xs:string" name="mediaType" 
use="required" />
+                        <xs:attribute type="xs:string" name="encoding" 
use="optional" />
                     </xs:complexType>
                 </xs:element>
                 <xs:element name="statistics" maxOccurs="1" minOccurs="0">
diff --git 
a/apache-rat-core/src/test/java/org/apache/rat/analysis/DefaultAnalyserFactoryTest.java
 
b/apache-rat-core/src/test/java/org/apache/rat/analysis/DefaultAnalyserFactoryTest.java
index d72fb423..bb724630 100644
--- 
a/apache-rat-core/src/test/java/org/apache/rat/analysis/DefaultAnalyserFactoryTest.java
+++ 
b/apache-rat-core/src/test/java/org/apache/rat/analysis/DefaultAnalyserFactoryTest.java
@@ -25,6 +25,9 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 import java.io.File;
 import java.io.StringWriter;
 
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Stream;
 import org.apache.rat.Defaults;
 import org.apache.rat.ReportConfiguration;
 import org.apache.rat.api.Document;
@@ -38,6 +41,9 @@ import org.apache.rat.testhelpers.TextUtils;
 import org.assertj.core.util.Files;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
 
 public class DefaultAnalyserFactoryTest {
 
@@ -63,9 +69,6 @@ public class DefaultAnalyserFactoryTest {
     @Test
     public void standardTypeAnalyser() throws Exception {
         String[] expected = {
-                "<resource name='/elements/Text.txt' type='STANDARD'>"
-                        + "<license id='?????' name='Unknown license' 
approval='false' family='?????'/>"
-                        + "<sample><![CDATA[ /*", //
                 " * Licensed to the Apache Software Foundation (ASF) under 
one", //
                 " * or more contributor license agreements.  See the NOTICE 
file", //
                 " * distributed with this work for additional information", //
@@ -79,16 +82,19 @@ public class DefaultAnalyserFactoryTest {
                 " * \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY", 
//
                 " * KIND, either express or implied.  See the License for 
the", //
                 " * specific language governing permissions and limitations", 
//
-                " * under the License.", //
-                " ]]></sample></resource>" };
+                " * under the License."
+        };
 
         final Document document = new FileDocument(basedir,
                 Resources.getResourceFile("/elements/Text.txt"), p -> true);
         analyser.analyse(document);
-        reporter.report(document);
-        String result = out.toString();
+        assertEquals(Document.Type.STANDARD, 
document.getMetaData().getDocumentType());
+        assertEquals("text/plain", 
document.getMetaData().getMediaType().toString());
+        assertEquals(1, document.getMetaData().licenses().count());
+        document.getMetaData().licenses().forEach(lic -> 
assertEquals(UnknownLicense.INSTANCE, lic));
+        String result = document.getMetaData().getSampleHeader();
         for (String exp : expected) {
-            assertTrue(result.contains(exp), () -> exp);
+            assertTrue(result.contains(exp), exp);
         }
     }
 
@@ -97,8 +103,8 @@ public class DefaultAnalyserFactoryTest {
         final Document document = new FileDocument(basedir,
                 Resources.getResourceFile("/elements/LICENSE"), p -> true);
         analyser.analyse(document);
-        reporter.report(document);
-        assertEquals("<resource name='/elements/LICENSE' type='NOTICE'/>", 
out.toString());
+        assertEquals(Document.Type.NOTICE, 
document.getMetaData().getDocumentType());
+        assertEquals("text/plain", 
document.getMetaData().getMediaType().toString());
     }
 
     @Test
@@ -106,8 +112,8 @@ public class DefaultAnalyserFactoryTest {
         final Document document = new FileDocument(basedir,
                 Resources.getResourceFile("/elements/Image.png"), p -> true);
         analyser.analyse(document);
-        reporter.report(document);
-        assertEquals("<resource name='/elements/Image.png' type='BINARY'/>", 
out.toString());
+        assertEquals(Document.Type.BINARY, 
document.getMetaData().getDocumentType());
+        assertEquals("image/png", 
document.getMetaData().getMediaType().toString());
     }
 
     @Test
@@ -119,42 +125,32 @@ public class DefaultAnalyserFactoryTest {
         config.setFrom(defaults);
         analyser = DefaultAnalyserFactory.createDefaultAnalyser(config);
         analyser.analyse(document);
-        reporter.report(document);
-        assertEquals("<resource name='/elements/dummy.jar' type='ARCHIVE'/>", 
out.toString());
+        assertEquals(Document.Type.ARCHIVE, 
document.getMetaData().getDocumentType());
+        assertEquals("application/java-archive", 
document.getMetaData().getMediaType().toString());
     }
 
-    @Test
-    public void archivesAbsenceTest() throws Exception {
-        final Document document = new FileDocument(basedir,
-                Resources.getResourceFile("/elements/dummy.jar"), p -> true);
-        Defaults defaults = Defaults.builder().build();
-        ReportConfiguration config = new ReportConfiguration();
-        config.setFrom(defaults);
-        config.setArchiveProcessing(ReportConfiguration.Processing.ABSENCE);
-        analyser = DefaultAnalyserFactory.createDefaultAnalyser(config);
-        analyser.analyse(document);
-        reporter.report(document);
-        String result = out.toString();
-        TextUtils.assertContains("<resource name='/elements/dummy.jar' 
type='ARCHIVE'>", result);
-        TextUtils.assertContains("<license id='?????' name='Unknown license' 
approval='false' family='?????'/>", result);
-        TextUtils.assertContains("<license id='ASL' name='Applied Apache 
License Version 2.0' approval='false' family='AL   '/>", result);
+    private static Stream<Arguments> archivesAbsenceTestData() {
+        List<Arguments> lst = new ArrayList<>();
+        lst.add(Arguments.of(ReportConfiguration.Processing.NOTIFICATION, 0));
+        lst.add(Arguments.of(ReportConfiguration.Processing.PRESENCE, 2));
+        lst.add(Arguments.of(ReportConfiguration.Processing.ABSENCE, 3));
+        return lst.stream();
     }
 
-    @Test
-    public void archivesPresenceTest() throws Exception {
+    @ParameterizedTest
+    @MethodSource("archivesAbsenceTestData")
+    public void archivesAbsenceTest(ReportConfiguration.Processing 
archiveProcessing, int expectedLicenseCount) throws Exception {
         final Document document = new FileDocument(basedir,
                 Resources.getResourceFile("/elements/dummy.jar"), p -> true);
         Defaults defaults = Defaults.builder().build();
         ReportConfiguration config = new ReportConfiguration();
         config.setFrom(defaults);
-        config.setArchiveProcessing(ReportConfiguration.Processing.PRESENCE);
+        config.setArchiveProcessing(archiveProcessing);
         analyser = DefaultAnalyserFactory.createDefaultAnalyser(config);
         analyser.analyse(document);
-        reporter.report(document);
-        String result = out.toString();
-        TextUtils.assertContains("<resource name='/elements/dummy.jar' 
type='ARCHIVE'>", result);
-        TextUtils.assertNotContains("<license id='?????' name='Unknown 
license' approval='false' family='?????'/>", result);
-        TextUtils.assertContains("<license id='ASL' name='Applied Apache 
License Version 2.0' approval='false' family='AL   '/>", result);
+        assertEquals(Document.Type.ARCHIVE, 
document.getMetaData().getDocumentType());
+        assertEquals("application/java-archive", 
document.getMetaData().getMediaType().toString());
+        assertEquals(expectedLicenseCount, 
document.getMetaData().licenses().count());
     }
 
     @Test
@@ -162,8 +158,8 @@ public class DefaultAnalyserFactoryTest {
         final Document document = new FileDocument(basedir,
                 Resources.getResourceFile("/elements/dummy.jar"), p -> true);
         analyser.analyse(document);
-        reporter.report(document);
-        assertEquals("<resource name='/elements/dummy.jar' type='ARCHIVE'/>", 
out.toString());
+        assertEquals(Document.Type.ARCHIVE, 
document.getMetaData().getDocumentType());
+        assertEquals("application/java-archive", 
document.getMetaData().getMediaType().toString());
     }
 
     @Test
@@ -171,8 +167,8 @@ public class DefaultAnalyserFactoryTest {
         final Document document = new FileDocument(basedir,
                 Resources.getResourceFile("/jira/RAT211/side_left.bmp"), p -> 
true);
         analyser.analyse(document);
-        reporter.report(document);
-        assertEquals("<resource name='/jira/RAT211/side_left.bmp' 
type='BINARY'/>", out.toString());
+        assertEquals(Document.Type.BINARY, 
document.getMetaData().getDocumentType());
+        assertEquals("image/bmp", 
document.getMetaData().getMediaType().toString());
     }
 
     @Test
@@ -180,10 +176,8 @@ public class DefaultAnalyserFactoryTest {
         final Document document = new FileDocument(basedir,
                 
Resources.getResourceFile("/jira/RAT211/leader-election-message-arrives.dia"), 
p -> true);
         analyser.analyse(document);
-        reporter.report(document);
-        assertEquals(
-                "<resource 
name='/jira/RAT211/leader-election-message-arrives.dia' type='ARCHIVE'/>",
-                out.toString());
+        assertEquals(Document.Type.ARCHIVE, 
document.getMetaData().getDocumentType());
+        assertEquals("application/gzip", 
document.getMetaData().getMediaType().toString());
     }
 
     @Test
diff --git 
a/apache-rat-tasks/src/test/resources/antunit/report-normal-operation.xml 
b/apache-rat-tasks/src/test/resources/antunit/report-normal-operation.xml
index db495417..10cfd859 100644
--- a/apache-rat-tasks/src/test/resources/antunit/report-normal-operation.xml
+++ b/apache-rat-tasks/src/test/resources/antunit/report-normal-operation.xml
@@ -75,7 +75,7 @@ SPDX-License-Identifier: Apache-2.0
                </pathconvert>
                <property name="expectedOutput" value=" S 
/report-normal-operation.xml" />
                <property name="expectedOutputXML"
-                       value='&lt;resource 
name="/report-normal-operation.xml"' />
+                       value='&lt;resource encoding="ISO-8859-1" 
mediaType="application/xml" name="/report-normal-operation.xml" 
type="STANDARD"' />
                <property name="expectedOutputXML2" value='family="AL   "' />
        </target>
 
diff --git a/pom.xml b/pom.xml
index 4801ddab..eb5f216d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -50,6 +50,7 @@ agnostic home for software distribution comprehension and 
audit tools.
     <ant.version>1.10.15</ant.version>
     <mockito.version>4.11.0</mockito.version>
     <javaVersion>1.8</javaVersion>
+    <tika.version>2.9.2</tika.version>
     <maven.compiler.source>${javaVersion}</maven.compiler.source>
     <maven.compiler.target>${javaVersion}</maven.compiler.target>
     <!-- This is the version of Maven required to use the Rat Maven Plugin -->
@@ -198,7 +199,12 @@ agnostic home for software distribution comprehension and 
audit tools.
       <dependency>
         <groupId>org.apache.tika</groupId>
         <artifactId>tika-core</artifactId>
-        <version>2.9.2</version>
+        <version>${tika.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.tika</groupId>
+        <artifactId>tika-parser-text-module</artifactId>
+        <version>${tika.version}</version>
       </dependency>
       <dependency>
         <groupId>org.codehaus.groovy</groupId>
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index f0b8008d..cee0945f 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -72,6 +72,15 @@ The <action> type attribute can be one of:
     </release>
     -->
     <release version="0.17-SNAPSHOT" date="xxxx-yy-zz" description="Current 
SNAPSHOT - release to be done">
+      <action issue="RAT-81" type="add" dev="claudenw">
+        Added encoding information of the file being read to the RAT report in 
case of STANDARD document files.
+        Added media type attribute in report for all files.
+      </action>
+      <action issue="RAT-81" type="fix" dev="claudenw">
+        Fixed encoding issue where text files not in UTF-8 encoding would not 
be read properly.
+        Change adds charset to the metadata when it can be discovered. If not, 
UTF-8 is returned.
+        Added integration test to show reading of UTF8 and IBM037 encoding 
works.
+      </action>
       <action issue="RAT-408" type="fix" dev="claudenw">
         Added core integration tests and verified RAT-408 is fixed with the 
new exclusion engine.
       </action>

Reply via email to