This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 7825b59cb TIKA-4153 -- revert changes to robots.txt detection and add unit test for robots file starting with comments
7825b59cb is described below

commit 7825b59cb383411a4928ecbadf25d4a3f6f07c28
Author: tallison <[email protected]>
AuthorDate: Mon Oct 16 10:36:20 2023 -0400

    TIKA-4153 -- revert changes to robots.txt detection and add unit test for robots file starting with comments
---
 .../resources/org/apache/tika/mime/tika-mimetypes.xml  | 18 ++++++++----------
 .../test/java/org/apache/tika/mime/TestMimeTypes.java  |  1 +
 .../src/test/resources/test-documents/testRobots2.txt  | 11 +++++++++++
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index b49e355e6..53808c752 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -2135,16 +2135,14 @@
     <!-- should have a higher priority than rfc822 - TIKA-3489 -->
     <magic priority="55">
       <match minShouldMatch="2">
-        <match minShouldMatch="1">
-          <match value="user-agent:" type="stringignorecase" offset="0"/>
-          <match value="sitemap:" type="stringignorecase" offset="0"/>
-        </match>
-        <match minShouldMatch="1">
-          <match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/>
-          <match value="\nallow:" type="stringignorecase" offset="0:1000"/>
-          <match value="\ndisallow:" type="stringignorecase" offset="0:1000"/>
-          <match value="\nsitemap:" type="stringignorecase" offset="0:1000"/>
-        </match>
+        <match value="user-agent:" type="stringignorecase" offset="0"/>
+        <match value="allow:" type="stringignorecase" offset="0"/>
+        <match value="disallow:" type="stringignorecase" offset="0"/>
+        <match value="sitemap:" type="stringignorecase" offset="0"/>
+        <match value="\nuser-agent:" type="stringignorecase" offset="0:1000"/>
+        <match value="\nallow:" type="stringignorecase" offset="0:1000"/>
+        <match value="\ndisallow:" type="stringignorecase" offset="0:1000"/>
+        <match value="\nsitemap:" type="stringignorecase" offset="0:1000"/>
       </match>
     </magic>
     <sub-class-of type="text/plain"/>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 73945f355..3dad7d6af 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -1023,6 +1023,7 @@ public class TestMimeTypes {
     @Test
     public void testRobots() throws Exception {
         assertTypeByData("text/x-robots", "testRobots.txt");
+        assertTypeByData("text/x-robots", "testRobots2.txt");
     }
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testRobots2.txt b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testRobots2.txt
new file mode 100644
index 000000000..2ad0152d0
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testRobots2.txt
@@ -0,0 +1,11 @@
+# elevate robots begin
+# robots.txt, added by the Elevate plugin for WordPress
+# file version: 1
+User-agent: *
+Disallow: /wp-admin/
+Disallow: /readme.html
+Disallow: /trackback/
+Allow: /wp-admin/admin-ajax.php
+Allow: /wp-content/uploads
+Sitemap: https://blahdeblah.com/sitemap.xml
+# elevate robots end
\ No newline at end of file

Reply via email to