Author: jukka
Date: Mon Nov 23 11:40:22 2009
New Revision: 883308

URL: http://svn.apache.org/viewvc?rev=883308&view=rev
Log:
TIKA-321: Optimize type detection speed

Refactor to reduce the number of Clause objects that type detection needs to go 
through

Added:
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java   
(with props)
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java   
(with props)
Removed:
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicClause.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Operator.java
Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Clause.java
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java

Added: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java?rev=883308&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java 
(added)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java 
Mon Nov 23 11:40:22 2009
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import java.util.Arrays;
+
+class AndClause implements Clause { // conjunction: true only if every operand clause is true
+
+    private final Clause[] clauses; // operand clauses; the array is stored as passed, not copied
+
+    AndClause(Clause... clauses) {
+        this.clauses = clauses;
+    }
+
+    public boolean eval(byte[] data) {
+        for (Clause clause : clauses) {
+            if (!clause.eval(data)) {
+                return false; // short-circuit: later operands are not evaluated
+            }
+        }
+        return true; // every operand accepted data (also the result for zero operands)
+    }
+
+    public int size() {
+        int size = 0;
+        for (Clause clause : clauses) {
+            size += clause.size(); // the size of a conjunction is the sum of its operand sizes
+        }
+        return size;
+    }
+
+    public String toString() {
+        return "and" + Arrays.toString(clauses); // renders as "and[op1, op2, ...]"
+    }
+
+}

Propchange: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Clause.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Clause.java?rev=883308&r1=883307&r2=883308&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Clause.java 
(original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Clause.java 
Mon Nov 23 11:40:22 2009
@@ -32,30 +32,4 @@
      */
     int size();
 
-    /** A clause that is always true. */
-    Clause TRUE = new Clause() {
-        public boolean eval(byte[] data) {
-            return true;
-        }
-        public int size() {
-            return 0;
-        }
-        public String toString() {
-            return "TRUE";
-        }
-    };
-
-    /** A clause that is always false. */
-    Clause FALSE = new Clause() {
-        public boolean eval(byte[] data) {
-            return false;
-        }
-        public int size() {
-            return 0;
-        }
-        public String toString() {
-            return "FALSE";
-        }
-    };
-
 }

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=883308&r1=883307&r2=883308&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
 (original)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
 Mon Nov 23 11:40:22 2009
@@ -29,6 +29,9 @@
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
@@ -172,42 +175,37 @@
      */
     private void readMagic(Element element, MimeType mimeType)
             throws MimeTypeException {
-        Magic magic = new Magic(mimeType);
-
-        String priority = element.getAttribute(MAGIC_PRIORITY_ATTR);
-        if (priority != null && priority.length() > 0) {
-            magic.setPriority(Integer.parseInt(priority));
+        int priority = 50;
+        String value = element.getAttribute(MAGIC_PRIORITY_ATTR);
+        if (value != null && value.length() > 0) {
+            priority = Integer.parseInt(value);
         }
 
-        magic.setClause(readMatches(element));
-
-        mimeType.addMagic(magic);
+        for (Clause clause : readMatches(element)) {
+            Magic magic = new Magic(mimeType);
+            magic.setPriority(priority);
+            magic.setClause(clause);
+            mimeType.addMagic(magic);
+        }
     }
 
-    private Clause readMatches(Element element) throws MimeTypeException {
-        Clause prev = Clause.FALSE;
-        Clause clause = null;
+    private List<Clause> readMatches(Element element) throws MimeTypeException 
{
+        List<Clause> clauses = new ArrayList<Clause>();
         NodeList nodes = element.getChildNodes();
         for (int i = 0; i < nodes.getLength(); i++) {
             Node node = nodes.item(i);
             if (node.getNodeType() == Node.ELEMENT_NODE) {
                 Element nodeElement = (Element) node;
                 if (nodeElement.getTagName().equals(MATCH_TAG)) {
-                    clause = readMatch(nodeElement);
-                    Clause sub = readMatches(nodeElement);
-                    if (sub != null) {
-                        clause = new MagicClause(Operator.AND, clause, sub);
-                    }
-                    clause = new MagicClause(Operator.OR, prev, clause);
-                    prev = clause;
+                    clauses.add(readMatch(nodeElement));
                 }
             }
         }
-        return clause;
+        return clauses;
     }
 
     /** Read Element named match. */
-    private MagicMatch readMatch(Element element) throws MimeTypeException {
+    private Clause readMatch(Element element) throws MimeTypeException {
         String type = "string";
         int start = 0;
         int end = 0;
@@ -253,7 +251,16 @@
 
         MagicDetector detector = new MagicDetector(
                 MediaType.TEXT_PLAIN, patternBytes, maskBytes, start, end);
-        return new MagicMatch(detector, length);
+        Clause clause = new MagicMatch(detector, length);
+
+        List<Clause> subClauses = readMatches(element);
+        if (subClauses.size() == 0) {
+            return clause;
+        } else if (subClauses.size() == 1) {
+            return new AndClause(clause, subClauses.get(0));
+        } else {
+            return new AndClause(clause, new OrClause(subClauses));
+        }
     }
 
     private byte[] decodeValue(String type, String value)

Added: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java?rev=883308&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java 
(added)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java 
Mon Nov 23 11:40:22 2009
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import java.util.List;
+
+class OrClause implements Clause { // disjunction: true if at least one operand clause is true
+
+    private final List<Clause> clauses; // operand clauses; the list is stored as passed, not copied
+
+    OrClause(List<Clause> clauses) {
+        this.clauses = clauses;
+    }
+
+    public boolean eval(byte[] data) {
+        for (Clause clause : clauses) {
+            if (clause.eval(data)) {
+                return true; // short-circuit: later operands are not evaluated
+            }
+        }
+        return false; // no operand accepted data (also the result for an empty list)
+    }
+
+    public int size() {
+        int size = 0;
+        for (Clause clause : clauses) {
+            size = Math.max(size, clause.size()); // the size of a disjunction is its largest operand size
+        }
+        return size;
+    }
+
+    public String toString() {
+        return "or" + clauses; // "or" followed by the list's own toString()
+    }
+
+}

Propchange: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to