Author: jukka
Date: Mon Nov 23 11:40:22 2009
New Revision: 883308
URL: http://svn.apache.org/viewvc?rev=883308&view=rev
Log:
TIKA-321: Optimize type detection speed
Refactor to reduce the number of Clause objects that type detection needs to go
through
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java
(with props)
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java
(with props)
Removed:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicClause.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Operator.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Clause.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java?rev=883308&view=auto
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java
(added)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java
Mon Nov 23 11:40:22 2009
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import java.util.Arrays;
+
+/** Conjunction of clauses: matches only when every nested clause matches. */
+class AndClause implements Clause {
+    private final Clause[] clauses;
+
+    AndClause(Clause... clauses) {
+        this.clauses = clauses;
+    }
+
+    /** Evaluates the nested clauses in order, stopping at the first failure. */
+    public boolean eval(byte[] data) {
+        boolean matched = true;
+        for (int i = 0; matched && i < clauses.length; i++) {
+            matched = clauses[i].eval(data);
+        }
+        return matched;
+    }
+
+    /** Returns the sum of the sizes reported by the nested clauses. */
+    public int size() {
+        int total = 0;
+        for (int i = 0; i < clauses.length; i++) {
+            total += clauses[i].size();
+        }
+        return total;
+    }
+
+    public String toString() {
+        return new StringBuilder("and").append(Arrays.toString(clauses)).toString();
+    }
+}
Propchange:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/AndClause.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Clause.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Clause.java?rev=883308&r1=883307&r2=883308&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Clause.java
(original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Clause.java
Mon Nov 23 11:40:22 2009
@@ -32,30 +32,4 @@
*/
int size();
- /** A clause that is always true. */
- Clause TRUE = new Clause() {
- public boolean eval(byte[] data) {
- return true;
- }
- public int size() {
- return 0;
- }
- public String toString() {
- return "TRUE";
- }
- };
-
- /** A clause that is always false. */
- Clause FALSE = new Clause() {
- public boolean eval(byte[] data) {
- return false;
- }
- public int size() {
- return 0;
- }
- public String toString() {
- return "FALSE";
- }
- };
-
}
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=883308&r1=883307&r2=883308&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
Mon Nov 23 11:40:22 2009
@@ -29,6 +29,9 @@
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
@@ -172,42 +175,37 @@
*/
private void readMagic(Element element, MimeType mimeType)
throws MimeTypeException {
- Magic magic = new Magic(mimeType);
-
- String priority = element.getAttribute(MAGIC_PRIORITY_ATTR);
- if (priority != null && priority.length() > 0) {
- magic.setPriority(Integer.parseInt(priority));
+ int priority = 50;
+ String value = element.getAttribute(MAGIC_PRIORITY_ATTR);
+ if (value != null && value.length() > 0) {
+ priority = Integer.parseInt(value);
}
- magic.setClause(readMatches(element));
-
- mimeType.addMagic(magic);
+ for (Clause clause : readMatches(element)) {
+ Magic magic = new Magic(mimeType);
+ magic.setPriority(priority);
+ magic.setClause(clause);
+ mimeType.addMagic(magic);
+ }
}
- private Clause readMatches(Element element) throws MimeTypeException {
- Clause prev = Clause.FALSE;
- Clause clause = null;
+ private List<Clause> readMatches(Element element) throws MimeTypeException
{
+ List<Clause> clauses = new ArrayList<Clause>();
NodeList nodes = element.getChildNodes();
for (int i = 0; i < nodes.getLength(); i++) {
Node node = nodes.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
Element nodeElement = (Element) node;
if (nodeElement.getTagName().equals(MATCH_TAG)) {
- clause = readMatch(nodeElement);
- Clause sub = readMatches(nodeElement);
- if (sub != null) {
- clause = new MagicClause(Operator.AND, clause, sub);
- }
- clause = new MagicClause(Operator.OR, prev, clause);
- prev = clause;
+ clauses.add(readMatch(nodeElement));
}
}
}
- return clause;
+ return clauses;
}
/** Read Element named match. */
- private MagicMatch readMatch(Element element) throws MimeTypeException {
+ private Clause readMatch(Element element) throws MimeTypeException {
String type = "string";
int start = 0;
int end = 0;
@@ -253,7 +251,16 @@
MagicDetector detector = new MagicDetector(
MediaType.TEXT_PLAIN, patternBytes, maskBytes, start, end);
- return new MagicMatch(detector, length);
+ Clause clause = new MagicMatch(detector, length);
+
+ List<Clause> subClauses = readMatches(element);
+ if (subClauses.size() == 0) {
+ return clause;
+ } else if (subClauses.size() == 1) {
+ return new AndClause(clause, subClauses.get(0));
+ } else {
+ return new AndClause(clause, new OrClause(subClauses));
+ }
}
private byte[] decodeValue(String type, String value)
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java?rev=883308&view=auto
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java
(added)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java
Mon Nov 23 11:40:22 2009
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import java.util.List;
+
+/** Disjunction of clauses: matches when at least one nested clause matches. */
+class OrClause implements Clause {
+    private final List<Clause> clauses;
+    OrClause(List<Clause> clauses) {
+        this.clauses = clauses;
+    }
+
+    /** Evaluates the nested clauses in order, stopping at the first match. */
+    public boolean eval(byte[] data) {
+        boolean matched = false;
+        java.util.Iterator<Clause> remaining = clauses.iterator();
+        while (!matched && remaining.hasNext()) {
+            matched = remaining.next().eval(data);
+        }
+        return matched;
+    }
+
+    /** Returns the largest size reported by a nested clause; never less than 0. */
+    public int size() {
+        int largest = 0;
+        for (Clause alternative : clauses) {
+            largest = Math.max(alternative.size(), largest);
+        }
+        return largest;
+    }
+
+    public String toString() {
+        return new StringBuilder("or").append(clauses).toString();
+    }
+}
Propchange:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/OrClause.java
------------------------------------------------------------------------------
svn:eol-style = native