tik...

bob Tue, 05 Jan 2016 19:51:54 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * This implementation of {@link org.apache.tika.parser.Parser} extracts
+ * entity names from text content and adds them to the metadata.
+ * <p>All the metadata keys will have a common prefix {@value #MD_KEY_PREFIX}</p>
+ * <p>The Named Entity recogniser implementation can be changed by setting the
+ * system property {@value #SYS_PROP_NER_IMPL} value to a name of class that
+ * implements {@link NERecogniser} contract. Several implementations can be
+ * chained by listing their class names separated by commas.</p>
+ * @see OpenNLPNERecogniser
+ * @see NERecogniser
+ */
+public class NamedEntityParser extends AbstractParser {
+
+    public static final Logger LOG = LoggerFactory.getLogger(NamedEntityParser.class);
+    public static final Set<MediaType> MEDIA_TYPES = new HashSet<>();
+    /** Prefix prepended to the entity type to build the metadata key. */
+    public static final String MD_KEY_PREFIX = "NER_";
+    /** Recogniser class used when {@link #SYS_PROP_NER_IMPL} is not set. */
+    public static final String DEFAULT_NER_IMPL = OpenNLPNERecogniser.class.getName();
+    /** System property holding the comma separated recogniser class names. */
+    public static final String SYS_PROP_NER_IMPL = "ner.impl.class";
+
+    /** Used to extract text when the content type is not plain text. */
+    public Tika secondaryParser;
+
+    static {
+        MEDIA_TYPES.add(MediaType.TEXT_PLAIN);
+    }
+
+    private List<NERecogniser> nerChain;
+    private volatile boolean initialized = false;
+    private volatile boolean available = false;
+
+    /**
+     * Lazily builds the recogniser chain and the secondary parser.
+     * <p>{@code initialized} is published only after {@code nerChain},
+     * {@code secondaryParser} and {@code available} have been written, so a
+     * thread that observes {@code initialized == true} in
+     * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
+     * never sees a half built parser.</p>
+     *
+     * @param context the parse context (currently unused)
+     */
+    private synchronized void initialize(ParseContext context) {
+        if (initialized) {
+            return;
+        }
+        try {
+            //TODO: read class name from context or config
+            //There can be multiple classes in the form of comma separated class names;
+            String classNamesString = System.getProperty(SYS_PROP_NER_IMPL,
+                    DEFAULT_NER_IMPL);
+            String[] classNames = classNamesString.split(",");
+            List<NERecogniser> chain = new ArrayList<>(classNames.length);
+            for (String className : classNames) {
+                className = className.trim();
+                if (className.isEmpty()) {
+                    // tolerate stray commas such as "a.B,,c.D" or "a.B,"
+                    continue;
+                }
+                LOG.info("going to load, instantiate and bind the instance of {}",
+                        className);
+                try {
+                    NERecogniser recogniser = (NERecogniser) Class.forName(className)
+                            .getDeclaredConstructor().newInstance();
+                    boolean usable = recogniser.isAvailable();
+                    LOG.info("{} is available ? {}", className, usable);
+                    if (usable) {
+                        chain.add(recogniser);
+                    }
+                } catch (Exception e) {
+                    // a broken recogniser must not prevent the others from loading
+                    LOG.error(e.getMessage(), e);
+                }
+            }
+            this.nerChain = chain;
+
+            TikaConfig config = new TikaConfig();
+            this.secondaryParser = new Tika(config);
+            this.available = !chain.isEmpty();
+            LOG.info("Number of NERecognisers in chain {}", chain.size());
+        } catch (Exception e) {
+            LOG.error(e.getMessage(), e);
+            this.available = false;
+        } finally {
+            // publish last: initialization is attempted exactly once
+            initialized = true;
+        }
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+        return MEDIA_TYPES;
+    }
+
+    /**
+     * Reads the whole text of the stream, runs every recogniser of the chain
+     * on it and adds each recognised name to {@code metadata} under the key
+     * {@link #MD_KEY_PREFIX} + entity type. Returns without doing anything
+     * when no recogniser is available.
+     */
+    @Override
+    public void parse(InputStream inputStream, ContentHandler contentHandler,
+                      Metadata metadata, ParseContext parseContext)
+            throws IOException, SAXException, TikaException {
+
+        if (!initialized) {
+            initialize(parseContext);
+        }
+        if (!available) {
+            return;
+        }
+
+        Reader reader = MediaType.TEXT_PLAIN.toString()
+                .equals(metadata.get(Metadata.CONTENT_TYPE))
+                ? new InputStreamReader(inputStream, StandardCharsets.UTF_8)
+                : secondaryParser.parse(inputStream);
+
+        String text;
+        try {
+            text = IOUtils.toString(reader);
+        } finally {
+            // release the reader even when reading fails
+            IOUtils.closeQuietly(reader);
+        }
+
+        for (NERecogniser ner : nerChain) {
+            Map<String, Set<String>> names = ner.recognise(text);
+            if (names != null) {
+                for (Map.Entry<String, Set<String>> entry : names.entrySet()) {
+                    if (entry.getValue() != null) {
+                        String mdKey = MD_KEY_PREFIX + entry.getKey();
+                        for (String name : entry.getValue()) {
+                            metadata.add(mdKey, name);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.corenlp;
+
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+/**
+ *  This class offers an implementation of {@link NERecogniser} based on
+ *  CRF classifiers from Stanford CoreNLP. This NER requires additional setup,
+ *  due to runtime binding to Stanford CoreNLP.
+ *  See <a href="http://wiki.apache.org/tika/TikaAndNER#CoreNLP">
+ *      Tika NER Wiki</a> for configuring this recogniser.
+ *  @see NERecogniser
+ *
+ */
+public class CoreNLPNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(CoreNLPNERecogniser.class);
+
+    //default model paths
+    public static final String NER_3CLASS_MODEL = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
+    public static final String NER_4CLASS_MODEL = "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz";
+    public static final String NER_7CLASS_MODEL = "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz";
+    /**
+     * default Model path
+     */
+    public static final String DEFAULT_MODEL_PATH = NER_7CLASS_MODEL;
+    /** System property that overrides {@link #DEFAULT_MODEL_PATH}. */
+    public static final String MODEL_PROP_NAME = "ner.corenlp.model";
+
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
+        add(PERSON);
+        add(TIME);
+        add(LOCATION);
+        add(ORGANIZATION);
+        add(MONEY);
+        add(PERCENT);
+        add(DATE);
+    }};
+    private static final String CLASSIFIER_CLASS_NAME = "edu.stanford.nlp.ie.crf.CRFClassifier";
+
+    private boolean available = false;
+    // reflective handles on the public fields of edu.stanford.nlp.util.Triple
+    private Field firstField;
+    private Field secondField;
+    private Field thirdField;
+    private Object classifierInstance;
+    private Method classifyMethod;
+
+    /**
+     * Creates a NERecogniser using the model named by the system property
+     * {@value #MODEL_PROP_NAME}, or {@link #DEFAULT_MODEL_PATH} when unset.
+     */
+    public CoreNLPNERecogniser(){
+        this(System.getProperty(MODEL_PROP_NAME, DEFAULT_MODEL_PATH));
+    }
+
+    /**
+     * Creates a NERecogniser by loading model from given path.
+     * The classifier is bound via reflection; any failure leaves this
+     * recogniser unavailable instead of throwing.
+     * @param modelPath path to NER model file
+     */
+    public CoreNLPNERecogniser(String modelPath) {
+        try {
+            Properties props = new Properties();
+            Class<?> classifierClass = Class.forName(CLASSIFIER_CLASS_NAME);
+            Method loadMethod = classifierClass.getMethod("getClassifier",
+                    String.class, Properties.class);
+            // getClassifier is looked up as a static factory, so no receiver is needed
+            classifierInstance = loadMethod.invoke(null, modelPath, props);
+            classifyMethod = classifierClass.getMethod("classifyToCharacterOffsets",
+                    String.class);
+
+            //these fields are for accessing result
+            Class<?> tripleClass = Class.forName("edu.stanford.nlp.util.Triple");
+            this.firstField = tripleClass.getField("first");
+            this.secondField = tripleClass.getField("second");
+            this.thirdField = tripleClass.getField("third");
+            this.available = true;
+        } catch (Exception e) {
+            // Reflection wraps the real failure (e.g. an unreadable model) in an
+            // exception whose own message is null, so report the underlying cause.
+            Throwable cause = e.getCause() != null ? e.getCause() : e;
+            LOG.warn("{} while trying to load the model from {}",
+                    cause.toString(), modelPath);
+        }
+        LOG.info("Available for service ? {}", available);
+    }
+
+    /**
+     *
+     * @return {@code true} if model was available, valid and was able to initialise the classifier.
+     * returns {@code false} when this recogniser is not available for service.
+     */
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of names; empty when this recogniser
+     * is not available or the classification fails
+     */
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> names = new HashMap<>();
+        if (!available) {
+            // classifier was never bound; nothing to invoke
+            return names;
+        }
+        try {
+            Object result = classifyMethod.invoke(classifierInstance, text);
+            List<?> entries = (List<?>) result;
+            for (Object entry : entries) {
+                String entityType = (String) firstField.get(entry);
+                if (!names.containsKey(entityType)) {
+                    names.put(entityType, new HashSet<String>());
+                }
+                Integer start = (Integer) secondField.get(entry);
+                Integer end = (Integer) thirdField.get(entry);
+                String name = text.substring(start, end);
+                //Collapse every run of whitespace (spaces, tabs and any kind of
+                //line break, including a lone carriage return) to a single space
+                name = name.trim().replaceAll("\\s+", " ");
+                if (!name.isEmpty()) {
+                    names.get(entityType).add(name);
+                }
+            }
+
+        } catch (Exception e) {
+            LOG.debug(e.getMessage(), e);
+        }
+        return names;
+    }
+
+    /**
+     * Command line entry point: prints, as JSON, the names found in the
+     * text file given as the only argument.
+     */
+    public static void main(String[] args) throws IOException {
+        if (args.length != 1) {
+            System.err.println("Error: Invalid Args");
+            System.err.println("This tool finds names inside text");
+            System.err.println("Usage: <path/to/text/file>");
+            return;
+        }
+
+        try (FileInputStream stream = new FileInputStream(args[0])) {
+            String text = IOUtils.toString(stream);
+            CoreNLPNERecogniser ner = new CoreNLPNERecogniser();
+            Map<String, Set<String>> names = ner.recognise(text);
+            JSONObject jNames = new JSONObject(names);
+            System.out.println(jNames.toString(2));
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.opennlp;
+
+import org.apache.tika.parser.ner.NERecogniser;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+
+/**
+ *
+ * This implementation of {@link NERecogniser} chains an array of
+ * {@link OpenNLPNameFinder}s for which NER models are
+ * available in classpath.
+ *
+ * The following models are scanned during initialization via class loader:
+ *
+ * <table>
+ *     <tr>
+ *         <th>Entity Type</th><th>Path</th>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value PERSON}</td><td> {@value PERSON_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value LOCATION}</td><td>{@value LOCATION_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value ORGANIZATION}</td><td>{@value ORGANIZATION_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value TIME}</td><td>{@value TIME_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value DATE}</td><td>{@value DATE_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value PERCENT}</td><td>{@value PERCENT_FILE}</td>
+ *     </tr>
+ *     <tr>
+ *         <td>{@value MONEY}</td><td>{@value MONEY_FILE}</td>
+ *     </tr>
+ * </table>
+ *
+ * @see org.apache.tika.parser.ner.NamedEntityParser#DEFAULT_NER_IMPL
+ */
+public class OpenNLPNERecogniser implements NERecogniser {
+
+    /** Class path directory of the models: this class's package name with '/' separators. */
+    public static final String MODELS_DIR = OpenNLPNERecogniser.class
+            .getPackage().getName().replace(".", "/");
+    public static final String PERSON_FILE = "ner-person.bin";
+    public static final String LOCATION_FILE = "ner-location.bin";
+    public static final String ORGANIZATION_FILE = "ner-organization.bin";
+    public static final String TIME_FILE = "ner-time.bin";
+    public static final String DATE_FILE = "ner-date.bin";
+    public static final String PERCENT_FILE = "ner-percentage.bin";
+    public static final String MONEY_FILE = "ner-money.bin";
+
+
+    //Default (English) Models for the common 7 classes of named types
+    public static final String NER_PERSON_MODEL = MODELS_DIR + "/" + PERSON_FILE;
+    public static final String NER_LOCATION_MODEL = MODELS_DIR + "/" + LOCATION_FILE;
+    public static final String NER_ORGANIZATION_MODEL = MODELS_DIR + "/" + ORGANIZATION_FILE;
+    public static final String NER_TIME_MODEL = MODELS_DIR + "/" + TIME_FILE;
+    public static final String NER_DATE_MODEL = MODELS_DIR + "/" + DATE_FILE;
+    public static final String NER_PERCENT_MODEL = MODELS_DIR + "/" + PERCENT_FILE;
+    public static final String NER_MONEY_MODEL = MODELS_DIR + "/" + MONEY_FILE;
+
+    /** Entity type -> class path location of its default model. */
+    public static final Map<String, String> DEFAULT_MODELS =
+            new HashMap<String, String>(){{
+                put(PERSON, NER_PERSON_MODEL);
+                put(LOCATION, NER_LOCATION_MODEL);
+                put(ORGANIZATION, NER_ORGANIZATION_MODEL);
+                put(TIME, NER_TIME_MODEL);
+                put(DATE, NER_DATE_MODEL);
+                put(PERCENT, NER_PERCENT_MODEL);
+                put(MONEY, NER_MONEY_MODEL);
+            }};
+
+    private Set<String> entityTypes;
+    private List<OpenNLPNameFinder> nameFinders;
+    private boolean available;
+
+    /**
+     * Builds the chain from {@link #DEFAULT_MODELS}.
+     */
+    public OpenNLPNERecogniser(){
+        this(DEFAULT_MODELS);
+    }
+
+    /**
+     * Builds one name finder per entry of {@code models} and keeps those
+     * that report themselves available.
+     * @param models map of entityType -> model path
+     * NOTE: the model path should be known to class loader.
+     */
+    public OpenNLPNERecogniser(Map<String, String> models){
+        List<OpenNLPNameFinder> usableFinders = new ArrayList<>();
+        Set<String> supportedTypes = new HashSet<>();
+        for (Map.Entry<String, String> model : models.entrySet()) {
+            String type = model.getKey();
+            OpenNLPNameFinder candidate = new OpenNLPNameFinder(type, model.getValue());
+            if (!candidate.isAvailable()) {
+                continue; // model could not be loaded, leave it out of the chain
+            }
+            usableFinders.add(candidate);
+            supportedTypes.add(type);
+        }
+        this.nameFinders = usableFinders;
+        this.entityTypes = Collections.unmodifiableSet(supportedTypes);
+        //at least one finder is present
+        this.available = !usableFinders.isEmpty();
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    @Override
+    public Set<String> getEntityTypes() {
+        return entityTypes;
+    }
+
+    /**
+     * Tokenizes the text once and merges the results of every name finder
+     * of the chain into a single map.
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of names
+     */
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        String[] words = OpenNLPNameFinder.tokenize(text);
+        Map<String, Set<String>> merged = new HashMap<>();
+        for (OpenNLPNameFinder finder : nameFinders) {
+            Map<String, Set<String>> found = finder.findNames(words);
+            merged.putAll(found);
+        }
+        return merged;
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.opennlp;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.Span;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * An implementation of {@link NERecogniser} that finds names in text using
+ * an OpenNLP model. This implementation works with only one entity type.
+ * To chain several name finder instances, see {@link OpenNLPNERecogniser}.
+ * <p>The wrapped {@link NameFinderME} is shared by all callers, so access to
+ * it is serialised on this instance.</p>
+ */
+public class OpenNLPNameFinder implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(OpenNLPNameFinder.class);
+    private final String nameType;
+    private final Set<String> nameTypes;
+    private NameFinderME nameFinder;
+    private boolean available;
+
+    /**
+     * Creates OpenNLP name finder. When the model cannot be found or read,
+     * the instance is created but reports itself as not available.
+     * @param nameType the entity type recognised by the given NER model
+     * @param nerModelPath path to ner model, resolved through the class loader
+     */
+    public OpenNLPNameFinder(String nameType, String nerModelPath) {
+        this.nameTypes = Collections.singleton(nameType);
+        this.nameType = nameType;
+        InputStream nerModelStream =
+                getClass().getClassLoader().getResourceAsStream(nerModelPath);
+        try {
+            if (nerModelStream != null){
+                TokenNameFinderModel model = new TokenNameFinderModel(nerModelStream);
+                this.nameFinder = new NameFinderME(model);
+                this.available = true;
+            } else {
+                LOG.warn("Couldn't find model from {} using class loader", nerModelPath);
+            }
+        } catch (IOException e) {
+            LOG.error(e.getMessage(), e);
+        } finally {
+            IOUtils.closeQuietly(nerModelStream);
+        }
+        LOG.info("{} NER : Available for service ? {}", nameType, available);
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    @Override
+    public Set<String> getEntityTypes() {
+        return nameTypes;
+    }
+
+    /**
+     * Splits the text on runs of whitespace after trimming it.
+     * @param text the text to split
+     * @return the tokens; a single empty token for blank text
+     */
+    public static String[] tokenize(String text){
+        //NOTE: replace this with a NLP tokenizer tool
+        //clean + split in one pass: splitting on whitespace runs yields the same
+        //tokens as collapsing the runs first and then splitting on one whitespace
+        return text.trim().split("\\s+");
+    }
+
+    @Override
+    public synchronized Map<String, Set<String>> recognise(String text) {
+        String[] tokens = tokenize(text);
+        return findNames(tokens);
+    }
+
+    /**
+     * finds names from given array of tokens.
+     * Synchronized because this method is also called directly (not only via
+     * {@link #recognise(String)}) and the underlying name finder keeps
+     * per-call adaptive state.
+     * @param tokens the tokens array
+     * @return map of EntityType -> set of entity names; empty when nothing is
+     * found or when the model is not available
+     */
+    public synchronized Map<String, Set<String>> findNames(String[] tokens) {
+        Map<String, Set<String>> result = new HashMap<>();
+        if (!available) {
+            // model was never loaded, so there is no finder to query
+            return result;
+        }
+        try {
+            Span[] nameSpans = nameFinder.find(tokens);
+            String[] names = Span.spansToStrings(nameSpans, tokens);
+            if (names != null && names.length > 0) {
+                result.put(nameType, new HashSet<>(Arrays.asList(names)));
+            }
+        } finally {
+            // always reset, so a failed call cannot influence the next document
+            nameFinder.clearAdaptiveData();
+        }
+        return result;
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.regex;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * This class offers an implementation of {@link NERecogniser} based on
+ * Regular Expressions.
+ *<p>
+ * The default configuration file {@value NER_REGEX_FILE} is used when no
+ * argument constructor is used to instantiate this class. The regex file is
+ * loaded via {@link Class#getResourceAsStream(String)}, so the file should be
+ * placed in the same package path as of this class.
+ * </p>
+ * The format of regex configuration as follows:
+ * <pre>
+ * ENTITY_TYPE1=REGEX1
+ * ENTITY_TYPE2=REGEX2
+ * </pre>
+ *
+ * <i>For example, to extract week day from text:</i>
+ * <pre>WEEK_DAY=(?i)((sun)|(mon)|(tues)|(wednes)|(thurs)|(fri)|((sat)(ur)?))(day)?
+ * </pre>
+ * @since Nov. 7, 2015
+ */
+public class RegexNERecogniser implements NERecogniser {
+
+    public static final String NER_REGEX_FILE = "ner-regex.txt";
+    private static final Logger LOG = LoggerFactory.getLogger(RegexNERecogniser.class);
+
+    public Set<String> entityTypes = new HashSet<>();
+    public Map<String, Pattern> patterns;
+    private boolean available = false;
+
+    private static RegexNERecogniser INSTANCE;
+
+    /**
+     * Creates a recogniser from the {@value #NER_REGEX_FILE} resource located
+     * next to this class.
+     */
+    public RegexNERecogniser(){
+        this(RegexNERecogniser.class.getResourceAsStream(NER_REGEX_FILE));
+    }
+
+    /**
+     * Creates a recogniser from a configuration stream of
+     * {@code ENTITY_TYPE=REGEX} lines. Blank lines and lines starting with
+     * {@code #} are ignored; lines without {@code =} or with an invalid
+     * regular expression are logged and skipped. The stream is always closed.
+     * @param stream the configuration, read as UTF-8; when {@code null} the
+     * recogniser is created but is not available
+     */
+    public RegexNERecogniser(InputStream stream){
+        patterns = new HashMap<>();
+        if (stream == null) {
+            LOG.error("Regex configuration stream is not available");
+        } else {
+            try {
+                List<String> lines = IOUtils.readLines(stream, StandardCharsets.UTF_8);
+                for (String line : lines) {
+                    line = line.trim();
+                    if (line.isEmpty() || line.startsWith("#")){ //empty or comment
+                        continue;                                //skip
+                    }
+
+                    int delim = line.indexOf('=');
+                    if (delim < 0) { //delim not found
+                        //skip
+                        LOG.error("Skipped : Invalid config : {} ", line);
+                        continue;
+                    }
+                    String type = line.substring(0, delim).trim();
+                    String patternStr = line.substring(delim + 1).trim();
+                    try {
+                        patterns.put(type, Pattern.compile(patternStr));
+                        entityTypes.add(type);
+                    } catch (IllegalArgumentException e) {
+                        // one malformed regex must not discard the remaining entries
+                        LOG.error("Skipped : Invalid pattern : {} ", line, e);
+                    }
+                }
+            } catch (Exception e) {
+                LOG.error(e.getMessage(), e);
+            } finally {
+                IOUtils.closeQuietly(stream);
+            }
+        }
+        available = !entityTypes.isEmpty();
+    }
+
+    /**
+     * @return the shared instance built from the default configuration,
+     * created on first use
+     */
+    public synchronized static RegexNERecogniser getInstance() {
+        if (INSTANCE == null) {
+            INSTANCE = new RegexNERecogniser();
+        }
+        return INSTANCE;
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    @Override
+    public Set<String> getEntityTypes() {
+        return entityTypes;
+    }
+
+    /**
+     * finds matching sub groups in text
+     * @param text text containing interesting sub strings
+     * @param pattern pattern to find sub strings
+     * @return set of sub strings if any found, or null if none found
+     */
+    public Set<String> findMatches(String text, Pattern pattern){
+        Set<String> results = null;
+        Matcher matcher = pattern.matcher(text);
+        while (matcher.find()) {
+            if (results == null) {
+                // allocate lazily so that "no match" is reported as null
+                results = new HashSet<>();
+            }
+            results.add(matcher.group(0));
+        }
+        return results;
+    }
+
+    /**
+     * Applies every configured pattern to the text.
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of matches; types without any match
+     * are absent from the map
+     */
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> result = new HashMap<>();
+        for (Map.Entry<String, Pattern> entry : patterns.entrySet()) {
+            Set<String> names = findMatches(text, entry.getValue());
+            if (names != null) {
+                result.put(entry.getKey(), names);
+            }
+        }
+        return result;
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,18 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+org.apache.tika.parser.crypto.Pkcs7Parser
+#org.apache.tika.parser.ner.NamedEntityParser

Added: 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ 
+
+# Each entry has the following form:
+# type = regex
+# The first occurrence of '=' separates the type from its regex.
+
+# WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
\ No newline at end of file

Added: 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.crypto;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test for {@link Pkcs7Parser} when it is handed a detached PKCS#7 signature.
+ */
+public class Pkcs7ParserTest extends TikaTest {
+
+    /**
+     * Parses a detached signature file. The parser may either succeed or
+     * reject the input with a {@link TikaException} that mentions the detached
+     * signature; a {@link NullPointerException} is a failure.
+     *
+     * @throws Exception on any unexpected error while parsing
+     */
+    @Test
+    public void testDetachedSignature() throws Exception {
+        try (InputStream input = Pkcs7ParserTest.class.getResourceAsStream(
+                "/test-documents/testDetached.p7s")) {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new Pkcs7Parser().parse(input, handler, metadata, new ParseContext());
+        } catch (NullPointerException npe) {
+            fail("should not get NPE");
+        } catch (TikaException te) {
+            assertTrue(te.toString().contains("cannot parse detached pkcs7 signature"));
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.apache.tika.parser.ner.regex.RegexNERecogniser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test case for {@link NamedEntityParser}
+ */
+public class NamedEntityParserTest {
+
+    public static final String CONFIG_FILE = "tika-config.xml";
+
+    /** Fixed charset for the sample text, so the bytes do not depend on the platform default. */
+    private static final Charset UTF_8 = Charset.forName("UTF-8");
+
+    /**
+     * Parses a short passage and checks that the parser is recorded in
+     * {@code X-Parsed-By} and that the expected person, location, organization
+     * and date values are reported.
+     */
+    @Test
+    public void testParse() throws Exception {
+        Metadata md = parseText(
+                "I am student at University of Southern California (USC)," +
+                " located in Los Angeles . USC's football team is called by name Trojans." +
+                " Mr. John McKay was a head coach of the team from 1960 - 1975");
+
+        assertTrue(valuesOf(md, "X-Parsed-By").contains(NamedEntityParser.class.getName()));
+        assertTrue(valuesOf(md, "NER_PERSON").contains("John McKay"));
+        assertTrue(valuesOf(md, "NER_LOCATION").contains("Los Angeles"));
+        assertTrue(valuesOf(md, "NER_ORGANIZATION").contains("University of Southern California"));
+        assertTrue(valuesOf(md, "NER_DATE").contains("1960 - 1975"));
+    }
+
+    /**
+     * Configures two recognisers through the system property and checks that
+     * entity types from both show up in the metadata. The property is JVM-wide,
+     * so its previous value is restored afterwards to keep other tests unaffected.
+     */
+    @Test
+    public void testNerChain() throws Exception {
+        String classNames = OpenNLPNERecogniser.class.getName()
+                + "," + RegexNERecogniser.class.getName();
+        String previous = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames);
+        try {
+            Metadata md = parseText(
+                    "University of Southern California (USC), is located in Los Angeles ." +
+                    " Campus is busy from monday to saturday");
+            HashSet<String> keys = new HashSet<>(Arrays.asList(md.names()));
+            assertTrue(keys.contains("NER_WEEK_DAY"));
+            assertTrue(keys.contains("NER_LOCATION"));
+        } finally {
+            if (previous == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previous);
+            }
+        }
+    }
+
+    /**
+     * Runs the text through a Tika instance built from the test config.
+     *
+     * @param text the text to parse
+     * @return the metadata filled in by the parse
+     */
+    private Metadata parseText(String text) throws Exception {
+        //test config is added to resources directory
+        TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
+        Tika tika = new Tika(config);
+        Metadata md = new Metadata();
+        tika.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), md);
+        return md;
+    }
+
+    /**
+     * Collects all values stored under a metadata key.
+     *
+     * @param md the metadata to read
+     * @param key the metadata key
+     * @return the distinct values of that key (empty if the key is absent)
+     */
+    private static HashSet<String> valuesOf(Metadata md, String key) {
+        return new HashSet<String>(Arrays.asList(md.getValues(key)));
+    }
+}
\ No newline at end of file

Added: 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.regex;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test case for {@link RegexNERecogniser}, driven through {@link NamedEntityParser}.
+ */
+public class RegexNERecogniserTest {
+
+    /**
+     * Selects the regex recogniser through the system property, parses a
+     * sentence naming three week days and checks that exactly those three are
+     * reported under {@code NER_WEEK_DAY}. The property is JVM-wide, so its
+     * previous value is restored afterwards to keep other tests unaffected.
+     */
+    @Test
+    public void testGetEntityTypes() throws Exception {
+
+        String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday";
+        String previous = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName());
+        try {
+            Tika tika = new Tika(new TikaConfig(
+                    NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+            Set<String> days = new HashSet<>(Arrays.asList(md.getValues("NER_WEEK_DAY")));
+            assertTrue(days.contains("Sunday"));
+            assertTrue(days.contains("MONDAY"));
+            assertTrue(days.contains("Saturday"));
+            assertTrue(days.size() == 3); //and nothing else
+        } finally {
+            if (previous == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previous);
+            }
+        }
+    }
+}
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-cad-module/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-cad-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-cad-module/pom.xml Wed Jan  6 
03:50:50 2016
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more 
contributor 
+  license agreements. See the NOTICE file distributed with this work for 
additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-cad-module</artifactId>
+  <name>Apache Tika CAD Module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi</artifactId>
+      <version>${poi.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file

Added: 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,356 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.StringUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * DWG (CAD Drawing) parser. This is a very basic parser, which just
+ *  looks for bits of the headers.
+ * Note that we use Apache POI for various parts of the processing, as
+ *  lots of the low level string/int/short concepts are the same.
+ */
+public class DWGParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -7744232583079169119L;
+
+    /** The single media type handled by this parser, {@code image/vnd.dwg}. */
+    private static final MediaType TYPE = MediaType.image("vnd.dwg");
+
+    /**
+     * Returns the media types this parser handles.
+     *
+     * @param context the parse context (not consulted)
+     * @return an immutable single-element set holding the DWG media type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    /** The order of the fields in the header */
+    private static final Property[] HEADER_PROPERTIES_ENTRIES = {
+        TikaCoreProperties.TITLE, 
+        TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+        TikaCoreProperties.CREATOR,
+        TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,
+        TikaCoreProperties.COMMENTS,
+        TikaCoreProperties.MODIFIER,
+        null, // Unknown?
+        TikaCoreProperties.RELATION, // Hyperlink
+    };
+
+    /** For the 2000 file, they're indexed */
+    private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
+       null, 
+       TikaCoreProperties.RELATION, // 0x01
+       TikaCoreProperties.TITLE,    // 0x02
+       TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,  // 0x03
+       TikaCoreProperties.CREATOR,   // 0x04
+       null,
+       TikaCoreProperties.COMMENTS,// 0x06 
+       TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,    // 0x07
+       TikaCoreProperties.MODIFIER, // 0x08
+   };
+
+    /** Text of the marker that skipTo2000PropertyInfoSection scans the stream for. */
+    private static final String HEADER_2000_PROPERTIES_MARKER_STR =
+            "DWGPROPS COOKIE";
+
+    /** Byte form of the marker; sized at one byte per character of the marker text. */
+    private static final byte[] HEADER_2000_PROPERTIES_MARKER =
+            new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
+
+    // Fill the marker bytes from the marker text, starting at offset 0.
+    // NOTE(review): presumably putCompressedUnicode writes one byte per
+    // character - confirm against the POI StringUtil documentation.
+    static {
+        StringUtil.putCompressedUnicode(
+                HEADER_2000_PROPERTIES_MARKER_STR,
+                HEADER_2000_PROPERTIES_MARKER, 0);
+    }
+
+    /** 
+     * How far to skip after the last standard property, before
+     *  we find any custom properties that might be there.
+     */
+    private static final int CUSTOM_PROPERTIES_SKIP = 20;
+    
+    /** 
+     * The value of padding bytes other than 0 in some DWG files.
+     */
+    private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new 
int[] {0x2, 0, 0, 0};
+
+    /**
+     * Reads the 128 byte file header, picks the property reader matching the
+     * version string in its first six bytes, and emits the properties found as
+     * metadata and XHTML paragraphs.
+     *
+     * @param stream the DWG document
+     * @param handler receives the XHTML output
+     * @param metadata receives the content type and the extracted properties
+     * @param context the parse context (not consulted)
+     * @throws TikaException if the version string is not one of the handled ones
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, TikaException, SAXException {
+        // The version string sits at the very start of the header
+        byte[] header = new byte[128];
+        IOUtils.readFully(stream, header);
+        String version = new String(header, 0, 6, "US-ASCII");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        switch (version) {
+            case "AC1015":
+                metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+                if (skipTo2000PropertyInfoSection(stream, header)) {
+                    get2000Props(stream, metadata, xhtml);
+                }
+                break;
+            case "AC1018":
+                metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+                if (skipToPropertyInfoSection(stream, header)) {
+                    get2004Props(stream, metadata, xhtml);
+                }
+                break;
+            case "AC1021":
+            case "AC1024":
+                metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+                if (skipToPropertyInfoSection(stream, header)) {
+                    get2007and2010Props(stream, metadata, xhtml);
+                }
+                break;
+            default:
+                throw new TikaException(
+                        "Unsupported AutoCAD drawing version: " + version);
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Reads the standard and then the custom properties of an AC1018 file.
+     * Stored as US-ASCII
+     */
+    private void get2004Props(
+            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, TikaException, SAXException {
+        // Standard properties, one string per header slot, in slot order
+        int slot = 0;
+        while (slot < HEADER_PROPERTIES_ENTRIES.length) {
+            handleHeader(slot, read2004String(stream), metadata, xhtml);
+            slot++;
+        }
+
+        // Custom properties are stored as name / value string pairs
+        for (int remaining = skipToCustomProperties(stream); remaining > 0; remaining--) {
+            String name = read2004String(stream);
+            String val = read2004String(stream);
+            if (name.isEmpty() || val.isEmpty()) {
+                continue;
+            }
+            metadata.add(name, val);
+        }
+    }
+
+    /**
+     * Reads one string that is prefixed by its length as a little-endian
+     * unsigned short and stored one byte per character.
+     *
+     * @param stream the stream positioned at the length prefix
+     * @return the string without a trailing null terminator, or the empty
+     *         string when the stored length is zero
+     */
+    private String read2004String(InputStream stream) throws IOException, TikaException {
+        int stringLen = EndianUtils.readUShortLE(stream);
+        if (stringLen == 0) {
+            // Nothing stored; the terminator check below would index element -1
+            return "";
+        }
+
+        byte[] stringData = new byte[stringLen];
+        IOUtils.readFully(stream, stringData);
+
+        // Often but not always null terminated
+        if (stringData[stringLen - 1] == 0) {
+            stringLen--;
+        }
+        return StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
+    }
+
+    /**
+     * Reads the standard and then the custom properties of an AC1021 or
+     * AC1024 file.
+     * Stored as UCS2, so 16 bit "unicode"
+     */
+    private void get2007and2010Props(
+            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, TikaException, SAXException {
+        // Standard properties, one string per header slot, in slot order
+        int slot = 0;
+        while (slot < HEADER_PROPERTIES_ENTRIES.length) {
+            handleHeader(slot, read2007and2010String(stream), metadata, xhtml);
+            slot++;
+        }
+
+        // Custom properties are stored as name / value string pairs
+        for (int remaining = skipToCustomProperties(stream); remaining > 0; remaining--) {
+            String name = read2007and2010String(stream);
+            String val = read2007and2010String(stream);
+            if (name.isEmpty() || val.isEmpty()) {
+                continue;
+            }
+            metadata.add(name, val);
+        }
+    }
+
+    /**
+     * Reads one string that is prefixed by its character count as a
+     * little-endian unsigned short and stored two bytes per character.
+     *
+     * @param stream the stream positioned at the length prefix
+     * @return the string without a trailing null terminator; empty when the
+     *         stored character count is zero
+     */
+    private String read2007and2010String(InputStream stream) throws IOException, TikaException {
+        int stringLen = EndianUtils.readUShortLE(stream);
+
+        byte[] stringData = new byte[stringLen * 2];
+        IOUtils.readFully(stream, stringData);
+        String value = StringUtil.getFromUnicodeLE(stringData);
+
+        // Some strings are null terminated. Guard against the empty string,
+        //  for which charAt(length() - 1) would throw.
+        if (!value.isEmpty() && value.charAt(value.length() - 1) == 0) {
+            value = value.substring(0, value.length() - 1);
+        }
+
+        return value;
+    }
+
+    /**
+     * Reads the indexed property records of an AC1015 file. Each record is a
+     * little-endian unsigned short index, an unsigned short length, a one byte
+     * value type and then the value bytes. At most 30 records are read, and
+     * reading stops early at index 90.
+     *
+     * String values (type 0x1e) whose index falls inside
+     * HEADER_2000_PROPERTIES_ENTRIES are written out as a paragraph and, when
+     * that slot names a property, added to the metadata. Index 0x012c carries
+     * a custom property as "name=value". All other records are skipped.
+     */
+    private void get2000Props(
+            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, TikaException, SAXException {
+        int propCount = 0;
+        while (propCount < 30) {
+            int propIdx = EndianUtils.readUShortLE(stream);
+            int length = EndianUtils.readUShortLE(stream);
+            int valueType = stream.read();
+
+            if (propIdx == 0x28) {
+                // This one seems not to follow the pattern
+                length = 0x19;
+            } else if (propIdx == 90) {
+                // We think this means the end of properties
+                break;
+            }
+
+            byte[] value = new byte[length];
+            IOUtils.readFully(stream, value);
+            if (valueType == 0x1e) {
+                // Normal string, good
+                String val = StringUtil.getFromCompressedUnicode(value, 0, length);
+
+                // Is it one we can look up by index?
+                if (propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
+                    Property headerProp = HEADER_2000_PROPERTIES_ENTRIES[propIdx];
+                    // Some slots are deliberately null (no known property);
+                    //  as in handleHeader, only the metadata entry is skipped
+                    if (headerProp != null) {
+                        metadata.add(headerProp, val);
+                    }
+                    xhtml.element("p", val);
+                } else if (propIdx == 0x012c) {
+                    int splitAt = val.indexOf('=');
+                    if (splitAt > -1) {
+                        String propName = val.substring(0, splitAt);
+                        String propVal = val.substring(splitAt + 1);
+                        metadata.add(propName, propVal);
+                    }
+                }
+            }
+            // Any other value type: no idea what it holds, so it is ignored
+
+            propCount++;
+        }
+    }
+
+    /**
+     * Records one standard header value. Null or empty values are ignored.
+     * Otherwise the value is stored under the property of its header slot, if
+     * that slot has one, and is always written out as a paragraph.
+     *
+     * @param headerNumber index into HEADER_PROPERTIES_ENTRIES
+     * @param value the header text that was read
+     * @param metadata receives the property value
+     * @param xhtml receives the paragraph
+     */
+    private void handleHeader(
+            int headerNumber, String value, Metadata metadata,
+            XHTMLContentHandler xhtml) throws SAXException {
+        if (value != null && !value.isEmpty()) {
+            Property slotProperty = HEADER_PROPERTIES_ENTRIES[headerNumber];
+            if (slotProperty != null) {
+                metadata.set(slotProperty, value);
+            }
+            xhtml.element("p", value);
+        }
+    }
+
+    /**
+     * Grab the offset, then skip there
+     *
+     * @param stream the stream, positioned just after the header
+     * @param header the header bytes already read from the stream
+     * @return true if the stream was advanced to the stored offset, false if
+     *         the stored offset is zero, negative or implausibly large
+     */
+    private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
+            throws IOException, TikaException {
+        // The offset is stored in the header from 0x20 onwards
+        long offsetToSection = EndianUtils.getLongLE(header, 0x20);
+
+        // Sanity check the offset. Some files seem to use a different format,
+        //  and the offset isn't available at 0x20. Until we can work out how
+        //  to find the offset in those files, skip them if detected.
+        // Header should never be more than 10mb into the file, and a negative
+        //  offset can never be valid either.
+        if (offsetToSection <= 0 || offsetToSection > 0xa00000L) {
+            return false;
+        }
+
+        // The header has already been consumed, so only skip the remainder
+        long toSkip = offsetToSection - header.length;
+        while (toSkip > 0) {
+            byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
+            IOUtils.readFully(stream, skip);
+            toSkip -= skip.length;
+        }
+        return true;
+    }
+
+    /**
+     * Scans the stream byte by byte for the HEADER_2000_PROPERTIES_MARKER
+     * bytes and stops right after them.
+     * We think it can be anywhere...
+     *
+     * @param stream the stream to scan
+     * @param header the file header (not used by this method)
+     * @return true if the marker was found, false if the stream ended first
+     */
+    private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] 
header)
+            throws IOException {
+       int val = 0;
+       // read() returns -1 at end of stream, which ends the scan
+       while(val != -1) {
+          val = stream.read();
+          if(val == HEADER_2000_PROPERTIES_MARKER[0]) {
+             // First marker byte seen, now the rest must follow directly
+             boolean going = true;
+             for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; 
i++) {
+                val = stream.read();
+                if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false;
+             }
+             // NOTE(review): the byte that broke a partial match is not
+             //  re-tested as a possible start of the marker
+             if(going) {
+                // Bingo, found it
+                return true;
+             }
+          }
+       }
+       return false;
+    }
+
+    /**
+     * Moves past the padding that follows the standard properties and reads
+     * the number of custom properties.
+     *
+     * @param stream the stream positioned just after the standard properties
+     * @return the custom property count, or 0 when the padding is not
+     *         recognised or the count is outside 1..0x7e
+     */
+    private int skipToCustomProperties(InputStream stream)
+            throws IOException, TikaException {
+        // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
+        byte[] padding = new byte[4];
+        IOUtils.readFully(stream, padding);
+
+        boolean zeroPadding = true;
+        boolean altPadding = true;
+        for (int i = 0; i < padding.length; i++) {
+            zeroPadding &= padding[i] == 0;
+            altPadding &= padding[i] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[i];
+        }
+        if (!zeroPadding && !altPadding) {
+            // No padding. That probably means no custom props
+            return 0;
+        }
+
+        // Looks hopeful, skip on
+        byte[] skipped = new byte[CUSTOM_PROPERTIES_SKIP];
+        IOUtils.readFully(stream, skipped);
+
+        // We should now have the count; only trust it when it looks plausible
+        int count = EndianUtils.readUShortLE(stream);
+        boolean plausible = count > 0 && count < 0x7f;
+        return plausible ? count : 0;
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+/**
+ * A basic text extracting parser for the CADKey PRT (CAD Drawing)
+ *  format. It outputs text from note entries.
+ */
+
+public class PRTParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 4659638314375035178L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
+    public static final String PRT_MIME_TYPE = "application/x-prt";
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+       return SUPPORTED_TYPES;
+    }
+
+    /**
+     * How long do we allow a text run to claim to be, before we
+     * decide we're confused and it's not really text after all?
+     */
+    private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
+    
+    /*
+     * Text types:
+     *   00 00 00 00 f0 [3b]f sz sz TEXT     *view name*
+     *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT  *view name*
+     *   (anything)  e0 3f sz sz TEXT    *view name*
+     *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT    *note entries*
+     *   
+     *  Note - all text is null terminated
+     */
+      
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, 
+          ParseContext context) throws IOException, SAXException, TikaException {
+       
+       XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+       Last5 l5 = new Last5();
+       int read;
+       
+       // Try to get the creation date, which is YYYYMMDDhhmm
+       byte[] header = new byte[30];
+       IOUtils.readFully(stream, header);
+       byte[] date = new byte[12];
+       IOUtils.readFully(stream, date);
+       
+       String dateStr = new String(date, US_ASCII);
+       if(dateStr.startsWith("19") || dateStr.startsWith("20")) {
+          String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4,6) +
+             "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + ":" +
+             dateStr.substring(10, 12) + ":00";
+          metadata.set(TikaCoreProperties.CREATED, formattedDate);
+          // TODO Metadata.DATE is used as modified, should it be here?
+          metadata.set(Metadata.DATE, formattedDate);
+       }
+       metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+       
+       // The description, if set, is the next up-to-500 bytes
+       byte[] desc = new byte[500];
+       IOUtils.readFully(stream, desc);
+       String description = extractText(desc, true);
+       if(description.length() > 0) {
+          metadata.set(TikaCoreProperties.DESCRIPTION, description);
+       }
+       
+       // Now look for text
+       while( (read = stream.read()) > -1) {
+          if(read == 0xe0 || read == 0xe3 || read == 0xf0) {
+             int nread = stream.read();
+             if(nread == 0x3f || nread == 0xbf) {
+                // Looks promising, check back for a suitable value
+                if(read == 0xe3 && nread == 0x3f) {
+                   if(l5.is33()) {
+                      // Bingo, note text
+                      handleNoteText(stream, xhtml);
+                   }
+                } else if(l5.is00()) {
+                   // Likely view name
+                   handleViewName(read, nread, stream, xhtml, l5);
+                }
+             }
+          } else {
+             l5.record(read);
+          }
+       }
+    }
+    
+    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml) 
+    throws IOException, SAXException, TikaException {
+       // Ensure we have the right padding text
+       int read;
+       for(int i=0; i<10; i++) {
+          read = stream.read();
+          if(read >= 0 && read <= 0x0f) {
+             // Promising
+          } else {
+             // Wrong, false detection
+             return;
+          }
+       }
+       read = stream.read();
+       if(read != 0x1f) {
+          // Wrong, false detection
+          return;
+       }
+       
+       int length = EndianUtils.readUShortLE(stream);
+       if(length <= MAX_SANE_TEXT_LENGTH) {
+          // Length sanity check passed
+          handleText(length, stream, xhtml);
+       }
+    }
+    
+    private void handleViewName(int typeA, int typeB, InputStream stream, 
+          XHTMLContentHandler xhtml, Last5 l5) 
+    throws IOException, SAXException, TikaException {
+       // Is it 8 byte zero padded?
+       int maybeLength = EndianUtils.readUShortLE(stream);
+       if(maybeLength == 0) {
+          // Check the next 6 bytes too
+          for(int i=0; i<6; i++) {
+             int read = stream.read();
+             if(read >= 0 && read <= 0x0f) {
+                // Promising
+             } else {
+                // Wrong, false detection
+                return;
+             }
+          }
+          
+          byte[] b2 = new byte[2];
+          IOUtils.readFully(stream, b2);
+          int length = EndianUtils.getUShortLE(b2);
+          if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
+             // Length sanity check passed
+             handleText(length, stream, xhtml);
+          } else {
+             // Was probably something else
+             l5.record(b2[0]);
+             l5.record(b2[1]);
+          }
+       } else if(maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
+          // Looks like it's straight into the text
+          handleText(maybeLength, stream, xhtml);
+       }
+    }
+    
+    private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml) 
+    throws IOException, SAXException, TikaException {
+       byte[] str = new byte[length];
+       IOUtils.readFully(stream, str);
+       if(str[length-1] != 0) {
+          // Not properly null terminated, must be wrong
+          return;
+       }
+       
+       String text = extractText(str, false);
+       
+       xhtml.startElement("p");
+       xhtml.characters(text);
+       xhtml.endElement("p");
+    }
+    
+    /**
+     * Does our best to turn the bytes into text
+     */
+    private String extractText(byte[] data, boolean trim) throws TikaException {
+       // The text is always stored null terminated, but sometimes
+       //  may have extra null padding too
+       int length = data.length - 1;
+       if(trim) {
+          for(int i=0; i<data.length; i++) {
+             if(data[i] == 0) {
+                length = i;
+                break;
+             }
+          }
+       }
+       
+       // We believe that the text is basically stored as CP437
+       // That said, there are a few characters slightly wrong for that...
+       String text;
+       try {
+          text = new String(data, 0, length, "cp437");
+       } catch(UnsupportedEncodingException e) {
+          throw new TikaException("JVM Broken, core codepage CP437 missing!");
+       }
+       
+       // Fix up the known character issues
+       text = text.replace("\u03C6","\u00D8");
+
+       // All done, as best as we can!
+       return text;
+    }
+    
+    /**
+     * Provides a view on the previous 5 bytes
+     */
+    private static class Last5 {
+       byte[] data = new byte[5];
+       int pos = 0;
+       
+       private void record(int b) {
+          data[pos] = (byte)b;
+          pos++;
+          if(pos >= data.length) {
+             pos = 0;
+          }
+       }
+       
+       private byte[] get() {
+          byte[] ret = new byte[5];
+          for(int i=0; i<ret.length; i++) {
+             int p = pos - i;
+             if(p < 0) { p += ret.length; }
+             ret[i] = data[p];
+          }
+          return ret;
+       }
+       
+       private boolean is33() {
+          byte[] last5 = get();
+          for(byte b : last5) {
+             if(b != 0x33) return false;
+          }
+          return true;
+       }
+       
+       private boolean is00() {
+          byte[] last5 = get();
+          for(byte b : last5) {
+             if(b != 0x00) return false;
+          }
+          return true;
+       }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,18 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+
+org.apache.tika.parser.dwg.DWGParser
+#org.apache.tika.parser.prt.PRTParser

Added: 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.apache.tika.TikaTest.assertContains;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class DWGParserTest {
+  
+    @Test
+    public void testDWG2000Parser() throws Exception {
+        InputStream input = DWGParserTest.class.getResourceAsStream(
+                "/test-documents/testDWG2000.dwg");
+        testParserAlt(input);
+    }
+
+    @Test
+    public void testDWG2004Parser() throws Exception {
+        InputStream input = DWGParserTest.class.getResourceAsStream(
+                "/test-documents/testDWG2004.dwg");
+        testParser(input);
+    }
+
+    @Test
+    public void testDWG2004ParserNoHeaderAddress() throws Exception {
+        InputStream input = DWGParserTest.class.getResourceAsStream(
+                "/test-documents/testDWG2004_no_header.dwg");
+        testParserNoHeader(input);
+    }
+
+    @Test
+    public void testDWG2007Parser() throws Exception {
+        InputStream input = DWGParserTest.class.getResourceAsStream(
+                "/test-documents/testDWG2007.dwg");
+        testParser(input);
+    }
+
+    @Test
+    public void testDWG2010Parser() throws Exception {
+        InputStream input = DWGParserTest.class.getResourceAsStream(
+                "/test-documents/testDWG2010.dwg");
+        testParser(input);
+    }
+    
+    @Test
+    public void testDWG2010CustomPropertiesParser() throws Exception {
+        // Check that standard parsing works
+        InputStream testInput = DWGParserTest.class.getResourceAsStream(
+                "/test-documents/testDWG2010_custom_props.dwg");
+        testParser(testInput);
+        
+        // Check that custom properties with alternate padding work
+        try (InputStream input = DWGParserTest.class.getResourceAsStream(
+                "/test-documents/testDWG2010_custom_props.dwg")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DWGParser().parse(input, handler, metadata, null);
+
+            assertEquals("valueforcustomprop1",
+                    metadata.get("customprop1"));
+            assertEquals("valueforcustomprop2",
+                    metadata.get("customprop2"));
+        }
+    }
+
+    @Test
+    public void testDWGMechParser() throws Exception {
+        String[] types = new String[] {
+              "6", "2004", "2004DX", "2005", "2006",
+              "2007", "2008", "2009", "2010", "2011"
+        };
+        for (String type : types) {
+           InputStream input = DWGParserTest.class.getResourceAsStream(
+                   "/test-documents/testDWGmech"+type+".dwg");
+           testParserAlt(input);
+        }
+    }
+
+    @SuppressWarnings("deprecation")
+    private void testParser(InputStream input) throws Exception {
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DWGParser().parse(input, handler, metadata);
+
+            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+
+            assertEquals("The quick brown fox jumps over the lazy dog", 
+                    metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Gym class featuring a brown fox and lazy dog",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("Gym class featuring a brown fox and lazy dog",
+                    metadata.get(Metadata.SUBJECT));
+            assertEquals("Nevin Nollop",
+                    metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("Pangram, fox, dog",
+                    metadata.get(TikaCoreProperties.KEYWORDS));
+            assertEquals("Lorem ipsum",
+                    metadata.get(TikaCoreProperties.COMMENTS).substring(0,11));
+            assertEquals("http://www.alfresco.com",
+                    metadata.get(TikaCoreProperties.RELATION));
+            
+            // Check some of the old style metadata too
+            assertEquals("The quick brown fox jumps over the lazy dog", 
+                  metadata.get(Metadata.TITLE));
+            assertEquals("Gym class featuring a brown fox and lazy dog",
+                  metadata.get(Metadata.SUBJECT));
+
+            String content = handler.toString();
+            assertContains("The quick brown fox jumps over the lazy dog", content);
+            assertContains("Gym class", content);
+            assertContains("www.alfresco.com", content);
+        } finally {
+            input.close();
+        }
+    }
+
+    @SuppressWarnings("deprecation")
+    private void testParserNoHeader(InputStream input) throws Exception {
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DWGParser().parse(input, handler, metadata);
+
+            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+            
+            assertNull(metadata.get(TikaCoreProperties.TITLE));
+            assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertNull(metadata.get(Metadata.SUBJECT));
+            assertNull(metadata.get(TikaCoreProperties.CREATOR));
+            assertNull(metadata.get(TikaCoreProperties.KEYWORDS));
+            assertNull(metadata.get(TikaCoreProperties.COMMENTS));
+            assertNull(metadata.get(TikaCoreProperties.RELATION));
+
+            String content = handler.toString();
+            assertEquals("", content);
+        } finally {
+            input.close();
+        }
+    }
+
+    @SuppressWarnings("deprecation")
+    private void testParserAlt(InputStream input) throws Exception {
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DWGParser().parse(input, handler, metadata);
+
+            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+
+            assertEquals("Test Title", 
+                    metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Test Subject",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("Test Subject",
+                    metadata.get(Metadata.SUBJECT));
+            assertEquals("My Author",
+                    metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("My keyword1, MyKeyword2",
+                    metadata.get(TikaCoreProperties.KEYWORDS));
+            assertEquals("This is a comment",
+                    metadata.get(TikaCoreProperties.COMMENTS));
+            assertEquals("bejanpol",
+                    metadata.get(TikaCoreProperties.MODIFIER));
+            assertEquals("bejanpol",
+                    metadata.get(Metadata.LAST_AUTHOR));
+            assertEquals("http://mycompany/drawings",
+                    metadata.get(TikaCoreProperties.RELATION));
+            assertEquals("MyCustomPropertyValue",
+                  metadata.get("MyCustomProperty"));
+
+            String content = handler.toString();
+            assertContains("This is a comment", content);
+            assertContains("mycompany", content);
+        } finally {
+            input.close();
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class PRTParserTest extends TikaTest {
+    /**
+     * Try with a simple file
+     */
+    @Test
+    public void testPRTParserBasics() throws Exception {
+       try (InputStream input = getResourceAsStream("/test-documents/testCADKEY.prt")) {
+          Metadata metadata = new Metadata();
+          ContentHandler handler = new BodyContentHandler();
+          new PRTParser().parse(input, handler, metadata);
+
+          assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+          // This file has a date
+          assertEquals("2011-06-20T16:54:00",
+                  metadata.get(TikaCoreProperties.CREATED));
+          assertEquals("2011-06-20T16:54:00",
+                  metadata.get(Metadata.CREATION_DATE));
+          // But no description
+          assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
+
+          String contents = handler.toString();
+
+          assertContains("Front View", contents);
+          assertContains("Back View", contents);
+          assertContains("Bottom View", contents);
+          assertContains("Right View", contents);
+          assertContains("Left View", contents);
+          //assertContains("Isometric View", contents); // Can't detect yet
+          assertContains("Axonometric View", contents);
+
+          assertContains("You've managed to extract all the text!", contents);
+          assertContains("This is more text", contents);
+          assertContains("Text Inside a PRT file", contents);
+       }
+    }
+
+    /**
+     * Now a more complex one
+     */
+    @Test
+    public void testPRTParserComplex() throws Exception {
+       try (InputStream input = getResourceAsStream("/test-documents/testCADKEY2.prt")) {
+          Metadata metadata = new Metadata();
+          ContentHandler handler = new BodyContentHandler();
+          new PRTParser().parse(input, handler, metadata);
+
+          assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+          // File has both a date and a description
+          assertEquals("1997-04-01T08:59:00",
+                  metadata.get(Metadata.DATE));
+          assertEquals("1997-04-01T08:59:00",
+                  metadata.get(Metadata.CREATION_DATE));
+          assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
+                  metadata.get(TikaCoreProperties.DESCRIPTION));
+
+          String contents = handler.toString();
+
+          assertContains("ITEM", contents);
+          assertContains("REQ.", contents);
+          assertContains("DESCRIPTION", contents);
+          assertContains("MAT'L", contents);
+          assertContains("TOLERANCES UNLESS", contents);
+          assertContains("FRACTIONS", contents);
+          assertContains("ANGLES", contents);
+          assertContains("Acme Corporation", contents);
+
+          assertContains("DATE", contents);
+          assertContains("CHANGE", contents);
+          assertContains("DRAWN BY", contents);
+          assertContains("SCALE", contents);
+          assertContains("TIKA TEST DRAWING", contents);
+          assertContains("TIKA LETTERS", contents);
+          assertContains("5.82", contents);
+          assertContains("112" + '\u00b0', contents); // Degrees
+          assertContains("TIKA TEST LETTER", contents);
+          assertContains("17.11", contents);
+          assertContains('\u00d8' + "\ufffd2.000", contents); // Diameter
+          assertContains("Diameter", contents);
+          assertContains("The Apache Tika toolkit", contents);
+       }
+    }
+}

svn commit: r1723223 [2/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ tik...

Reply via email to