Added: tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.ner; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * + * This implementation of {@link org.apache.tika.parser.Parser} extracts + * entity names from text content and adds it to the metadata. 
+ * <p>All the metadata keys will have a common suffix {@value #MD_KEY_PREFIX}</p> + * <p>The Named Entity recogniser implementation can be changed by setting the + * system property {@value #SYS_PROP_NER_IMPL} value to a name of class that + * implements {@link NERecogniser} contract</p> + * @see OpenNLPNERecogniser + * @see NERecogniser + * + */ +public class NamedEntityParser extends AbstractParser { + + public static final Logger LOG = LoggerFactory.getLogger(NamedEntityParser.class); + public static final Set<MediaType> MEDIA_TYPES = new HashSet<>(); + public static final String MD_KEY_PREFIX = "NER_"; + public static final String DEFAULT_NER_IMPL = OpenNLPNERecogniser.class.getName(); + public static final String SYS_PROP_NER_IMPL = "ner.impl.class"; + + public Tika secondaryParser; + + static { + MEDIA_TYPES.add(MediaType.TEXT_PLAIN); + } + + private List<NERecogniser> nerChain; + private volatile boolean initialized = false; + private volatile boolean available = false; + + private synchronized void initialize(ParseContext context) { + if (initialized) { + return; + } + initialized = true; + + //TODO: read class name from context or config + //There can be multiple classes in the form of comma separated class names; + String classNamesString = System.getProperty(SYS_PROP_NER_IMPL, + DEFAULT_NER_IMPL); + String[] classNames = classNamesString.split(","); + this.nerChain = new ArrayList<>(classNames.length); + for (String className : classNames) { + className = className.trim(); + LOG.info("going to load, instantiate and bind the instance of {}", + className); + try { + NERecogniser recogniser = + (NERecogniser) Class.forName(className).newInstance(); + LOG.info("{} is available ? 
{}", className, + recogniser.isAvailable()); + if (recogniser.isAvailable()) { + nerChain.add(recogniser); + } + } catch (Exception e) { + LOG.error(e.getMessage(), e); + } + } + try { + TikaConfig config = new TikaConfig(); + this.secondaryParser = new Tika(config); + this.available = !nerChain.isEmpty(); + LOG.info("Number of NERecognisers in chain {}", nerChain.size()); + } catch (Exception e){ + LOG.error(e.getMessage(), e); + this.available = false; + } + } + + public Set<MediaType> getSupportedTypes(ParseContext parseContext) { + return MEDIA_TYPES; + } + + public void parse(InputStream inputStream, ContentHandler contentHandler, + Metadata metadata, ParseContext parseContext) + throws IOException, SAXException, TikaException { + + if (!initialized) { + initialize(parseContext); + } + if (!available) { + return; + } + + Reader reader = MediaType.TEXT_PLAIN.toString() + .equals(metadata.get(Metadata.CONTENT_TYPE)) + ? new InputStreamReader(inputStream, StandardCharsets.UTF_8) + : secondaryParser.parse(inputStream); + + String text = IOUtils.toString(reader); + IOUtils.closeQuietly(reader); + + for (NERecogniser ner : nerChain) { + Map<String, Set<String>> names = ner.recognise(text); + if (names != null) { + for (Map.Entry<String, Set<String>> entry : names.entrySet()) { + if (entry.getValue() != null) { + String mdKey = MD_KEY_PREFIX + entry.getKey(); + for (String name : entry.getValue()) { + metadata.add(mdKey, name); + } + } + } + } + } + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.tika.parser.ner.corenlp;

import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.ner.NERecogniser;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

/**
 * This class offers an implementation of {@link NERecogniser} based on
 * CRF classifiers from Stanford CoreNLP. The CoreNLP classes are bound
 * reflectively at runtime, so this recogniser needs additional setup;
 * when the classes or the model cannot be loaded the instance reports
 * itself as unavailable instead of failing.
 * See <a href="http://wiki.apache.org/tika/TikaAndNER#CoreNLP">
 * Tika NER Wiki</a> for configuring this recogniser.
 * @see NERecogniser
 *
 */
public class CoreNLPNERecogniser implements NERecogniser {

    private static final Logger LOG = LoggerFactory.getLogger(CoreNLPNERecogniser.class);

    //default model paths
    public static final String NER_3CLASS_MODEL = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
    public static final String NER_4CLASS_MODEL = "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz";
    public static final String NER_7CLASS_MODEL = "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz";
    /**
     * default Model path
     */
    public static final String DEFAULT_MODEL_PATH = NER_7CLASS_MODEL;
    public static final String MODEL_PROP_NAME = "ner.corenlp.model";

    public static final Set<String> ENTITY_TYPES = new HashSet<>();

    static {
        // plain static initialiser instead of an anonymous HashSet subclass
        ENTITY_TYPES.add(PERSON);
        ENTITY_TYPES.add(TIME);
        ENTITY_TYPES.add(LOCATION);
        ENTITY_TYPES.add(ORGANIZATION);
        ENTITY_TYPES.add(MONEY);
        ENTITY_TYPES.add(PERCENT);
        ENTITY_TYPES.add(DATE);
    }

    private static final String CLASSIFIER_CLASS_NAME = "edu.stanford.nlp.ie.crf.CRFClassifier";

    private boolean available = false;
    private Field firstField;
    private Field secondField;
    private Field thirdField;
    private Object classifierInstance;
    private Method classifyMethod;

    /**
     * Creates a recogniser using the model path given by the system property
     * {@value #MODEL_PROP_NAME}, falling back to {@link #DEFAULT_MODEL_PATH}.
     */
    public CoreNLPNERecogniser() {
        this(System.getProperty(MODEL_PROP_NAME, DEFAULT_MODEL_PATH));
    }

    /**
     * Creates a NERecogniser by loading model from given path
     * @param modelPath path to NER model file
     */
    public CoreNLPNERecogniser(String modelPath) {
        try {
            Properties props = new Properties();
            Class<?> classifierClass = Class.forName(CLASSIFIER_CLASS_NAME);
            Method loadMethod = classifierClass.getMethod("getClassifier", String.class, Properties.class);
            classifierInstance = loadMethod.invoke(classifierClass, modelPath, props);
            classifyMethod = classifierClass.getMethod("classifyToCharacterOffsets", String.class);

            //these fields are for accessing result
            Class<?> tripleClass = Class.forName("edu.stanford.nlp.util.Triple");
            this.firstField = tripleClass.getField("first");
            this.secondField = tripleClass.getField("second");
            this.thirdField = tripleClass.getField("third");
            this.available = true;
        } catch (Exception e) {
            // Reflective failures wrap the real problem and often carry a null
            // message, so report the underlying cause including its type.
            Throwable cause = e.getCause() != null ? e.getCause() : e;
            LOG.warn("{} while trying to load the model from {}", cause.toString(), modelPath);
        }
        LOG.info("Available for service ? {}", available);
    }

    /**
     *
     * @return {@code true} if model was available, valid and was able to initialise the classifier.
     * returns {@code false} when this recogniser is not available for service.
     */
    @Override
    public boolean isAvailable() {
        return available;
    }

    /**
     * Gets set of entity types recognised by this recogniser
     * @return set of entity classes/types
     */
    @Override
    public Set<String> getEntityTypes() {
        return ENTITY_TYPES;
    }

    /**
     * recognises names of entities in the text
     * @param text text which possibly contains names
     * @return map of entity type -> set of names; empty when this recogniser
     *         is not available or the classifier fails
     */
    @Override
    public Map<String, Set<String>> recognise(String text) {
        Map<String, Set<String>> names = new HashMap<>();
        if (!available) {
            // classifier was never bound; nothing to invoke
            return names;
        }
        try {
            List<?> entries = (List<?>) classifyMethod.invoke(classifierInstance, text);
            for (Object entry : entries) {
                String entityType = (String) firstField.get(entry);
                Set<String> typedNames = names.get(entityType);
                if (typedNames == null) {
                    typedNames = new HashSet<>();
                    names.put(entityType, typedNames);
                }
                Integer start = (Integer) secondField.get(entry);
                Integer end = (Integer) thirdField.get(entry);
                String name = text.substring(start, end);
                //Clean repeating spaces, replace line breaks and tabs with single space
                name = name.trim().replaceAll("(\\s\\s+)|\n|\t", " ");
                if (!name.isEmpty()) {
                    typedNames.add(name);
                }
            }
        } catch (Exception e) {
            LOG.debug(e.getMessage(), e);
        }
        return names;
    }

    /**
     * Command line entry point: prints the names found in a text file as JSON.
     * @param args a single argument, the path to the text file
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 1) {
            System.err.println("Error: Invalid Args");
            System.err.println("This tool finds names inside text");
            System.err.println("Usage: <path/to/text/file>");
            return;
        }

        try (FileInputStream stream = new FileInputStream(args[0])) {
            String text = IOUtils.toString(stream);
            CoreNLPNERecogniser ner = new CoreNLPNERecogniser();
            Map<String, Set<String>> names = ner.recognise(text);
            JSONObject jNames = new JSONObject(names);
            System.out.println(jNames.toString(2));
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.ner.opennlp; + +import org.apache.tika.parser.ner.NERecogniser; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +/** + * + * This implementation of {@link NERecogniser} chains an array of + * {@link OpenNLPNameFinder}s for which NER models are + * available in classpath. 
+ * + * The following models are scanned during initialization via class loader.: + * + * <table> + * <tr> + * <th>Entity Type</th><th>Path</th> + * </tr> + * <tr> + * <td>{@value PERSON}</td><td> {@value PERSON_FILE}</td> + * </tr> + * <tr> + * <td>{@value LOCATION}</td><td>{@value LOCATION_FILE}</td> + * </tr> + * <tr> + * <td>{@value ORGANIZATION}</td><td>{@value ORGANIZATION_FILE}</td> + * </tr> + * <tr> + * <td>{@value TIME}</td><td>{@value TIME_FILE}</td> + * </tr> + * <tr> + * <td>{@value DATE}</td><td>{@value DATE_FILE}</td> + * </tr> + * <tr> + * <td>{@value PERCENT}</td><td>{@value PERCENT_FILE}</td> + * </tr> + * <tr> + * <td>{@value MONEY}</td><td>{@value MONEY_FILE}</td> + * </tr> + * </table> + * + * @see org.apache.tika.parser.ner.NamedEntityParser#DEFAULT_NER_IMPL + */ +public class OpenNLPNERecogniser implements NERecogniser { + + public static final String MODELS_DIR = OpenNLPNERecogniser.class + .getPackage().getName().replace(".", "/"); + public static final String PERSON_FILE = "ner-person.bin"; + public static final String LOCATION_FILE = "ner-location.bin"; + public static final String ORGANIZATION_FILE = "ner-organization.bin"; + public static final String TIME_FILE = "ner-time.bin"; + public static final String DATE_FILE = "ner-date.bin"; + public static final String PERCENT_FILE = "ner-percentage.bin"; + public static final String MONEY_FILE = "ner-money.bin"; + + + //Default (English) Models for the common 7 classes of named types + public static final String NER_PERSON_MODEL = MODELS_DIR + "/" + PERSON_FILE; + public static final String NER_LOCATION_MODEL = MODELS_DIR + "/" + LOCATION_FILE; + public static final String NER_ORGANIZATION_MODEL = MODELS_DIR + "/" + ORGANIZATION_FILE; + public static final String NER_TIME_MODEL = MODELS_DIR + "/" + TIME_FILE; + public static final String NER_DATE_MODEL = MODELS_DIR + "/" + DATE_FILE; + public static final String NER_PERCENT_MODEL = MODELS_DIR + "/" + PERCENT_FILE; + public static final 
String NER_MONEY_MODEL = MODELS_DIR + "/" + MONEY_FILE; + + public static final Map<String, String> DEFAULT_MODELS = + new HashMap<String, String>(){{ + put(PERSON, NER_PERSON_MODEL); + put(LOCATION, NER_LOCATION_MODEL); + put(ORGANIZATION, NER_ORGANIZATION_MODEL); + put(TIME, NER_TIME_MODEL); + put(DATE, NER_DATE_MODEL); + put(PERCENT, NER_PERCENT_MODEL); + put(MONEY, NER_MONEY_MODEL); + }}; + + private Set<String> entityTypes; + private List<OpenNLPNameFinder> nameFinders; + private boolean available; + + /** + * Creates a default chain of Name finders using default OpenNLP recognizers + */ + public OpenNLPNERecogniser(){ + this(DEFAULT_MODELS); + } + + /** + * Creates a chain of Named Entity recognisers + * @param models map of entityType -> model path + * NOTE: the model path should be known to class loader. + */ + public OpenNLPNERecogniser(Map<String, String> models){ + this.nameFinders = new ArrayList<>(); + this.entityTypes = new HashSet<>(); + for (Map.Entry<String, String> entry : models.entrySet()) { + OpenNLPNameFinder finder = + new OpenNLPNameFinder(entry.getKey(), entry.getValue()); + if (finder.isAvailable()) { + this.nameFinders.add(finder); + this.entityTypes.add(entry.getKey()); + } + } + this.entityTypes = Collections.unmodifiableSet(this.entityTypes); + this.available = nameFinders.size() > 0; //at least one finder is present + } + + @Override + public boolean isAvailable() { + return available; + } + + @Override + public Set<String> getEntityTypes() { + return entityTypes; + } + + @Override + public Map<String, Set<String>> recognise(String text) { + String[] tokens = OpenNLPNameFinder.tokenize(text); + Map<String, Set<String>> names = new HashMap<>(); + for (OpenNLPNameFinder finder : nameFinders) { + names.putAll(finder.findNames(tokens)); + } + return names; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */

package org.apache.tika.parser.ner.opennlp;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;
import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.ner.NERecogniser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * An implementation of {@link NERecogniser} that finds names in text using Open NLP Model.
 * This implementation works with only one entity type. For chaining several name finder
 * instances, see {@link OpenNLPNERecogniser}.
 * <p>
 * Both {@link #recognise(String)} and {@link #findNames(String[])} take the
 * instance lock because they mutate the state of the single wrapped
 * {@link NameFinderME}.
 */
public class OpenNLPNameFinder implements NERecogniser {

    private static final Logger LOG = LoggerFactory.getLogger(OpenNLPNameFinder.class);
    private final String nameType;
    private final Set<String> nameTypes;
    private final NameFinderME nameFinder;
    private final boolean available;

    /**
     * Creates OpenNLP name finder. When the model cannot be found or read the
     * instance is still created but reports itself as unavailable.
     * @param nameType the entity type recognised by the given NER model
     * @param nerModelPath path to ner model, resolved through the class loader
     */
    public OpenNLPNameFinder(String nameType, String nerModelPath) {
        this.nameTypes = Collections.singleton(nameType);
        this.nameType = nameType;
        NameFinderME finder = null;
        InputStream nerModelStream = getClass().getClassLoader().getResourceAsStream(nerModelPath);
        try {
            if (nerModelStream != null) {
                TokenNameFinderModel model = new TokenNameFinderModel(nerModelStream);
                finder = new NameFinderME(model);
            } else {
                LOG.warn("Couldn't find model from {} using class loader", nerModelPath);
            }
        } catch (IOException e) {
            LOG.error(e.getMessage(), e);
        } finally {
            IOUtils.closeQuietly(nerModelStream);
        }
        this.nameFinder = finder;
        this.available = finder != null;
        LOG.info("{} NER : Available for service ? {}", nameType, available);
    }

    /**
     * @return {@code true} when the model was loaded successfully
     */
    @Override
    public boolean isAvailable() {
        return available;
    }

    /**
     * @return singleton set holding the one entity type of this finder
     */
    @Override
    public Set<String> getEntityTypes() {
        return nameTypes;
    }

    /**
     * Splits text into tokens on whitespace after collapsing runs of whitespace.
     * @param text text to tokenize
     * @return array of tokens
     */
    public static String[] tokenize(String text) {
        //NOTE: replace this with a NLP tokenizer tool
        //clean + split
        return text.trim().replaceAll("(\\s\\s+)", " ").split("\\s");
    }

    /**
     * Tokenizes the text and finds the names in it.
     * @param text text which possibly contains names
     * @return map of EntityType -> set of entity names
     */
    @Override
    public synchronized Map<String, Set<String>> recognise(String text) {
        String[] tokens = tokenize(text);
        return findNames(tokens);
    }

    /**
     * finds names from given array of tokens
     * @param tokens the tokens array
     * @return map of EntityType -> set of entity names; empty when nothing was
     *         found or when this finder is not available
     */
    public synchronized Map<String, Set<String>> findNames(String[] tokens) {
        Map<String, Set<String>> result = new HashMap<>();
        if (!available) {
            // no model loaded, so there is no finder to ask
            return result;
        }
        Span[] nameSpans = nameFinder.find(tokens);
        String[] names = Span.spansToStrings(nameSpans, tokens);
        if (names != null && names.length > 0) {
            result.put(nameType, new HashSet<>(Arrays.asList(names)));
        }
        // forget document-level context so that calls stay independent
        nameFinder.clearAdaptiveData();
        return result;
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.ner.regex; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.parser.ner.NERecogniser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * This class offers an implementation of {@link NERecogniser} based on + * Regular Expressions. + *<p> + * The default configuration file {@value NER_REGEX_FILE} is used when no + * argument constructor is used to instantiate this class. The regex file is + * loaded via {@link Class#getResourceAsStream(String)}, so the file should be + * placed in the same package path as of this class. + * </p> + * The format of regex configuration as follows: + * <pre> + * ENTITY_TYPE1=REGEX1 + * ENTITY_TYPE2=REGEX2 + * </pre> + * + * <i>For example, to extract week day from text:</i> + * <pre>WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)? + * </pre> + * @since Nov. 
7, 2015 + */
public class RegexNERecogniser implements NERecogniser {

    public static final String NER_REGEX_FILE = "ner-regex.txt";
    private static final Logger LOG = LoggerFactory.getLogger(RegexNERecogniser.class);

    public Set<String> entityTypes = new HashSet<>();
    public Map<String, Pattern> patterns;
    private boolean available = false;

    private static RegexNERecogniser INSTANCE;

    /**
     * Creates a recogniser from the default configuration {@value #NER_REGEX_FILE},
     * looked up next to this class.
     */
    public RegexNERecogniser() {
        this(RegexNERecogniser.class.getResourceAsStream(NER_REGEX_FILE));
    }

    /**
     * Creates a recogniser from a configuration stream holding one
     * {@code TYPE=REGEX} entry per line; blank lines and lines starting with
     * {@code #} are ignored. The stream is always closed. Invalid entries are
     * logged and skipped without affecting the remaining ones.
     *
     * @param stream UTF-8 configuration; {@code null} yields a recogniser that
     *               is not available
     */
    public RegexNERecogniser(InputStream stream) {
        patterns = new HashMap<>();
        if (stream == null) {
            LOG.warn("No regex configuration stream given, recogniser is disabled");
        } else {
            try {
                List<String> lines = IOUtils.readLines(stream, StandardCharsets.UTF_8);
                for (String line : lines) {
                    addPattern(line);
                }
            } catch (Exception e) {
                LOG.error(e.getMessage(), e);
            } finally {
                // also release the stream when reading fails
                IOUtils.closeQuietly(stream);
            }
        }
        available = !entityTypes.isEmpty();
    }

    /**
     * Parses one configuration line and registers its pattern, if valid.
     */
    private void addPattern(String rawLine) {
        String line = rawLine.trim();
        if (line.isEmpty() || line.startsWith("#")) { //empty or comment
            return; //skip
        }

        // the first occurrence of '=' separates type from its regex
        int delim = line.indexOf('=');
        if (delim < 0) { //delim not found
            //skip
            LOG.error("Skipped : Invalid config : {} ", line);
            return;
        }
        String type = line.substring(0, delim).trim();
        String patternStr = line.substring(delim + 1).trim();
        try {
            patterns.put(type, Pattern.compile(patternStr));
            entityTypes.add(type);
        } catch (IllegalArgumentException e) {
            // PatternSyntaxException: drop this entry only, keep loading the rest
            LOG.error("Skipped : Invalid regex : {} ", line, e);
        }
    }

    /**
     * @return the lazily created shared instance built from the default configuration
     */
    public synchronized static RegexNERecogniser getInstance() {
        if (INSTANCE == null) {
            INSTANCE = new RegexNERecogniser();
        }
        return INSTANCE;
    }

    /**
     * @return {@code true} when at least one pattern was loaded
     */
    @Override
    public boolean isAvailable() {
        return available;
    }

    /**
     * @return the entity types for which a pattern was loaded
     */
    @Override
    public Set<String> getEntityTypes() {
        return entityTypes;
    }

    /**
     * finds matching sub groups in text
     * @param text text containing interesting sub strings
     * @param pattern pattern to find sub strings
     * @return set of sub strings if any found, or null if none found
     */
    public Set<String> findMatches(String text, Pattern pattern) {
        Set<String> results = null;
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            if (results == null) {
                // allocate only once there is something to report
                results = new HashSet<>();
            }
            results.add(matcher.group(0));
        }
        return results;
    }

    /**
     * Applies every configured pattern to the text.
     * @param text text which possibly contains names
     * @return map of entity type -> set of matches; types without a match are absent
     */
    @Override
    public Map<String, Set<String>> recognise(String text) {
        Map<String, Set<String>> result = new HashMap<>();
        for (Map.Entry<String, Pattern> entry : patterns.entrySet()) {
            Set<String> names = findMatches(text, entry.getValue());
            if (names != null) {
                result.put(entry.getKey(), names);
            }
        }
        return result;
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added) +++ tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016 @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +org.apache.tika.parser.crypto.Pkcs7Parser +#org.apache.tika.parser.ner.NamedEntityParser Added: tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt (added) +++ tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt Wed Jan 6 03:50:50 2016 @@ -0,0 +1,22 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# The pattern as follows +# type = regex +# the first occurrence of '=' separates type from its regex + +# WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)? 
\ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.tika.parser.crypto;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for {@link Pkcs7Parser}.
+ */
+public class Pkcs7ParserTest extends TikaTest {
+
+    /**
+     * Parses a detached PKCS#7 signature. The parser must never fail with a
+     * {@link NullPointerException}; if it rejects the file with a
+     * {@link TikaException}, the message has to say that detached
+     * signatures cannot be parsed.
+     */
+    // Without @Test JUnit 4 does not treat this method as a test at all.
+    @Test
+    public void testDetachedSignature() throws Exception {
+        try (InputStream input = Pkcs7ParserTest.class.getResourceAsStream(
+                "/test-documents/testDetached.p7s")) {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new Pkcs7Parser().parse(input, handler, metadata, new ParseContext());
+        } catch (NullPointerException npe) {
+            fail("should not get NPE");
+        } catch (TikaException te) {
+            assertTrue(te.toString().contains("cannot parse detached pkcs7 signature"));
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.apache.tika.parser.ner.regex.RegexNERecogniser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test case for {@link NamedEntityParser}
+ */
+public class NamedEntityParserTest {
+
+    public static final String CONFIG_FILE = "tika-config.xml";
+
+    /**
+     * Parses a short English text with the configuration from
+     * {@link #CONFIG_FILE} and checks that the parser ran and that one
+     * person, location, organization and date entity were recorded.
+     */
+    @Test
+    public void testParse() throws Exception {
+
+        //test config is added to resources directory
+        Tika tika = newTika();
+        String text = "I am student at University of Southern California (USC)," +
+                " located in Los Angeles . USC's football team is called by name Trojans." +
+                " Mr. John McKay was a head coach of the team from 1960 - 1975";
+        Metadata md = new Metadata();
+        // Fixed charset: the outcome must not depend on the platform default
+        tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+        assertTrue(valuesOf(md, "X-Parsed-By").contains(NamedEntityParser.class.getName()));
+        assertTrue(valuesOf(md, "NER_PERSON").contains("John McKay"));
+        assertTrue(valuesOf(md, "NER_LOCATION").contains("Los Angeles"));
+        assertTrue(valuesOf(md, "NER_ORGANIZATION").contains("University of Southern California"));
+        assertTrue(valuesOf(md, "NER_DATE").contains("1960 - 1975"));
+
+    }
+
+    /**
+     * Configures two recogniser implementations through the
+     * {@link NamedEntityParser#SYS_PROP_NER_IMPL} system property and checks
+     * that entity types from both of them end up in the metadata.
+     */
+    @Test
+    public void testNerChain() throws Exception {
+        String classNames = OpenNLPNERecogniser.class.getName()
+                + "," + RegexNERecogniser.class.getName();
+        // Remember the old value: system properties are JVM-wide, so leaving
+        // ours behind would change the behaviour of tests that run later.
+        String previous = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames);
+        try {
+            Tika tika = newTika();
+            String text = "University of Southern California (USC), is located in Los Angeles ." +
+                    " Campus is busy from monday to saturday";
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+            Set<String> keys = new HashSet<>(Arrays.asList(md.names()));
+            assertTrue(keys.contains("NER_WEEK_DAY"));
+            assertTrue(keys.contains("NER_LOCATION"));
+        } finally {
+            if (previous == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previous);
+            }
+        }
+
+    }
+
+    /**
+     * Builds a {@link Tika} facade from the test configuration and closes
+     * the configuration stream once it has been read.
+     */
+    private Tika newTika() throws Exception {
+        try (InputStream configStream = getClass().getResourceAsStream(CONFIG_FILE)) {
+            return new Tika(new TikaConfig(configStream));
+        }
+    }
+
+    /**
+     * Returns all values stored in the metadata under the given key as a set.
+     */
+    private static Set<String> valuesOf(Metadata md, String key) {
+        return new HashSet<>(Arrays.asList(md.getValues(key)));
+    }
+}
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.regex;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test case for {@link RegexNERecogniser}, driven through
+ * {@link NamedEntityParser}.
+ */
+public class RegexNERecogniserTest {
+
+    /**
+     * Selects {@link RegexNERecogniser} as the only recogniser and checks
+     * that exactly the three week days in the text are reported under
+     * {@code NER_WEEK_DAY}.
+     */
+    @Test
+    public void testGetEntityTypes() throws Exception {
+
+        String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday";
+        // Remember the old value: system properties are JVM-wide, so leaving
+        // ours behind would change the behaviour of tests that run later.
+        String previous = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName());
+
+        try (InputStream configStream =
+                     NamedEntityParser.class.getResourceAsStream("tika-config.xml")) {
+            Tika tika = new Tika(new TikaConfig(configStream));
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+            Set<String> days = new HashSet<>(Arrays.asList(md.getValues("NER_WEEK_DAY")));
+            assertTrue(days.contains("Sunday"));
+            assertTrue(days.contains("MONDAY"));
+            assertTrue(days.contains("Saturday"));
+            assertEquals(3, days.size()); //and nothing else
+        } finally {
+            if (previous == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previous);
+            }
+        }
+
+    }
+}
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-cad-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-cad-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-cad-module/pom.xml Wed Jan 6 03:50:50 2016
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+  license agreements. See the NOTICE file distributed with this work for additional
+  information regarding copyright ownership.
The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-cad-module</artifactId> + <name>Apache Tika CAD Module</name> + <url>http://tika.apache.org/</url> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi</artifactId> + <version>${poi.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-text-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file Added: 
tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.tika.parser.dwg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.StringUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * DWG (CAD Drawing) parser. This is a very basic parser, which just
+ * looks for bits of the headers.
+ * Note that we use Apache POI for various parts of the processing, as
+ * lots of the low level string/int/short concepts are the same.
+ */
+public class DWGParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -7744232583079169119L;
+
+    /** The single media type handled by this parser */
+    private static final MediaType TYPE = MediaType.image("vnd.dwg");
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    /** The order of the fields in the header */
+    private static final Property[] HEADER_PROPERTIES_ENTRIES = {
+        TikaCoreProperties.TITLE,
+        TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+        TikaCoreProperties.CREATOR,
+        TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,
+        TikaCoreProperties.COMMENTS,
+        TikaCoreProperties.MODIFIER,
+        null, // Unknown?
+        TikaCoreProperties.RELATION, // Hyperlink
+    };
+
+    /** For the 2000 file, they're indexed */
+    private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
+        null,
+        TikaCoreProperties.RELATION, // 0x01
+        TikaCoreProperties.TITLE, // 0x02
+        TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, // 0x03
+        TikaCoreProperties.CREATOR, // 0x04
+        null,
+        TikaCoreProperties.COMMENTS,// 0x06
+        TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, // 0x07
+        TikaCoreProperties.MODIFIER, // 0x08
+    };
+
+    private static final String HEADER_2000_PROPERTIES_MARKER_STR =
+        "DWGPROPS COOKIE";
+
+    private static final byte[] HEADER_2000_PROPERTIES_MARKER =
+        new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
+
+    static {
+        StringUtil.putCompressedUnicode(
+                HEADER_2000_PROPERTIES_MARKER_STR,
+                HEADER_2000_PROPERTIES_MARKER, 0);
+    }
+
+    /**
+     * How far to skip after the last standard property, before
+     * we find any custom properties that might be there.
+     */
+    private static final int CUSTOM_PROPERTIES_SKIP = 20;
+
+    /**
+     * The value of padding bytes other than 0 in some DWG files.
+     */
+    private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0};
+
+    /**
+     * Reads the 128 byte file header, picks the property reader matching the
+     * version string in its first six bytes, and copies the properties found
+     * into the metadata and, as paragraphs, into the XHTML output.
+     *
+     * @throws TikaException if the version string is not one of
+     *         AC1015, AC1018, AC1021 or AC1024
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, TikaException, SAXException {
+        // First up, which version of the format are we handling?
+        byte[] header = new byte[128];
+        IOUtils.readFully(stream, header);
+        String version = new String(header, 0, 6, StandardCharsets.US_ASCII);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        if (version.equals("AC1015")) {
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            if (skipTo2000PropertyInfoSection(stream, header)) {
+                get2000Props(stream, metadata, xhtml);
+            }
+        } else if (version.equals("AC1018")) {
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            if (skipToPropertyInfoSection(stream, header)) {
+                get2004Props(stream, metadata, xhtml);
+            }
+        } else if (version.equals("AC1021") || version.equals("AC1024")) {
+            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+            if (skipToPropertyInfoSection(stream, header)) {
+                get2007and2010Props(stream, metadata, xhtml);
+            }
+        } else {
+            throw new TikaException(
+                    "Unsupported AutoCAD drawing version: " + version);
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Stored as US-ASCII
+     */
+    private void get2004Props(
+            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, TikaException, SAXException {
+        // Standard properties
+        for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
+            String headerValue = read2004String(stream);
+            handleHeader(i, headerValue, metadata, xhtml);
+        }
+
+        // Custom properties
+        int customCount = skipToCustomProperties(stream);
+        for (int i = 0; i < customCount; i++) {
+            String propName = read2004String(stream);
+            String propValue = read2004String(stream);
+            if (propName.length() > 0 && propValue.length() > 0) {
+                metadata.add(propName, propValue);
+            }
+        }
+    }
+
+    /**
+     * Reads a string made of a 16 bit little endian length followed by
+     * that many single byte characters.
+     *
+     * @return the string without its trailing null, empty if the length is 0
+     */
+    private String read2004String(InputStream stream) throws IOException, TikaException {
+        int stringLen = EndianUtils.readUShortLE(stream);
+
+        byte[] stringData = new byte[stringLen];
+        IOUtils.readFully(stream, stringData);
+
+        // Often but not always null terminated. A zero length string has
+        // no last byte to look at, so guard the index.
+        if (stringLen > 0 && stringData[stringLen - 1] == 0) {
+            stringLen--;
+        }
+        return StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
+    }
+
+    /**
+     * Stored as UCS2, so 16 bit "unicode"
+     */
+    private void get2007and2010Props(
+            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, TikaException, SAXException {
+        // Standard properties
+        for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
+            String headerValue = read2007and2010String(stream);
+            handleHeader(i, headerValue, metadata, xhtml);
+        }
+
+        // Custom properties
+        int customCount = skipToCustomProperties(stream);
+        for (int i = 0; i < customCount; i++) {
+            String propName = read2007and2010String(stream);
+            String propValue = read2007and2010String(stream);
+            if (propName.length() > 0 && propValue.length() > 0) {
+                metadata.add(propName, propValue);
+            }
+        }
+    }
+
+    /**
+     * Reads a string made of a 16 bit little endian character count
+     * followed by that many 16 bit little endian characters.
+     *
+     * @return the string without its trailing null, empty if the count is 0
+     */
+    private String read2007and2010String(InputStream stream) throws IOException, TikaException {
+        int stringLen = EndianUtils.readUShortLE(stream);
+
+        byte[] stringData = new byte[stringLen * 2];
+        IOUtils.readFully(stream, stringData);
+        String value = StringUtil.getFromUnicodeLE(stringData);
+
+        // Some strings are null terminated. An empty string has no last
+        // character to look at, so guard the index.
+        if (!value.isEmpty() && value.charAt(value.length() - 1) == 0) {
+            value = value.substring(0, value.length() - 1);
+        }
+
+        return value;
+    }
+
+    /**
+     * Reads up to 30 indexed property records. Each record is a 16 bit
+     * index, a 16 bit length, a one byte value type and the value itself.
+     * Only values of type 0x1e are used; index 0x012c holds a custom
+     * "name=value" pair.
+     */
+    private void get2000Props(
+            InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, TikaException, SAXException {
+        int propCount = 0;
+        while (propCount < 30) {
+            int propIdx = EndianUtils.readUShortLE(stream);
+            int length = EndianUtils.readUShortLE(stream);
+            int valueType = stream.read();
+
+            if (propIdx == 0x28) {
+                // This one seems not to follow the pattern
+                length = 0x19;
+            } else if (propIdx == 90) {
+                // We think this means the end of properties
+                break;
+            }
+
+            byte[] value = new byte[length];
+            IOUtils.readFully(stream, value);
+            if (valueType == 0x1e) {
+                // Normal string, good
+                String val = StringUtil.getFromCompressedUnicode(value, 0, length);
+
+                // Is it one we can look up by index?
+                if (propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
+                    // Some slots of the table are null placeholders for
+                    // entries we don't understand; those have no metadata
+                    // key, but their text is still worth outputting.
+                    Property headerProp = HEADER_2000_PROPERTIES_ENTRIES[propIdx];
+                    if (headerProp != null) {
+                        metadata.add(headerProp, val);
+                    }
+                    xhtml.element("p", val);
+                } else if (propIdx == 0x012c) {
+                    int splitAt = val.indexOf('=');
+                    if (splitAt > -1) {
+                        String propName = val.substring(0, splitAt);
+                        String propVal = val.substring(splitAt + 1);
+                        metadata.add(propName, propVal);
+                    }
+                }
+            } else {
+                // No idea...
+            }
+
+            propCount++;
+        }
+    }
+
+    /**
+     * Stores one standard header value. Empty values are ignored; values
+     * whose slot in {@link #HEADER_PROPERTIES_ENTRIES} is null are only
+     * written to the XHTML output.
+     */
+    private void handleHeader(
+            int headerNumber, String value, Metadata metadata,
+            XHTMLContentHandler xhtml) throws SAXException {
+        if (value == null || value.length() == 0) {
+            return;
+        }
+
+        Property headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
+        if (headerProp != null) {
+            metadata.set(headerProp, value);
+        }
+
+        xhtml.element("p", value);
+    }
+
+    /**
+     * Grab the offset, then skip there
+     *
+     * @return false if the header holds no usable offset, true once the
+     *         stream has been advanced towards the section
+     */
+    private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
+            throws IOException, TikaException {
+        // The offset is stored in the header from 0x20 onwards
+        long offsetToSection = EndianUtils.getLongLE(header, 0x20);
+
+        // Sanity check the offset. Some files seem to use a different format,
+        // and the offset isn't available at 0x20. Until we can work out how
+        // to find the offset in those files, skip them if detected
+        if (offsetToSection < 0 || offsetToSection > 0xa00000L) {
+            // Header should never be more than 10mb into the file (and a
+            // negative offset is never valid), something is wrong
+            offsetToSection = 0;
+        }
+        if (offsetToSection == 0) {
+            return false;
+        }
+
+        // Work out how far to skip; the header itself is already consumed
+        long toSkip = offsetToSection - header.length;
+        while (toSkip > 0) {
+            byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
+            IOUtils.readFully(stream, skip);
+            toSkip -= skip.length;
+        }
+        return true;
+    }
+
+    /**
+     * We think it can be anywhere...
+     * Scans forward byte by byte until the whole
+     * {@link #HEADER_2000_PROPERTIES_MARKER} has been read.
+     *
+     * @return true if the marker was found (the stream is then positioned
+     *         just after it), false if the end of the stream was reached
+     */
+    private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
+            throws IOException {
+        int matched = 0;
+        int val;
+        while ((val = stream.read()) != -1) {
+            if (val == HEADER_2000_PROPERTIES_MARKER[matched]) {
+                matched++;
+                if (matched == HEADER_2000_PROPERTIES_MARKER.length) {
+                    // Bingo, found it
+                    return true;
+                }
+            } else {
+                // The byte that broke the match may itself start the marker.
+                // As the first marker byte does not occur again inside the
+                // marker, restarting at 0 or 1 is all that is needed.
+                matched = (val == HEADER_2000_PROPERTIES_MARKER[0]) ? 1 : 0;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Checks for the padding that precedes the custom properties and, if
+     * it is there, skips on to them.
+     *
+     * @return the number of custom properties, or 0 if there are none or
+     *         the count cannot be trusted
+     */
+    private int skipToCustomProperties(InputStream stream)
+            throws IOException, TikaException {
+        // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
+        byte[] padding = new byte[4];
+        IOUtils.readFully(stream, padding);
+
+        boolean zeroPadding =
+                padding[0] == 0 && padding[1] == 0 &&
+                padding[2] == 0 && padding[3] == 0;
+        boolean altPadding =
+                padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] &&
+                padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] &&
+                padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] &&
+                padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3];
+        if (!zeroPadding && !altPadding) {
+            // No padding. That probably means no custom props
+            return 0;
+        }
+
+        // Looks hopeful, skip on
+        padding = new byte[CUSTOM_PROPERTIES_SKIP];
+        IOUtils.readFully(stream, padding);
+
+        // We should now have the count
+        int count = EndianUtils.readUShortLE(stream);
+
+        // Sanity check it
+        if (count > 0 && count < 0x7f) {
+            // Looks plausible
+            return count;
+        }
+        // No properties / count is too high to trust
+        return 0;
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+/**
+ * A basic text extracting parser for the CADKey PRT (CAD Drawing)
+ * format. It outputs text from note entries.
+ */
+
+public class PRTParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 4659638314375035178L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
+    public static final String PRT_MIME_TYPE = "application/x-prt";
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * How long do we allow a text run to claim to be, before we
+     * decide we're confused and it's not really text after all?
+     */
+    private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
+
+    /*
+     * Text types:
+     * 00 00 00 00 f0 [3b]f sz sz TEXT *view name*
+     * 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name*
+     * (anything) e0 3f sz sz TEXT *view name*
+     * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries*
+     *
+     * Note - all text is null terminated
+     */
+
+    /**
+     * Reads the creation date and the description from the start of the
+     * file into the metadata, then scans the rest of the stream for the
+     * byte patterns that introduce view names and note text, and outputs
+     * every text run found as a paragraph.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+            ParseContext context) throws IOException, SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        // Open the document explicitly (and close it below) so the handler
+        // sees a complete document even when no text at all is found
+        xhtml.startDocument();
+        Last5 l5 = new Last5();
+        int read;
+
+        // Try to get the creation date, which is YYYYMMDDhhmm
+        byte[] header = new byte[30];
+        IOUtils.readFully(stream, header);
+        byte[] date = new byte[12];
+        IOUtils.readFully(stream, date);
+
+        String dateStr = new String(date, US_ASCII);
+        if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
+            String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) +
+                "-" + dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" +
+                dateStr.substring(10, 12) + ":00";
+            metadata.set(TikaCoreProperties.CREATED, formattedDate);
+            // TODO Metadata.DATE is used as modified, should it be here?
+            metadata.set(Metadata.DATE, formattedDate);
+        }
+        metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+
+        // The description, if set, is the next up-to-500 bytes
+        byte[] desc = new byte[500];
+        IOUtils.readFully(stream, desc);
+        String description = extractText(desc, true);
+        if (description.length() > 0) {
+            metadata.set(TikaCoreProperties.DESCRIPTION, description);
+        }
+
+        // Now look for text
+        while ((read = stream.read()) > -1) {
+            if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
+                int nread = stream.read();
+                if (nread == 0x3f || nread == 0xbf) {
+                    // Looks promising, check back for a suitable value
+                    if (read == 0xe3 && nread == 0x3f) {
+                        if (l5.is33()) {
+                            // Bingo, note text
+                            handleNoteText(stream, xhtml);
+                        }
+                    } else if (l5.is00()) {
+                        // Likely view name
+                        handleViewName(read, nread, stream, xhtml, l5);
+                    }
+                }
+            } else {
+                l5.record(read);
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Checks the ten padding bytes and the 0x1f marker that come before
+     * note text and, if they look right, outputs the text that follows.
+     */
+    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        // Ensure we have the right padding text
+        int read;
+        for (int i = 0; i < 10; i++) {
+            read = stream.read();
+            if (read >= 0 && read <= 0x0f) {
+                // Promising
+            } else {
+                // Wrong, false detection
+                return;
+            }
+        }
+        read = stream.read();
+        if (read != 0x1f) {
+            // Wrong, false detection
+            return;
+        }
+
+        int length = EndianUtils.readUShortLE(stream);
+        if (length <= MAX_SANE_TEXT_LENGTH) {
+            // Length sanity check passed
+            handleText(length, stream, xhtml);
+        }
+    }
+
+    /**
+     * Outputs a view name. The length either follows directly, or comes
+     * after 8 bytes of (near) zero padding.
+     */
+    private void handleViewName(int typeA, int typeB, InputStream stream,
+            XHTMLContentHandler xhtml, Last5 l5)
+            throws IOException, SAXException, TikaException {
+        // Is it 8 byte zero padded?
+        int maybeLength = EndianUtils.readUShortLE(stream);
+        if (maybeLength == 0) {
+            // Check the next 6 bytes too
+            for (int i = 0; i < 6; i++) {
+                int read = stream.read();
+                if (read >= 0 && read <= 0x0f) {
+                    // Promising
+                } else {
+                    // Wrong, false detection
+                    return;
+                }
+            }
+
+            byte[] b2 = new byte[2];
+            IOUtils.readFully(stream, b2);
+            int length = EndianUtils.getUShortLE(b2);
+            if (length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
+                // Length sanity check passed
+                handleText(length, stream, xhtml);
+            } else {
+                // Was probably something else
+                l5.record(b2[0]);
+                l5.record(b2[1]);
+            }
+        } else if (maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
+            // Looks like it's straight into the text
+            handleText(maybeLength, stream, xhtml);
+        }
+    }
+
+    /**
+     * Reads a text run of the given length and outputs it as a paragraph,
+     * provided that it ends with the expected null terminator.
+     */
+    private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        if (length < 1) {
+            // Not even room for the null terminator, so nothing to output
+            return;
+        }
+
+        byte[] str = new byte[length];
+        IOUtils.readFully(stream, str);
+        if (str[length - 1] != 0) {
+            // Not properly null terminated, must be wrong
+            return;
+        }
+
+        String text = extractText(str, false);
+
+        xhtml.startElement("p");
+        xhtml.characters(text);
+        xhtml.endElement("p");
+    }
+
+    /**
+     * Does our best to turn the bytes into text
+     *
+     * @param data the raw bytes, including the null terminator
+     * @param trim if true the text stops at the first null byte, otherwise
+     *        only the final byte is dropped
+     */
+    private String extractText(byte[] data, boolean trim) throws TikaException {
+        // The text is always stored null terminated, but sometimes
+        // may have extra null padding too
+        int length = Math.max(0, data.length - 1);
+        if (trim) {
+            for (int i = 0; i < data.length; i++) {
+                if (data[i] == 0) {
+                    length = i;
+                    break;
+                }
+            }
+        }
+
+        // We believe that the text is basically stored as CP437
+        // That said, there are a few characters slightly wrong for that...
+        String text;
+        try {
+            text = new String(data, 0, length, "cp437");
+        } catch (UnsupportedEncodingException e) {
+            throw new TikaException("JVM Broken, core codepage CP437 missing!", e);
+        }
+
+        // Fix up the known character issues
+        text = text.replace("\u03C6", "\u00D8");
+
+        // All done, as best as we can!
+        return text;
+    }
+
+    /**
+     * Provides a view on the previous 5 bytes
+     */
+    private static class Last5 {
+        /** Ring buffer holding the five most recently recorded bytes */
+        private final byte[] data = new byte[5];
+        /** Slot that the next recorded byte is written to */
+        private int pos = 0;
+
+        private void record(int b) {
+            data[pos] = (byte) b;
+            pos = (pos + 1) % data.length;
+        }
+
+        /** Are all of the last five bytes 0x33? */
+        private boolean is33() {
+            return allEqual((byte) 0x33);
+        }
+
+        /** Are all of the last five bytes 0x00? */
+        private boolean is00() {
+            return allEqual((byte) 0x00);
+        }
+
+        // The order of the bytes is irrelevant for an all-equal check, so
+        // the ring buffer can be scanned directly
+        private boolean allEqual(byte expected) {
+            for (byte b : data) {
+                if (b != expected) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-cad-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +org.apache.tika.parser.dwg.DWGParser +#org.apache.tika.parser.prt.PRTParser Added: tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.tika.parser.dwg;

import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;

import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Runs {@link DWGParser} over the sample drawings under
 * {@code /test-documents} and checks the extracted metadata and text.
 */
public class DWGParserTest {

    @Test
    public void testDWG2000Parser() throws Exception {
        testParserAlt(open("testDWG2000.dwg"));
    }

    @Test
    public void testDWG2004Parser() throws Exception {
        testParser(open("testDWG2004.dwg"));
    }

    @Test
    public void testDWG2004ParserNoHeaderAddress() throws Exception {
        testParserNoHeader(open("testDWG2004_no_header.dwg"));
    }

    @Test
    public void testDWG2007Parser() throws Exception {
        testParser(open("testDWG2007.dwg"));
    }

    @Test
    public void testDWG2010Parser() throws Exception {
        testParser(open("testDWG2010.dwg"));
    }

    @Test
    public void testDWG2010CustomPropertiesParser() throws Exception {
        // Check that standard parsing works
        testParser(open("testDWG2010_custom_props.dwg"));

        // Check that custom properties with alternate padding work
        try (InputStream input = open("testDWG2010_custom_props.dwg")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new DWGParser().parse(input, handler, metadata, null);

            assertEquals("valueforcustomprop1",
                    metadata.get("customprop1"));
            assertEquals("valueforcustomprop2",
                    metadata.get("customprop2"));
        }
    }

    @Test
    public void testDWGMechParser() throws Exception {
        String[] types = new String[] {
                "6", "2004", "2004DX", "2005", "2006",
                "2007", "2008", "2009", "2010", "2011"
        };
        for (String type : types) {
            testParserAlt(open("testDWGmech" + type + ".dwg"));
        }
    }

    /**
     * Opens a test drawing from {@code /test-documents}.
     * Fails the test with the file name when the resource is missing,
     * instead of letting a null stream cause an anonymous
     * NullPointerException further down.
     *
     * @param fileName name of the file below {@code /test-documents/}
     * @return an open stream that the caller must close
     */
    private static InputStream open(String fileName) {
        InputStream stream = DWGParserTest.class.getResourceAsStream(
                "/test-documents/" + fileName);
        assertNotNull("Test document not found: " + fileName, stream);
        return stream;
    }

    /**
     * Parses a drawing that carries the "quick brown fox" sample
     * properties and checks metadata and body text. Closes the stream.
     */
    @SuppressWarnings("deprecation")
    private void testParser(InputStream input) throws Exception {
        try (InputStream in = input) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new DWGParser().parse(in, handler, metadata);

            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));

            assertEquals("The quick brown fox jumps over the lazy dog",
                    metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Gym class featuring a brown fox and lazy dog",
                    metadata.get(TikaCoreProperties.DESCRIPTION));
            assertEquals("Gym class featuring a brown fox and lazy dog",
                    metadata.get(Metadata.SUBJECT));
            assertEquals("Nevin Nollop",
                    metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Pangram, fox, dog",
                    metadata.get(TikaCoreProperties.KEYWORDS));
            assertEquals("Lorem ipsum",
                    metadata.get(TikaCoreProperties.COMMENTS).substring(0, 11));
            assertEquals("http://www.alfresco.com",
                    metadata.get(TikaCoreProperties.RELATION));

            // Check some of the old style metadata too
            assertEquals("The quick brown fox jumps over the lazy dog",
                    metadata.get(Metadata.TITLE));
            assertEquals("Gym class featuring a brown fox and lazy dog",
                    metadata.get(Metadata.SUBJECT));

            String content = handler.toString();
            assertContains("The quick brown fox jumps over the lazy dog", content);
            assertContains("Gym class", content);
            assertContains("www.alfresco.com", content);
        }
    }

    /**
     * Parses a drawing from which no summary properties are expected:
     * only the content type may be set and the body must be empty.
     * Closes the stream.
     */
    @SuppressWarnings("deprecation")
    private void testParserNoHeader(InputStream input) throws Exception {
        try (InputStream in = input) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new DWGParser().parse(in, handler, metadata);

            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));

            assertNull(metadata.get(TikaCoreProperties.TITLE));
            assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
            assertNull(metadata.get(Metadata.SUBJECT));
            assertNull(metadata.get(TikaCoreProperties.CREATOR));
            assertNull(metadata.get(TikaCoreProperties.KEYWORDS));
            assertNull(metadata.get(TikaCoreProperties.COMMENTS));
            assertNull(metadata.get(TikaCoreProperties.RELATION));

            String content = handler.toString();
            assertEquals("", content);
        }
    }

    /**
     * Parses a drawing that carries the "Test Title" sample properties
     * and checks metadata and body text. Closes the stream.
     */
    @SuppressWarnings("deprecation")
    private void testParserAlt(InputStream input) throws Exception {
        try (InputStream in = input) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new DWGParser().parse(in, handler, metadata);

            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));

            assertEquals("Test Title",
                    metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Test Subject",
                    metadata.get(TikaCoreProperties.DESCRIPTION));
            assertEquals("Test Subject",
                    metadata.get(Metadata.SUBJECT));
            assertEquals("My Author",
                    metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("My keyword1, MyKeyword2",
                    metadata.get(TikaCoreProperties.KEYWORDS));
            assertEquals("This is a comment",
                    metadata.get(TikaCoreProperties.COMMENTS));
            assertEquals("bejanpol",
                    metadata.get(TikaCoreProperties.MODIFIER));
            assertEquals("bejanpol",
                    metadata.get(Metadata.LAST_AUTHOR));
            assertEquals("http://mycompany/drawings",
                    metadata.get(TikaCoreProperties.RELATION));
            assertEquals("MyCustomPropertyValue",
                    metadata.get("MyCustomProperty"));

            String content = handler.toString();
            assertContains("This is a comment", content);
            assertContains("mycompany", content);
        }
    }
}
package org.apache.tika.parser.prt;

import static org.junit.Assert.assertEquals;

import java.io.InputStream;

import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Runs {@link PRTParser} over two sample CADKEY files and checks the
 * metadata and text it extracts.
 */
public class PRTParserTest extends TikaTest {
    /**
     * Try with a simple file
     */
    @Test
    public void testPRTParserBasics() throws Exception {
        Metadata metadata = new Metadata();
        String contents = parsePrtResource("/test-documents/testCADKEY.prt", metadata);

        assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));

        // This file has a date
        assertEquals("2011-06-20T16:54:00",
                metadata.get(TikaCoreProperties.CREATED));
        assertEquals("2011-06-20T16:54:00",
                metadata.get(Metadata.CREATION_DATE));
        // But no description
        assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));

        String[] expectedPhrases = {
                "Front View",
                "Back View",
                "Bottom View",
                "Right View",
                "Left View",
                //"Isometric View", // Can't detect yet
                "Axonometric View",

                "You've managed to extract all the text!",
                "This is more text",
                "Text Inside a PRT file"
        };
        for (String phrase : expectedPhrases) {
            assertContains(phrase, contents);
        }
    }

    /**
     * Now a more complex one
     */
    @Test
    public void testPRTParserComplex() throws Exception {
        Metadata metadata = new Metadata();
        String contents = parsePrtResource("/test-documents/testCADKEY2.prt", metadata);

        assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));

        // File has both a date and a description
        assertEquals("1997-04-01T08:59:00",
                metadata.get(Metadata.DATE));
        assertEquals("1997-04-01T08:59:00",
                metadata.get(Metadata.CREATION_DATE));
        assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
                metadata.get(TikaCoreProperties.DESCRIPTION));

        String[] expectedPhrases = {
                "ITEM",
                "REQ.",
                "DESCRIPTION",
                "MAT'L",
                "TOLERANCES UNLESS",
                "FRACTIONS",
                "ANGLES",
                "Acme Corporation",

                "DATE",
                "CHANGE",
                "DRAWN BY",
                "SCALE",
                "TIKA TEST DRAWING",
                "TIKA LETTERS",
                "5.82",
                "112" + '\u00b0', // Degrees
                "TIKA TEST LETTER",
                "17.11",
                '\u00d8' + "\ufffd2.000", // Diameter
                "Diameter",
                "The Apache Tika toolkit"
        };
        for (String phrase : expectedPhrases) {
            assertContains(phrase, contents);
        }
    }

    /**
     * Parses the named test resource with a fresh {@link PRTParser}.
     *
     * @param resource classpath location of the file to parse
     * @param metadata receives whatever metadata the parser sets
     * @return the text collected by a {@link BodyContentHandler}
     */
    private String parsePrtResource(String resource, Metadata metadata) throws Exception {
        try (InputStream stream = getResourceAsStream(resource)) {
            ContentHandler body = new BodyContentHandler();
            new PRTParser().parse(stream, body, metadata);
            return body.toString();
        }
    }
}
