Added: tika/branches/2.x/tika-parser-modules/tika-code-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/pom.xml?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-code-module/pom.xml (added) +++ tika/branches/2.x/tika-parser-modules/tika-code-module/pom.xml Wed Jan 6 03:50:50 2016 @@ -0,0 +1,82 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. 
--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-code-module</artifactId> + <name>Apache Tika Code Module</name> + <url>http://tika.apache.org/</url> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.ow2.asm</groupId> + <artifactId>asm</artifactId> + <version>5.0.4</version> + </dependency> + <dependency> + <groupId>org.codelibs</groupId> + <artifactId>jhighlight</artifactId> + <version>1.0.2</version> + </dependency> + <dependency> + <groupId>org.ccil.cowan.tagsoup</groupId> + <artifactId>tagsoup</artifactId> + <version>1.2.1</version> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>${commons.io.version}</version> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi</artifactId> + <version>${poi.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-text-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.asm; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Parser for Java .class files. 
+ */
public class ClassParser extends AbstractParser {

    /** Serial version UID */
    private static final long serialVersionUID = -3531388963354454357L;

    /** The single media type handled here: compiled Java class files. */
    private static final Set<MediaType> CLASS_FILE_TYPES =
            Collections.singleton(MediaType.application("java-vm"));

    /**
     * Returns the media types this parser handles.
     *
     * @param context parse context (not consulted)
     * @return an immutable singleton set containing {@code application/java-vm}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return CLASS_FILE_TYPES;
    }

    /**
     * Parses a class file by handing the stream to an {@link XHTMLClassVisitor}
     * bound to the given handler and metadata.
     *
     * @param stream   the class file bytes
     * @param handler  receiver of the generated SAX events
     * @param metadata metadata object handed to the visitor
     * @param context  parse context (not consulted)
     * @throws IOException   propagated from the visitor
     * @throws SAXException  propagated from the visitor
     * @throws TikaException propagated from the visitor
     */
    @Override
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        XHTMLClassVisitor visitor = new XHTMLClassVisitor(handler, metadata);
        visitor.parse(stream);
    }

}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,323 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.asm; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.sax.XHTMLContentHandler; +import org.objectweb.asm.AnnotationVisitor; +import org.objectweb.asm.Attribute; +import org.objectweb.asm.ClassReader; +import org.objectweb.asm.ClassVisitor; +import org.objectweb.asm.FieldVisitor; +import org.objectweb.asm.MethodVisitor; +import org.objectweb.asm.Opcodes; +import org.objectweb.asm.Type; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +
/**
 * Class visitor that generates XHTML SAX events to describe the
 * contents of the visited class.
 * <p>
 * The output is a single {@code <pre>} element holding a Java-like outline:
 * package statement, class header, non-synthetic fields and non-synthetic
 * method signatures. SAX failures raised inside the visitor callbacks are
 * wrapped in a {@link RuntimeException} and unwrapped again by
 * {@link #parse(InputStream)}.
 */
class XHTMLClassVisitor extends ClassVisitor {

    /** Visibility bits; these mean the same thing on classes, fields and methods. */
    private static final int VISIBILITY_FLAGS =
            Opcodes.ACC_PUBLIC | Opcodes.ACC_PROTECTED | Opcodes.ACC_PRIVATE;

    /**
     * Access bits that are source-level modifiers on a class. Bit 0x0020 is
     * deliberately excluded: on a class it is ACC_SUPER, not "synchronized".
     */
    private static final int CLASS_MODIFIERS = VISIBILITY_FLAGS
            | Opcodes.ACC_STATIC | Opcodes.ACC_FINAL | Opcodes.ACC_ABSTRACT;

    /** Access bits that are source-level modifiers on a field. */
    private static final int FIELD_MODIFIERS = VISIBILITY_FLAGS
            | Opcodes.ACC_STATIC | Opcodes.ACC_FINAL
            | Opcodes.ACC_TRANSIENT | Opcodes.ACC_VOLATILE;

    /**
     * Access bits that are source-level modifiers on a method. Bits 0x0040 and
     * 0x0080 are excluded: on a method they are ACC_BRIDGE and ACC_VARARGS,
     * not "volatile" and "transient".
     */
    private static final int METHOD_MODIFIERS = VISIBILITY_FLAGS
            | Opcodes.ACC_STATIC | Opcodes.ACC_FINAL | Opcodes.ACC_ABSTRACT
            | Opcodes.ACC_SYNCHRONIZED | Opcodes.ACC_NATIVE;

    private final XHTMLContentHandler xhtml;

    private final Metadata metadata;

    /** Type of the class being visited; set by {@link #visit}. */
    private Type type;

    /** Package of the visited class, or {@code null} for the default package. */
    private String packageName;

    /**
     * Creates a visitor that writes to the given handler.
     *
     * @param handler  receiver of the generated SAX events
     * @param metadata metadata that receives the class title and resource name
     */
    public XHTMLClassVisitor(ContentHandler handler, Metadata metadata) {
        super(Opcodes.ASM5);
        this.xhtml = new XHTMLContentHandler(handler, metadata);
        this.metadata = metadata;
    }

    /**
     * Reads a class file from the stream and visits it with this visitor,
     * skipping stack map frames and method code.
     *
     * @param stream the class file bytes
     * @throws SAXException  if a visitor callback failed while emitting SAX events
     * @throws TikaException if any other runtime failure occurred while reading the class
     * @throws IOException   if the stream could not be read
     */
    public void parse(InputStream stream)
            throws TikaException, SAXException, IOException {
        try {
            ClassReader reader = new ClassReader(stream);
            reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
        } catch (RuntimeException e) {
            // The visitor callbacks cannot declare SAXException, so they
            // tunnel it as the cause of a RuntimeException.
            if (e.getCause() instanceof SAXException) {
                throw (SAXException) e.getCause();
            }
            throw new TikaException("Failed to parse a Java class", e);
        }
    }

    /**
     * Starts the document and writes the package statement and class header.
     * Also records the simple class name as title and resource name metadata.
     */
    @Override
    public void visit(
            int version, int access, String name, String signature,
            String superName, String[] interfaces) {
        type = Type.getObjectType(name);

        String className = type.getClassName();
        int dot = className.lastIndexOf('.');
        if (dot != -1) {
            packageName = className.substring(0, dot);
            className = className.substring(dot + 1);
        }

        metadata.set(TikaCoreProperties.TITLE, className);
        metadata.set(Metadata.RESOURCE_NAME_KEY, className + ".class");

        try {
            xhtml.startDocument();
            xhtml.startElement("pre");

            if (packageName != null) {
                writeKeyword("package");
                xhtml.characters(" " + packageName + ";\n");
            }

            // Only bits that are real class modifiers; see CLASS_MODIFIERS.
            writeAccess(access & CLASS_MODIFIERS);

            boolean isInterface = isSet(access, Opcodes.ACC_INTERFACE);
            boolean isEnum = !isInterface && isSet(access, Opcodes.ACC_ENUM);
            writeKeyword(isInterface ? "interface" : (isEnum ? "enum" : "class"));
            writeSpace();
            writeType(type);
            writeSpace();

            if (isInterface) {
                writeInterfaces("extends", interfaces);
            } else if (!isEnum) {
                if (superName != null) {
                    Type superType = Type.getObjectType(superName);
                    if (!superType.getClassName().equals("java.lang.Object")) {
                        writeKeyword("extends");
                        writeSpace();
                        writeType(superType);
                        writeSpace();
                    }
                }
                writeInterfaces("implements", interfaces);
            }
            xhtml.characters("{\n");
        } catch (SAXException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Writes {@code keyword} followed by a comma separated list of the given
     * interfaces and a trailing space; writes nothing when the list is empty.
     */
    private void writeInterfaces(String keyword, String[] interfaces)
            throws SAXException {
        if (interfaces != null && interfaces.length > 0) {
            writeKeyword(keyword);
            String separator = " ";
            for (String iface : interfaces) {
                xhtml.characters(separator);
                writeType(Type.getObjectType(iface));
                separator = ", ";
            }
            writeSpace();
        }
    }

    /**
     * Closes the class body, the {@code <pre>} element and the document.
     */
    @Override
    public void visitEnd() {
        try {
            xhtml.characters("}\n");
            xhtml.endElement("pre");
            xhtml.endDocument();
        } catch (SAXException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Ignored.
     */
    @Override
    public void visitOuterClass(String owner, String name, String desc) {
    }

    /**
     * Ignored.
     */
    @Override
    public void visitSource(String source, String debug) {
    }

    /**
     * Ignored.
     */
    @Override
    public AnnotationVisitor visitAnnotation(String desc, boolean visible) {
        return null;
    }

    /**
     * Ignored.
     */
    @Override
    public void visitAttribute(Attribute attr) {
    }

    /**
     * Ignored.
     */
    @Override
    public void visitInnerClass(
            String name, String outerName, String innerName, int access) {
    }

    /**
     * Visits a field and writes its declaration, including the constant value
     * of a static field when one is present. Synthetic fields are skipped.
     *
     * @return always {@code null}; the field's own contents are not visited
     */
    @Override
    public FieldVisitor visitField(
            int access, String name, String desc, String signature,
            Object value) {
        if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
            try {
                xhtml.characters("    ");
                writeAccess(access & FIELD_MODIFIERS);
                writeType(Type.getType(desc));
                writeSpace();
                writeIdentifier(name);

                if (isSet(access, Opcodes.ACC_STATIC) && value != null) {
                    xhtml.characters(" = ");
                    xhtml.characters(value.toString());
                }

                writeSemicolon();
                writeNewline();
            } catch (SAXException e) {
                throw new RuntimeException(e);
            }
        }

        return null;
    }

    /**
     * Visits a method and writes its signature: modifiers, return type, name
     * (the class type for {@code <init>}), argument types and declared
     * exceptions. Synthetic methods are skipped.
     *
     * @return always {@code null}; the method's own contents are not visited
     */
    @Override
    public MethodVisitor visitMethod(
            int access, String name, String desc, String signature,
            String[] exceptions) {
        if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
            try {
                xhtml.characters("    ");
                // Only bits that are real method modifiers; see METHOD_MODIFIERS.
                writeAccess(access & METHOD_MODIFIERS);
                writeType(Type.getReturnType(desc));
                writeSpace();
                if ("<init>".equals(name)) {
                    writeType(type);
                } else {
                    writeIdentifier(name);
                }

                xhtml.characters("(");
                String separator = "";
                for (Type arg : Type.getArgumentTypes(desc)) {
                    xhtml.characters(separator);
                    writeType(arg);
                    separator = ", ";
                }
                xhtml.characters(")");

                if (exceptions != null && exceptions.length > 0) {
                    writeSpace();
                    writeKeyword("throws");
                    separator = " ";
                    for (String exception : exceptions) {
                        xhtml.characters(separator);
                        writeType(Type.getObjectType(exception));
                        separator = ", ";
                    }
                }

                writeSemicolon();
                writeNewline();
            } catch (SAXException e) {
                throw new RuntimeException(e);
            }
        }

        return null;
    }

    /** Writes an identifier wrapped in a {@code java-identifier} span. */
    private void writeIdentifier(String identifier) throws SAXException {
        xhtml.startElement("span", "class", "java-identifier");
        xhtml.characters(identifier);
        xhtml.endElement("span");
    }

    /** Writes a keyword wrapped in a {@code java-keyword} span. */
    private void writeKeyword(String keyword) throws SAXException {
        xhtml.startElement("span", "class", "java-keyword");
        xhtml.characters(keyword);
        xhtml.endElement("span");
    }

    private void writeSemicolon() throws SAXException {
        xhtml.characters(";");
    }

    private void writeSpace() throws SAXException {
        xhtml.characters(" ");
    }

    private void writeNewline() throws SAXException {
        xhtml.characters("\n");
    }

    /**
     * Writes one keyword per modifier bit set in {@code access}. Callers are
     * expected to mask {@code access} down to the bits that are modifiers in
     * their context, because several bit values are reused with a different
     * meaning on classes, fields and methods.
     */
    private void writeAccess(int access) throws SAXException {
        writeAccess(access, Opcodes.ACC_PRIVATE, "private");
        writeAccess(access, Opcodes.ACC_PROTECTED, "protected");
        writeAccess(access, Opcodes.ACC_PUBLIC, "public");
        writeAccess(access, Opcodes.ACC_STATIC, "static");
        writeAccess(access, Opcodes.ACC_FINAL, "final");
        writeAccess(access, Opcodes.ACC_ABSTRACT, "abstract");
        writeAccess(access, Opcodes.ACC_SYNCHRONIZED, "synchronized");
        writeAccess(access, Opcodes.ACC_TRANSIENT, "transient");
        writeAccess(access, Opcodes.ACC_VOLATILE, "volatile");
        writeAccess(access, Opcodes.ACC_NATIVE, "native");
    }

    /** Writes {@code keyword} and a space when {@code code} is set in {@code access}. */
    private void writeAccess(int access, int code, String keyword)
            throws SAXException {
        if (isSet(access, code)) {
            writeKeyword(keyword);
            xhtml.characters(" ");
        }
    }

    /**
     * Writes a type name, dropping the prefix of the visited class's own
     * package or of {@code java.lang}.
     */
    private void writeType(Type written) throws SAXException {
        String name = written.getClassName();
        // packageName stays null for classes in the default package; without
        // the null check the prefix compared against would be "null.".
        if (packageName != null && name.startsWith(packageName + ".")) {
            xhtml.characters(name.substring(packageName.length() + 1));
        } else if (name.startsWith("java.lang.")) {
            xhtml.characters(name.substring("java.lang.".length()));
        } else {
            xhtml.characters(name);
        }
    }

    private static boolean isSet(int value, int flag) {
        return (value & flag) != 0;
    }

}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java?rev=1723223&view=auto 
============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.code; + +import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP; +import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY; +import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA; + +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.nio.charset.Charset; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.config.ServiceLoader; +import org.apache.tika.detect.AutoDetectReader; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.ccil.cowan.tagsoup.HTMLSchema; +import org.ccil.cowan.tagsoup.Schema; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import com.uwyn.jhighlight.renderer.Renderer; +import com.uwyn.jhighlight.renderer.XhtmlRendererFactory; +/** + * Generic Source code parser for Java, Groovy, C++. 
+ * Aware: This parser uses JHightlight library (https://github.com/codelibs/jhighlight) under CDDL/LGPL dual license + * + * @author Hong-Thai.Nguyen + * @since 1.6 + */
public class SourceCodeParser implements Parser {

    private static final long serialVersionUID = -4543476498190054160L;

    private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");

    /** Maps each supported media type to the JHighlight renderer name used for it. */
    private static final Map<MediaType, String> TYPES_TO_RENDERER = createRendererMap();

    /** Read-only view handed to callers so the renderer table cannot be altered through it. */
    private static final Set<MediaType> SUPPORTED_TYPES =
            java.util.Collections.unmodifiableSet(TYPES_TO_RENDERER.keySet());

    private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());

    //Parse the HTML document
    private static final Schema HTML_SCHEMA = new HTMLSchema();

    private static Map<MediaType, String> createRendererMap() {
        Map<MediaType, String> renderers = new HashMap<MediaType, String>();
        renderers.put(MediaType.text("x-c++src"), CPP);
        renderers.put(MediaType.text("x-java-source"), JAVA);
        renderers.put(MediaType.text("x-groovy"), GROOVY);
        return renderers;
    }

    /**
     * Returns the media types this parser handles.
     *
     * @param context parse context (not consulted)
     * @return an unmodifiable set of the supported source code media types
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Highlights the source code as XHTML and feeds the result, through a
     * TagSoup parser, to the given handler. Also records the content type,
     * the detected encoding, every {@code @author} found and the line count
     * ({@code LoC}) in the metadata.
     * <p>
     * Nothing is emitted and the metadata is left untouched when either the
     * content type or the resource name is missing from the metadata.
     *
     * @param stream   the source code; it is not closed by this method
     * @param handler  receiver of the SAX events of the highlighted code
     * @param metadata must carry the content type and the resource name
     * @param context  may provide a {@link ServiceLoader} and a TagSoup {@link Schema}
     * @throws IOException   if the stream could not be read
     * @throws SAXException  if the handler failed
     * @throws TikaException if the content type cannot be parsed or is not supported
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {

        try (AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata,
                context.get(ServiceLoader.class, LOADER))) {
            Charset charset = reader.getCharset();
            String mediaType = metadata.get(Metadata.CONTENT_TYPE);
            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if (mediaType == null || name == null) {
                return;
            }

            MediaType type = MediaType.parse(mediaType);
            if (type == null) {
                throw new TikaException("unparseable content type " + mediaType);
            }
            // Resolve the renderer before touching the metadata or reading the
            // stream, so an unsupported type fails fast.
            Renderer renderer = getRenderer(type);

            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            String lineSeparator = System.lineSeparator();
            StringBuilder out = new StringBuilder();
            String line;
            int nbLines = 0;
            while ((line = reader.readLine()) != null) {
                out.append(line).append(lineSeparator);
                String author = parserAuthor(line);
                if (author != null) {
                    metadata.add(TikaCoreProperties.CREATOR, author);
                }
                nbLines++;
            }
            metadata.set("LoC", String.valueOf(nbLines));

            String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);

            Schema schema = context.get(Schema.class, HTML_SCHEMA);

            org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
            parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
            parser.setContentHandler(handler);
            parser.parse(new InputSource(new StringReader(codeAsHtml)));
        }

    }

    /**
     * Looks up the renderer for a media type. Parameters such as
     * {@code charset} are ignored for the lookup.
     *
     * @throws TikaException if no renderer is registered for the type
     */
    private Renderer getRenderer(MediaType mediaType) throws TikaException {
        String type = TYPES_TO_RENDERER.get(mediaType.getBaseType());
        if (type == null) {
            throw new TikaException("unparseable content type " + mediaType);
        }
        return XhtmlRendererFactory.getRenderer(type);
    }

    /**
     * Extracts the author named by an {@code @author} tag on the given line.
     *
     * @return the trimmed author text, or {@code null} when the line has no such tag
     */
    private String parserAuthor(String line) {
        Matcher m = authorPattern.matcher(line);
        if (m.find()) {
            return m.group(1).trim();
        }

        return null;
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,406 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.executable; + +import java.io.IOException; +import java.io.InputStream; +import java.sql.Date; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.poi.util.IOUtils; +import org.apache.poi.util.LittleEndian; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.EndianUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Parser for executable files. 
Currently supports ELF and PE + */
public class ExecutableParser extends AbstractParser implements MachineMetadata {
    /** Serial version UID */
    private static final long serialVersionUID = 32128791892482L;

    private static final MediaType PE_EXE = MediaType.application("x-msdownload");
    private static final MediaType ELF_GENERAL = MediaType.application("x-elf");
    private static final MediaType ELF_OBJECT = MediaType.application("x-object");
    private static final MediaType ELF_EXECUTABLE = MediaType.application("x-executable");
    private static final MediaType ELF_SHAREDLIB = MediaType.application("x-sharedlib");
    private static final MediaType ELF_COREDUMP = MediaType.application("x-coredump");
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    PE_EXE,
                    ELF_GENERAL,
                    ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB, ELF_COREDUMP
            )));

    /**
     * Number of bytes already consumed once the PE header offset has been
     * read: 4 magic bytes, the rest of the MS-DOS header up to 0x3c, and the
     * 4 byte offset field itself.
     */
    private static final int DOS_HEADER_LENGTH = 0x40;

    /**
     * Returns the media types this parser handles.
     *
     * @param context parse context (not consulted)
     * @return an unmodifiable set holding the PE type and the ELF types
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Sniffs the first four bytes and dispatches to {@link #parsePE} for an
     * {@code MZ} signature or to {@link #parseELF} for an ELF signature.
     * Only metadata is produced; streams with neither signature yield no
     * metadata at all.
     *
     * @param stream   the executable
     * @param handler  receiver of the SAX events
     * @param metadata receives the content type and machine metadata
     * @param context  parse context (not consulted)
     * @throws IOException   if the stream could not be read
     * @throws SAXException  if the handler failed
     * @throws TikaException declared by the format specific parsers
     */
    @Override
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // We only do metadata, for now
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

        // What kind is it?
        byte[] first4 = new byte[4];
        IOUtils.readFully(stream, first4);

        if (first4[0] == (byte) 'M' && first4[1] == (byte) 'Z') {
            parsePE(xhtml, metadata, stream, first4);
        } else if (first4[0] == (byte) 0x7f && first4[1] == (byte) 'E'
                && first4[2] == (byte) 'L' && first4[3] == (byte) 'F') {
            parseELF(xhtml, metadata, stream, first4);
        }

        // Finish everything
        xhtml.endDocument();
    }

    /**
     * Parses a DOS or Windows PE file.
     * <p>
     * The content type and platform are always set. Creation date, machine
     * type, endian-ness and architecture bits are only set when a PE header
     * with a valid signature is found at a plausible offset.
     *
     * @param xhtml    output handler (currently unused)
     * @param metadata receives the extracted values
     * @param stream   positioned just after the first four bytes of the file
     * @param first4   the first four bytes of the file (currently unused)
     * @throws TikaException declared for callers; not thrown directly here
     * @throws IOException   if the stream could not be read
     */
    public void parsePE(XHTMLContentHandler xhtml, Metadata metadata,
            InputStream stream, byte[] first4) throws TikaException, IOException {
        metadata.add(Metadata.CONTENT_TYPE, PE_EXE.toString());
        metadata.set(PLATFORM, PLATFORM_WINDOWS);

        // Skip over the MS-DOS bit
        byte[] msdosSection = new byte[0x3c - 4];
        IOUtils.readFully(stream, msdosSection);

        // Grab the PE header offset
        int peOffset = LittleEndian.readInt(stream);

        // Sanity check - while it may go anywhere, it's normally in the first
        // few kb. It cannot lie inside the bytes that were already consumed.
        if (peOffset > 4096 || peOffset < DOS_HEADER_LENGTH) {
            return;
        }

        // Skip the rest of the MS-DOS stub (if PE), until we reach what should
        // be the PE header (if this is a PE executable)
        skipFully(stream, peOffset - DOS_HEADER_LENGTH);

        // Read the PE signature (4 bytes) plus the 20 byte file header:
        //   4 machine, 6 section count, 8 timestamp, 12 symbol table offset,
        //   16 symbol count, 20 optional header size, 22 characteristics
        byte[] pe = new byte[24];
        IOUtils.readFully(stream, pe);

        // Check it really is a PE header
        boolean hasPeSignature =
                pe[0] == (byte) 'P' && pe[1] == (byte) 'E' && pe[2] == 0 && pe[3] == 0;
        if (!hasPeSignature) {
            // Old style MS-DOS
            return;
        }

        int machine = LittleEndian.getUShort(pe, 4);
        // The timestamp is an unsigned 32 bit count of seconds; mask so that
        // values above Integer.MAX_VALUE do not turn negative.
        long createdAt = LittleEndian.getInt(pe, 8) & 0xFFFFFFFFL;

        // Turn this into helpful metadata
        metadata.set(Metadata.CREATION_DATE, new java.util.Date(createdAt * 1000L));

        String little = Endian.LITTLE.getName();
        String big = Endian.BIG.getName();
        switch (machine) {
            case 0x14c:
                setMachine(metadata, MACHINE_x86_32, little, "32");
                break;
            case 0x8664:
                // AMD64
                setMachine(metadata, MACHINE_x86_64, little, "64");
                break;
            case 0x200:
                setMachine(metadata, MACHINE_IA_64, little, "64");
                break;

            case 0x184:
                setMachine(metadata, MACHINE_ALPHA, little, "32");
                break;
            case 0x284:
                setMachine(metadata, MACHINE_ALPHA, little, "64");
                break;

            case 0x1c0:
            case 0x1c4:
                setMachine(metadata, MACHINE_ARM, little, "32");
                break;

            case 0x268:
                setMachine(metadata, MACHINE_M68K, big, "32");
                break;

            case 0x266:
            case 0x366:
            case 0x466:
                setMachine(metadata, MACHINE_MIPS, big, "16");
                break;
            case 0x162:
            case 0x166:
            case 0x168:
            case 0x169:
                setMachine(metadata, MACHINE_MIPS, little, "16");
                break;

            case 0x1f0:
            case 0x1f1:
                setMachine(metadata, MACHINE_PPC, little, "32");
                break;

            case 0x1a2:
            case 0x1a3:
                setMachine(metadata, MACHINE_SH3, big, "32");
                break;
            case 0x1a6:
                setMachine(metadata, MACHINE_SH4, big, "32");
                break;
            case 0x1a8:
                setMachine(metadata, MACHINE_SH3, big, "32");
                break;

            case 0x9041:
                setMachine(metadata, MACHINE_M32R, big, "32");
                break;

            case 0xebc:
                metadata.set(MACHINE_TYPE, MACHINE_EFI);
                break;

            default:
                metadata.set(MACHINE_TYPE, MACHINE_UNKNOWN);
                break;
        }
    }

    /** Records machine type, endian-ness and word size in one go. */
    private static void setMachine(Metadata metadata, String machineType,
            String endianName, String bits) {
        metadata.set(MACHINE_TYPE, machineType);
        metadata.set(ENDIAN, endianName);
        metadata.set(ARCHITECTURE_BITS, bits);
    }

    /**
     * Skips {@code count} bytes, retrying because a single
     * {@link InputStream#skip(long)} call may skip fewer bytes than asked.
     * Stops quietly at end of stream.
     */
    private static void skipFully(InputStream stream, long count) throws IOException {
        long remaining = count;
        while (remaining > 0) {
            long skipped = stream.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (stream.read() == -1) {
                return;
            } else {
                remaining--;
            }
        }
    }

    /**
     * Parses a Unix ELF file.
     * <p>
     * Sets architecture bits, endian-ness and platform when the identification
     * bytes carry a known value, adds the content type matching the object
     * type, and sets the machine type for known machine codes.
     *
     * @param xhtml    output handler (currently unused)
     * @param metadata receives the extracted values
     * @param stream   positioned just after the four byte ELF signature
     * @param first4   the first four bytes of the file (currently unused)
     * @throws TikaException declared for callers; not thrown directly here
     * @throws IOException   if the stream could not be read
     */
    public void parseELF(XHTMLContentHandler xhtml, Metadata metadata,
            InputStream stream, byte[] first4) throws TikaException, IOException {
        // Byte 5 is the architecture
        int architecture = stream.read();
        if (architecture == 1) {
            metadata.set(ARCHITECTURE_BITS, "32");
        } else if (architecture == 2) {
            metadata.set(ARCHITECTURE_BITS, "64");
        }

        // Byte 6 is the endian-ness
        int endian = stream.read();
        if (endian == 1) {
            metadata.set(ENDIAN, Endian.LITTLE.getName());
        } else if (endian == 2) {
            metadata.set(ENDIAN, Endian.BIG.getName());
        }

        // Byte 7 is the elf version; it is consumed but not reported
        stream.read();

        // Byte 8 is the OS, if set (lots of compilers don't)
        // Byte 9 is the OS (specific) ABI version
        int os = stream.read();
        int osVer = stream.read();
        if (os > 0 || osVer > 0) {
            switch (os) {
                case 0:
                    metadata.set(PLATFORM, PLATFORM_SYSV);
                    break;
                case 1:
                    metadata.set(PLATFORM, PLATFORM_HPUX);
                    break;
                case 2:
                    metadata.set(PLATFORM, PLATFORM_NETBSD);
                    break;
                case 3:
                    metadata.set(PLATFORM, PLATFORM_LINUX);
                    break;
                case 6:
                    metadata.set(PLATFORM, PLATFORM_SOLARIS);
                    break;
                case 7:
                    metadata.set(PLATFORM, PLATFORM_AIX);
                    break;
                case 8:
                    metadata.set(PLATFORM, PLATFORM_IRIX);
                    break;
                case 9:
                    metadata.set(PLATFORM, PLATFORM_FREEBSD);
                    break;
                case 10:
                    metadata.set(PLATFORM, PLATFORM_TRU64);
                    break;
                case 12:
                    // NOTE(review): the ELF OS ABI registry appears to list 12
                    // as OpenBSD, not FreeBSD. Mapping kept as it was because
                    // no OpenBSD platform value is available here - confirm.
                    metadata.set(PLATFORM, PLATFORM_FREEBSD);
                    break;
                case 64:
                case 97:
                    metadata.set(PLATFORM, PLATFORM_ARM);
                    break;
                case 255:
                    metadata.set(PLATFORM, PLATFORM_EMBEDDED);
                    break;
            }
        }

        // Bytes 10-16 are padding and lengths
        byte[] padLength = new byte[7];
        IOUtils.readFully(stream, padLength);

        // Anything other than an explicit "little" marker is read as big endian
        boolean littleEndian = (endian == 1);

        // Bytes 16-17 are the object type (LE/BE)
        int type = littleEndian
                ? EndianUtils.readUShortLE(stream)
                : EndianUtils.readUShortBE(stream);
        switch (type) {
            case 1:
                metadata.add(Metadata.CONTENT_TYPE, ELF_OBJECT.toString());
                break;
            case 2:
                metadata.add(Metadata.CONTENT_TYPE, ELF_EXECUTABLE.toString());
                break;
            case 3:
                metadata.add(Metadata.CONTENT_TYPE, ELF_SHAREDLIB.toString());
                break;
            case 4:
                metadata.add(Metadata.CONTENT_TYPE, ELF_COREDUMP.toString());
                break;
            default:
                metadata.add(Metadata.CONTENT_TYPE, ELF_GENERAL.toString());
                break;
        }

        // Bytes 18-19 are the machine (EM_*)
        int machine = littleEndian
                ? EndianUtils.readUShortLE(stream)
                : EndianUtils.readUShortBE(stream);
        switch (machine) {
            case 2:
            case 18:
            case 43:
                metadata.set(MACHINE_TYPE, MACHINE_SPARC);
                break;
            case 3:
                metadata.set(MACHINE_TYPE, MACHINE_x86_32);
                break;
            case 4:
                metadata.set(MACHINE_TYPE, MACHINE_M68K);
                break;
            case 5:
                metadata.set(MACHINE_TYPE, MACHINE_M88K);
                break;
            case 8:
            case 10:
                metadata.set(MACHINE_TYPE, MACHINE_MIPS);
                break;
            case 7:
                metadata.set(MACHINE_TYPE, MACHINE_S370);
                break;
            case 20:
            case 21:
                metadata.set(MACHINE_TYPE, MACHINE_PPC);
                break;
            case 22:
                metadata.set(MACHINE_TYPE, MACHINE_S390);
                break;
            case 40:
                metadata.set(MACHINE_TYPE, MACHINE_ARM);
                break;
            case 41:
            case 0x9026:
                metadata.set(MACHINE_TYPE, MACHINE_ALPHA);
                break;
            case 50:
                metadata.set(MACHINE_TYPE, MACHINE_IA_64);
                break;
            case 62:
                metadata.set(MACHINE_TYPE, MACHINE_x86_64);
                break;
            case 75:
                metadata.set(MACHINE_TYPE, MACHINE_VAX);
                break;
            case 88:
                metadata.set(MACHINE_TYPE, MACHINE_M32R);
                break;
        }

        // Bytes 20-23 are the version
        // TODO
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/java/org/apache/tika/parser/executable/MachineMetadata.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.executable; + +import org.apache.tika.metadata.Property; + +/** + * Metadata for describing machines, such as their + * architecture, type and endian-ness + */ +public interface MachineMetadata { + public static final String PREFIX = "machine:"; + + public static Property ARCHITECTURE_BITS = Property.internalClosedChoise(PREFIX+"architectureBits", + "8", "16", "32", "64"); + + public static final String PLATFORM_SYSV = "System V"; + public static final String PLATFORM_HPUX = "HP-UX"; + public static final String PLATFORM_NETBSD = "NetBSD"; + public static final String PLATFORM_LINUX = "Linux"; + public static final String PLATFORM_SOLARIS = "Solaris"; + public static final String PLATFORM_AIX = "AIX"; + public static final String PLATFORM_IRIX = "IRIX"; + public static final String PLATFORM_FREEBSD = "FreeBSD"; + public static final String PLATFORM_TRU64 = "Tru64"; + public static final String PLATFORM_ARM = "ARM"; // ARM architecture ABI + public static final String PLATFORM_EMBEDDED = "Embedded"; // Stand-alone (embedded) ABI + public static final String PLATFORM_WINDOWS = "Windows"; + + public static Property PLATFORM = Property.internalClosedChoise(PREFIX+"platform", + PLATFORM_SYSV, PLATFORM_HPUX, PLATFORM_NETBSD, PLATFORM_LINUX, + PLATFORM_SOLARIS, PLATFORM_AIX, PLATFORM_IRIX, PLATFORM_FREEBSD, PLATFORM_TRU64, + PLATFORM_ARM, PLATFORM_EMBEDDED, PLATFORM_WINDOWS); + + public static final String MACHINE_x86_32 = "x86-32"; + public static final String MACHINE_x86_64 = "x86-64"; + public static final String MACHINE_IA_64 = "IA-64"; + public static final String MACHINE_SPARC = "SPARC"; + public static final String MACHINE_M68K = "Motorola-68000"; + public static final String MACHINE_M88K = "Motorola-88000"; + public static final String MACHINE_MIPS = "MIPS"; + public static final String MACHINE_PPC = "PPC"; + public static final String MACHINE_S370 = "S370"; + public static final String MACHINE_S390 = "S390"; + public static 
final String MACHINE_ARM = "ARM"; + public static final String MACHINE_VAX = "Vax"; + public static final String MACHINE_ALPHA = "Alpha"; + public static final String MACHINE_EFI = "EFI"; // EFI ByteCode + public static final String MACHINE_M32R = "M32R"; + public static final String MACHINE_SH3 = "SH3"; + public static final String MACHINE_SH4 = "SH4"; + public static final String MACHINE_SH5 = "SH5"; + public static final String MACHINE_UNKNOWN = "Unknown"; + + public static Property MACHINE_TYPE = Property.internalClosedChoise(PREFIX+"machineType", + MACHINE_x86_32, MACHINE_x86_64, MACHINE_IA_64, MACHINE_SPARC, + MACHINE_M68K, MACHINE_M88K, MACHINE_MIPS, MACHINE_PPC, + MACHINE_S370, MACHINE_S390, + MACHINE_ARM, MACHINE_VAX, MACHINE_ALPHA, MACHINE_EFI, MACHINE_M32R, + MACHINE_SH3, MACHINE_SH4, MACHINE_SH5, MACHINE_UNKNOWN); + + public static final class Endian { + private String name; + private boolean msb; + public String getName() { return name; } + @SuppressWarnings("unused") + public boolean isMSB() { return msb; } + @SuppressWarnings("unused") + public String getMSB() { if(msb) { return "MSB"; } else { return "LSB"; } } + private Endian(String name, boolean msb) { this.name = name; this.msb = msb; } + + public static final Endian LITTLE = new Endian("Little", false); + public static final Endian BIG = new Endian("Big", true); + } + public static Property ENDIAN = Property.internalClosedChoise(PREFIX+"endian", + Endian.LITTLE.name, Endian.BIG.name); +} Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser 
(added) +++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016 @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +org.apache.tika.parser.asm.ClassParser +org.apache.tika.parser.code.SourceCodeParser +org.apache.tika.parser.executable.ExecutableParser Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.asm; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import org.apache.tika.Tika; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.Test; + +/** + * Test case for parsing Java class files. + */ +public class ClassParserTest { + + @Test + public void testClassParsing() throws Exception { + String path = "/test-documents/AutoDetectParser.class"; + Metadata metadata = new Metadata(); + String content = new Tika().parseToString( + ClassParserTest.class.getResourceAsStream(path), metadata); + + assertEquals("AutoDetectParser", metadata.get(TikaCoreProperties.TITLE)); + assertEquals( + "AutoDetectParser.class", + metadata.get(Metadata.RESOURCE_NAME_KEY)); + + assertTrue(content.contains("package org.apache.tika.parser;")); + assertTrue(content.contains( + "class AutoDetectParser extends CompositeParser")); + assertTrue(content.contains( + "private org.apache.tika.mime.MimeTypes types")); + assertTrue(content.contains( + "public void parse(" + + "java.io.InputStream, org.xml.sax.ContentHandler," + + " org.apache.tika.metadata.Metadata) throws" + + " java.io.IOException, org.xml.sax.SAXException," + + " org.apache.tika.exception.TikaException;")); + assertTrue(content.contains( + "private byte[] getPrefix(java.io.InputStream, int)" + + " throws 
java.io.IOException;")); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.code; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.util.Set; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.junit.Test; + +public class SourceCodeParserTest extends TikaTest { + + private SourceCodeParser sourceCodeParser = new SourceCodeParser(); + + @Test + public void testSupportTypes() throws Exception { + Set<MediaType> supportedTypes = sourceCodeParser.getSupportedTypes(new ParseContext()); + assertTrue(supportedTypes.contains(new MediaType("text", "x-java-source"))); + assertTrue(supportedTypes.contains(new MediaType("text", "x-groovy"))); + assertTrue(supportedTypes.contains(new MediaType("text", "x-c++src"))); + + assertFalse(sourceCodeParser.getSupportedTypes(new ParseContext()).contains(new MediaType("text", "html"))); + } + + @Test + public void testHTMLRenderWithReturnLine() throws Exception { + String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml; + + assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0); + assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0); + assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0); + assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0); + } + + @Test + public void testTextRender() throws Exception { + String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, 
createMetadata("text/x-java-source")); + + assertTrue(textContent.length() > 0); + assertTrue(textContent.indexOf("html") < 0); + + textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), sourceCodeParser, createMetadata("text/x-java-source")); + assertTrue(textContent.length() > 0); + assertTrue(textContent.indexOf("html") < 0); + } + + @Test + public void testLoC() throws Exception { + Metadata metadata = createMetadata("text/x-groovy"); + getText(getResourceAsStream("/test-documents/testGROOVY.groovy"), sourceCodeParser, metadata); + + assertEquals(metadata.get("LoC"), "9"); + } + + @Test + public void testAuthor() throws Exception { + Metadata metadata = createMetadata("text/x-c++src"); + getText(getResourceAsStream("/test-documents/testCPP.cpp"), sourceCodeParser, metadata); + + assertEquals("Hong-Thai Nguyen", metadata.get(TikaCoreProperties.CREATOR)); + } + + @Test + public void testReturnContentAsIsForTextHandler() throws Exception { + String strContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), new AutoDetectParser(), createMetadata("text/plain")).xml; + + assertTrue(strContent.indexOf("public class HelloWorld {") > 0); + } + + private Metadata createMetadata(String mimeType) { + Metadata metadata = new Metadata(); + metadata.add(Metadata.RESOURCE_NAME_KEY, "testFile"); + metadata.add(Metadata.CONTENT_TYPE, mimeType); + return metadata; + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,83 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.executable; + +import static org.junit.Assert.assertEquals; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class ExecutableParserTest { + + @Test + public void testWin32Parser() throws Exception { + try (InputStream input = ExecutableParserTest.class.getResourceAsStream( + "/test-documents/testWindows-x86-32.exe")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new ExecutableParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals("application/x-msdownload", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("2012-05-13T13:40:11Z", + metadata.get(Metadata.CREATION_DATE)); + + assertEquals(ExecutableParser.MACHINE_x86_32, + metadata.get(ExecutableParser.MACHINE_TYPE)); + assertEquals("Little", + 
metadata.get(ExecutableParser.ENDIAN)); + assertEquals("32", + metadata.get(ExecutableParser.ARCHITECTURE_BITS)); + assertEquals("Windows", + metadata.get(ExecutableParser.PLATFORM)); + + String content = handler.toString(); + assertEquals("", content); // No text yet + } + } + + @Test + public void testElfParser_x86_32() throws Exception { + try (InputStream input = ExecutableParserTest.class.getResourceAsStream( + "/test-documents/testLinux-x86-32")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new ExecutableParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals("application/x-executable", + metadata.get(Metadata.CONTENT_TYPE)); + + assertEquals(ExecutableParser.MACHINE_x86_32, + metadata.get(ExecutableParser.MACHINE_TYPE)); + assertEquals("Little", + metadata.get(ExecutableParser.ENDIAN)); + assertEquals("32", + metadata.get(ExecutableParser.ARCHITECTURE_BITS)); +// assertEquals("Linux", +// metadata.get(ExecutableParser.PLATFORM)); + + String content = handler.toString(); + assertEquals("", content); // No text yet + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-database-module/pom.xml URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/pom.xml?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-database-module/pom.xml (added) +++ tika/branches/2.x/tika-parser-modules/tika-database-module/pom.xml Wed Jan 6 03:50:50 2016 @@ -0,0 +1,69 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. 
The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-database-module</artifactId> + <name>Apache Tika Database Module</name> + <url>http://tika.apache.org/</url> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <!-- Provided dependencies --> + <dependency> + <groupId>org.xerial</groupId> + <artifactId>sqlite-jdbc</artifactId> + <version>3.8.10.1</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>${commons.io.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-office-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + 
<plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/appended-resources/META-INF/LICENSE URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/appended-resources/META-INF/LICENSE?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-database-module/src/appended-resources/META-INF/LICENSE (added) +++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/appended-resources/META-INF/LICENSE Wed Jan 6 03:50:50 2016 @@ -0,0 +1,9 @@ +APACHE TIKA SUBCOMPONENTS + +Apache Tika includes a number of subcomponents with separate copyright notices +and license terms. Your use of these subcomponents is subject to the terms and +conditions of the following licenses. + +Sqlite (included in the "provided" org.xerial's sqlite-jdbc) + Sqlite is in the Public Domain. 
For details + see: https://www.sqlite.org/copyright.html Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,189 @@ +package org.apache.tika.parser.jdbc; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.InputStream; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.IOExceptionWithCause; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.metadata.Database; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Abstract class that handles iterating through tables within a database. + */ +abstract class AbstractDBParser extends AbstractParser { + + private final static byte[] EMPTY_BYTE_ARR = new byte[0]; + + private Connection connection; + + protected static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) { + return context.get(EmbeddedDocumentExtractor.class, + new ParsingEmbeddedDocumentExtractor(context)); + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return null; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + connection = getConnection(stream, metadata, context); + XHTMLContentHandler xHandler = null; + List<String> tableNames = null; + try { + tableNames = getTableNames(connection, metadata, context); + } catch (SQLException e) { + throw new IOExceptionWithCause(e); + } + for (String tableName : tableNames) { + //add table names to parent metadata + metadata.add(Database.TABLE_NAME, tableName); + } + xHandler = new XHTMLContentHandler(handler, metadata); + xHandler.startDocument(); + + try { + for 
(String tableName : tableNames) { + JDBCTableReader tableReader = getTableReader(connection, tableName, context); + xHandler.startElement("table", "name", tableReader.getTableName()); + xHandler.startElement("thead"); + xHandler.startElement("tr"); + for (String header : tableReader.getHeaders()) { + xHandler.startElement("th"); + xHandler.characters(header); + xHandler.endElement("th"); + } + xHandler.endElement("tr"); + xHandler.endElement("thead"); + xHandler.startElement("tbody"); + while (tableReader.nextRow(xHandler, context)) { + //no-op + } + xHandler.endElement("tbody"); + xHandler.endElement("table"); + } + } finally { + if (xHandler != null) { + xHandler.endDocument(); + } + try { + close(); + } catch (SQLException e) { + //swallow + } + } + } + + /** + * Override this for any special handling of closing the connection. + * + * @throws java.sql.SQLException + * @throws java.io.IOException + */ + protected void close() throws SQLException, IOException { + connection.close(); + } + + /** + * Override this for special configuration of the connection, such as limiting + * the number of rows to be held in memory. 
+ * + * @param stream stream to use + * @param metadata metadata that could be used in parameterizing the connection + * @param context parsecontext that could be used in parameterizing the connection + * @return connection + * @throws java.io.IOException + * @throws org.apache.tika.exception.TikaException + */ + protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) throws IOException, TikaException { + String connectionString = getConnectionString(stream, metadata, context); + + Connection connection = null; + try { + Class.forName(getJDBCClassName()); + } catch (ClassNotFoundException e) { + throw new TikaException(e.getMessage()); + } + try { + connection = DriverManager.getConnection(connectionString); + } catch (SQLException e) { + throw new IOExceptionWithCause(e); + } + return connection; + } + + /** + * Implement for db specific connection information, e.g. "jdbc:sqlite:/docs/mydb.db" + * <p/> + * Include any optimization settings, user name, password, etc. + * <p/> + * + * @param stream stream for processing + * @param metadata metadata might be useful in determining connection info + * @param parseContext context to use to help create connectionString + * @return connection string to be used by {@link #getConnection}. + * @throws java.io.IOException + */ + abstract protected String getConnectionString(InputStream stream, + Metadata metadata, ParseContext parseContext) throws IOException; + + /** + * JDBC class name, e.g. 
org.sqlite.JDBC + * + * @return jdbc class name + */ + abstract protected String getJDBCClassName(); + + /** + * Returns the names of the tables to process + * + * @param connection Connection to use to make the sql call(s) to get the names of the tables + * @param metadata Metadata to use (potentially) in decision about which tables to extract + * @param context ParseContext to use (potentially) in decision about which tables to extract + * @return + * @throws java.sql.SQLException + */ + abstract protected List<String> getTableNames(Connection connection, Metadata metadata, + ParseContext context) throws SQLException; + + /** + * Given a connection and a table name, return the JDBCTableReader for this db. + * + * @param connection + * @param tableName + * @return + */ + abstract protected JDBCTableReader getTableReader(Connection connection, String tableName, ParseContext parseContext); + +} Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,302 @@ +package org.apache.tika.parser.jdbc; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.sql.Blob; +import java.sql.Clob; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.SQLException; +import java.sql.Statement; +import java.sql.Types; +import java.util.LinkedList; +import java.util.List; + +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.IOExceptionWithCause; +import org.apache.commons.io.IOUtils; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.Detector; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Database; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeType; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * General base class to iterate through rows of a JDBC table + */ +class JDBCTableReader { + 
+ private final static Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); + private final Connection connection; + private final String tableName; + int maxClobLength = 1000000; + ResultSet results = null; + int rows = 0; + private TikaConfig tikaConfig = null; + private Detector detector = null; + private MimeTypes mimeTypes = null; + + public JDBCTableReader(Connection connection, String tableName, ParseContext context) { + this.connection = connection; + this.tableName = tableName; + this.tikaConfig = context.get(TikaConfig.class); + } + + public boolean nextRow(ContentHandler handler, ParseContext context) throws IOException, SAXException { + //lazy initialization + if (results == null) { + reset(); + } + try { + if (!results.next()) { + return false; + } + } catch (SQLException e) { + throw new IOExceptionWithCause(e); + } + try { + ResultSetMetaData meta = results.getMetaData(); + handler.startElement(XHTMLContentHandler.XHTML, "tr", "tr", EMPTY_ATTRIBUTES); + for (int i = 1; i <= meta.getColumnCount(); i++) { + handler.startElement(XHTMLContentHandler.XHTML, "td", "td", EMPTY_ATTRIBUTES); + handleCell(meta, i, handler, context); + handler.endElement(XHTMLContentHandler.XHTML, "td", "td"); + } + handler.endElement(XHTMLContentHandler.XHTML, "tr", "tr"); + } catch (SQLException e) { + throw new IOExceptionWithCause(e); + } + rows++; + return true; + } + + private void handleCell(ResultSetMetaData rsmd, int i, ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException { + switch (rsmd.getColumnType(i)) { + case Types.BLOB: + handleBlob(tableName, rsmd.getColumnName(i), rows, results, i, handler, context); + break; + case Types.CLOB: + handleClob(tableName, rsmd.getColumnName(i), rows, results, i, handler, context); + break; + case Types.BOOLEAN: + handleBoolean(results.getBoolean(i), handler); + break; + case Types.DATE: + handleDate(results, i, handler); + break; + case Types.TIMESTAMP: + handleTimeStamp(results, i, handler); 
+ break; + case Types.INTEGER: + handleInteger(rsmd.getColumnTypeName(i), results, i, handler); + break; + case Types.FLOAT: + //this is necessary to handle rounding issues in presentation + //Should we just use getString(i)? + addAllCharacters(Float.toString(results.getFloat(i)), handler); + break; + case Types.DOUBLE: + addAllCharacters(Double.toString(results.getDouble(i)), handler); + break; + default: + addAllCharacters(results.getString(i), handler); + break; + } + } + + public List<String> getHeaders() throws IOException { + List<String> headers = new LinkedList<String>(); + //lazy initialization + if (results == null) { + reset(); + } + try { + ResultSetMetaData meta = results.getMetaData(); + for (int i = 1; i <= meta.getColumnCount(); i++) { + headers.add(meta.getColumnName(i)); + } + } catch (SQLException e) { + throw new IOExceptionWithCause(e); + } + return headers; + } + + protected void handleInteger(String columnTypeName, ResultSet rs, int columnIndex, ContentHandler handler) throws SQLException, SAXException { + addAllCharacters(Integer.toString(rs.getInt(columnIndex)), handler); + } + + private void handleBoolean(boolean aBoolean, ContentHandler handler) throws SAXException { + addAllCharacters(Boolean.toString(aBoolean), handler); + } + + + protected void handleClob(String tableName, String columnName, int rowNum, + ResultSet resultSet, int columnIndex, + ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException { + Clob clob = resultSet.getClob(columnIndex); + boolean truncated = clob.length() > Integer.MAX_VALUE || clob.length() > maxClobLength; + + int readSize = (clob.length() < maxClobLength ? 
(int) clob.length() : maxClobLength); + Metadata m = new Metadata(); + m.set(Database.TABLE_NAME, tableName); + m.set(Database.COLUMN_NAME, columnName); + m.set(Database.PREFIX + "ROW_NUM", Integer.toString(rowNum)); + m.set(Database.PREFIX + "IS_CLOB", "true"); + m.set(Database.PREFIX + "CLOB_LENGTH", Long.toString(clob.length())); + m.set(Database.PREFIX + "IS_CLOB_TRUNCATED", Boolean.toString(truncated)); + m.set(Metadata.CONTENT_TYPE, "text/plain; charset=UTF-8"); + m.set(Metadata.CONTENT_LENGTH, Integer.toString(readSize)); + m.set(TikaMetadataKeys.RESOURCE_NAME_KEY, + //just in case something screwy is going on with the column name + FilenameUtils.normalize(FilenameUtils.getName(columnName + "_" + rowNum + ".txt"))); + + + //is there a more efficient way to go from a Reader to an InputStream? + String s = clob.getSubString(0, readSize); + EmbeddedDocumentExtractor ex = AbstractDBParser.getEmbeddedDocumentExtractor(context); + ex.parseEmbedded(new ByteArrayInputStream(s.getBytes(UTF_8)), handler, m, true); + } + + protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet, int columnIndex, + ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException { + Metadata m = new Metadata(); + m.set(Database.TABLE_NAME, tableName); + m.set(Database.COLUMN_NAME, columnName); + m.set(Database.PREFIX + "ROW_NUM", Integer.toString(rowNum)); + m.set(Database.PREFIX + "IS_BLOB", "true"); + Blob blob = null; + InputStream is = null; + EmbeddedDocumentExtractor ex = AbstractDBParser.getEmbeddedDocumentExtractor(context); + try { + is = TikaInputStream.get(getInputStreamFromBlob(resultSet, columnIndex, blob, m)); + + Attributes attrs = new AttributesImpl(); + ((AttributesImpl) attrs).addAttribute("", "type", "type", "CDATA", "blob"); + ((AttributesImpl) attrs).addAttribute("", "column_name", "column_name", "CDATA", columnName); + ((AttributesImpl) attrs).addAttribute("", "row_number", "row_number", "CDATA", 
Integer.toString(rowNum)); + handler.startElement("", "span", "span", attrs); + MediaType mediaType = getDetector().detect(is, new Metadata()); + String extension = ""; + try { + MimeType mimeType = getMimeTypes().forName(mediaType.toString()); + m.set(Metadata.CONTENT_TYPE, mimeType.toString()); + extension = mimeType.getExtension(); + } catch (MimeTypeException e) { + //swallow + } + m.set(TikaMetadataKeys.RESOURCE_NAME_KEY, + //just in case something screwy is going on with the column name + FilenameUtils.normalize(FilenameUtils.getName(columnName + "_" + rowNum + extension))); + + ex.parseEmbedded(is, handler, m, true); + + } finally { + if (blob != null) { + try { + blob.free(); + } catch (SQLException e) { + //swallow + } + } + IOUtils.closeQuietly(is); + } + handler.endElement("", "span", "span"); + } + + protected InputStream getInputStreamFromBlob(ResultSet resultSet, int columnIndex, Blob blob, Metadata metadata) throws SQLException { + return TikaInputStream.get(blob, metadata); + } + + protected void handleDate(ResultSet resultSet, int columnIndex, ContentHandler handler) throws SAXException, SQLException { + addAllCharacters(resultSet.getString(columnIndex), handler); + } + + protected void handleTimeStamp(ResultSet resultSet, int columnIndex, ContentHandler handler) throws SAXException, SQLException { + addAllCharacters(resultSet.getString(columnIndex), handler); + } + + protected void addAllCharacters(String s, ContentHandler handler) throws SAXException { + char[] chars = s.toCharArray(); + handler.characters(chars, 0, chars.length); + } + + void reset() throws IOException { + + if (results != null) { + try { + results.close(); + } catch (SQLException e) { + //swallow + } + } + + String sql = "SELECT * from " + tableName; + try { + Statement st = connection.createStatement(); + results = st.executeQuery(sql); + } catch (SQLException e) { + throw new IOExceptionWithCause(e); + } + rows = 0; + } + + public String getTableName() { + return tableName; + 
} + + + protected TikaConfig getTikaConfig() { + if (tikaConfig == null) { + tikaConfig = TikaConfig.getDefaultConfig(); + } + return tikaConfig; + } + + protected Detector getDetector() { + if (detector != null) return detector; + + detector = getTikaConfig().getDetector(); + return detector; + } + + protected MimeTypes getMimeTypes() { + if (mimeTypes != null) return mimeTypes; + + mimeTypes = getTikaConfig().getMimeRepository(); + return mimeTypes; + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-database-module/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,110 @@ +package org.apache.tika.parser.jdbc; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.IOExceptionWithCause; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.sqlite.SQLiteConfig; + +/** + * This is the implementation of the db parser for SQLite. + * <p/> + * This parser is internal only; it should not be registered in the services + * file or configured in the TikaConfig xml file. + */ +class SQLite3DBParser extends AbstractDBParser { + + protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC"; + + /** + * @param context context + * @return null (always) + */ + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return null; + } + + @Override + protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) throws IOException { + String connectionString = getConnectionString(stream, metadata, context); + + Connection connection = null; + try { + Class.forName(getJDBCClassName()); + } catch (ClassNotFoundException e) { + throw new IOExceptionWithCause(e); + } + try { + SQLiteConfig config = new SQLiteConfig(); + + //good habit, but effectively meaningless here + config.setReadOnly(true); + connection = config.createConnection(connectionString); + + } catch (SQLException e) { + throw new IOException(e.getMessage()); + } + return connection; + } + + @Override + protected String getConnectionString(InputStream is, Metadata metadata, ParseContext context) throws IOException { + File dbFile = TikaInputStream.get(is).getFile(); + return "jdbc:sqlite:" + dbFile.getAbsolutePath(); + } + + @Override + protected String getJDBCClassName() { + return 
SQLITE_CLASS_NAME; + } + + @Override + protected List<String> getTableNames(Connection connection, Metadata metadata, + ParseContext context) throws SQLException { + List<String> tableNames = new LinkedList<String>(); + + try (Statement st = connection.createStatement()) { + String sql = "SELECT name FROM sqlite_master WHERE type='table'"; + ResultSet rs = st.executeQuery(sql); + + while (rs.next()) { + tableNames.add(rs.getString(1)); + } + } + return tableNames; + } + + @Override + public JDBCTableReader getTableReader(Connection connection, String tableName, ParseContext context) { + return new SQLite3TableReader(connection, tableName, context); + } +}
