Repository: tika
Updated Branches:
  refs/heads/2.x 1ce93ed9e -> cd12917fa


fix indents and whitespace


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/76744261
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/76744261
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/76744261

Branch: refs/heads/2.x
Commit: 767442614756b51b64427e663a2af1f6b6ac0bff
Parents: 1ce93ed
Author: tballison <[email protected]>
Authored: Fri Jun 24 11:06:33 2016 -0400
Committer: tballison <[email protected]>
Committed: Fri Jun 24 11:06:33 2016 -0400

----------------------------------------------------------------------
 .../org/apache/tika/parser/prt/PRTParser.java   | 554 ++++++++++---------
 1 file changed, 279 insertions(+), 275 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/76744261/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index 92e3503..24418b0 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -1,275 +1,279 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.prt;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.EndianUtils;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-
-/**
- * A basic text extracting parser for the CADKey PRT (CAD Drawing)
- *  format. It outputs text from note entries.
- */
-
-public class PRTParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = 4659638314375035178L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES = 
Collections.singleton(MediaType.application("x-prt"));
-    public static final String PRT_MIME_TYPE = "application/x-prt";
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-       return SUPPORTED_TYPES;
-    }
-
-    /**
-     * How long do we allow a text run to claim to be, before we
-     * decide we're confused and it's not really text after all?
-     */
-    private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
-    
-    /*
-     * Text types:
-     *   00 00 00 00 f0 [3b]f sz sz TEXT     *view name*
-     *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT  *view name*
-     *   (anything)  e0 3f sz sz TEXT    *view name*
-     *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT    
*note entries* 
-     *   
-     *  Note - all text is null terminated
-     */
-      
-    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, 
-          ParseContext context) throws IOException, SAXException, 
TikaException {
-       
-       XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-       Last5 l5 = new Last5();
-       int read;
-       
-       // Try to get the creation date, which is YYYYMMDDhhmm
-       byte[] header = new byte[30];
-       IOUtils.readFully(stream, header);
-       byte[] date = new byte[12];
-       IOUtils.readFully(stream, date);
-       
-       String dateStr = new String(date, US_ASCII);
-       if(dateStr.startsWith("19") || dateStr.startsWith("20")) {
-          String formattedDate = dateStr.substring(0, 4) + "-" + 
dateStr.substring(4,6) +
-             "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + 
":" +
-             dateStr.substring(10, 12) + ":00";
-          metadata.set(TikaCoreProperties.CREATED, formattedDate);
-          // TODO Metadata.DATE is used as modified, should it be here?
-          metadata.set(Metadata.DATE, formattedDate);
-       }
-       metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
-       
-       // The description, if set, is the next up-to-500 bytes
-       byte[] desc = new byte[500];
-       IOUtils.readFully(stream, desc);
-       String description = extractText(desc, true);
-       if(description.length() > 0) {
-          metadata.set(TikaCoreProperties.DESCRIPTION, description);
-       }
-       
-       // Now look for text
-       while( (read = stream.read()) > -1) {
-          if(read == 0xe0 || read == 0xe3 || read == 0xf0) {
-             int nread = stream.read();
-             if(nread == 0x3f || nread == 0xbf) {
-                // Looks promising, check back for a suitable value
-                if(read == 0xe3 && nread == 0x3f) {
-                   if(l5.is33()) {
-                      // Bingo, note text
-                      handleNoteText(stream, xhtml);
-                   }
-                } else if(l5.is00()) {
-                   // Likely view name
-                   handleViewName(read, nread, stream, xhtml, l5);
-                }
-             }
-          } else {
-             l5.record(read);
-          }
-       }
-    }
-    
-    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml) 
-    throws IOException, SAXException, TikaException {
-       // Ensure we have the right padding text
-       int read;
-       for(int i=0; i<10; i++) {
-          read = stream.read();
-          if(read >= 0 && read <= 0x0f) {
-             // Promising
-          } else {
-             // Wrong, false detection
-             return;
-          }
-       }
-       read = stream.read();
-       if(read != 0x1f) {
-          // Wrong, false detection
-          return;
-       }
-       
-       int length = EndianUtils.readUShortLE(stream);
-       if(length <= MAX_SANE_TEXT_LENGTH) {
-          // Length sanity check passed
-          handleText(length, stream, xhtml);
-       }
-    }
-    
-    private void handleViewName(int typeA, int typeB, InputStream stream, 
-          XHTMLContentHandler xhtml, Last5 l5) 
-    throws IOException, SAXException, TikaException {
-       // Is it 8 byte zero padded?
-       int maybeLength = EndianUtils.readUShortLE(stream);
-       if(maybeLength == 0) {
-          // Check the next 6 bytes too
-          for(int i=0; i<6; i++) {
-             int read = stream.read();
-             if(read >= 0 && read <= 0x0f) {
-                // Promising
-             } else {
-                // Wrong, false detection
-                return;
-             }
-          }
-          
-          byte[] b2 = new byte[2];
-          IOUtils.readFully(stream, b2);
-          int length = EndianUtils.getUShortLE(b2);
-          if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
-             // Length sanity check passed
-             handleText(length, stream, xhtml);
-          } else {
-             // Was probably something else
-             l5.record(b2[0]);
-             l5.record(b2[1]);
-          }
-       } else if(maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
-          // Looks like it's straight into the text
-          handleText(maybeLength, stream, xhtml);
-       }
-    }
-    
-    private void handleText(int length, InputStream stream, 
XHTMLContentHandler xhtml) 
-    throws IOException, SAXException, TikaException {
-       byte[] str = new byte[length];
-       IOUtils.readFully(stream, str);
-       if(str[length-1] != 0) {
-          // Not properly null terminated, must be wrong
-          return;
-       }
-       
-       String text = extractText(str, false);
-       
-       xhtml.startElement("p");
-       xhtml.characters(text);
-       xhtml.endElement("p");
-    }
-    
-    /**
-     * Does our best to turn the bytes into text
-     */
-    private String extractText(byte[] data, boolean trim) throws TikaException 
{
-       // The text is always stored null terminated, but sometimes
-       //  may have extra null padding too
-       int length = data.length - 1;
-       if(trim) {
-          for(int i=0; i<data.length; i++) {
-             if(data[i] == 0) {
-                length = i;
-                break;
-             }
-          }
-       }
-       
-       // We believe that the text is basically stored as CP437
-       // That said, there are a few characters slightly wrong for that...
-       String text;
-       try {
-          text = new String(data, 0, length, "cp437");
-       } catch(UnsupportedEncodingException e) {
-          throw new TikaException("JVM Broken, core codepage CP437 missing!");
-       }
-       
-       // Fix up the known character issues
-       text = text.replace("\u03C6","\u00D8");
-
-       // All done, as best as we can!
-       return text;
-    }
-    
-    /**
-     * Provides a view on the previous 5 bytes
-     */
-    private static class Last5 {
-       byte[] data = new byte[5];
-       int pos = 0;
-       
-       private void record(int b) {
-          data[pos] = (byte)b;
-          pos++;
-          if(pos >= data.length) {
-             pos = 0;
-          }
-       }
-       
-       private byte[] get() {
-          byte[] ret = new byte[5];
-          for(int i=0; i<ret.length; i++) {
-             int p = pos - i;
-             if(p < 0) { p += ret.length; }
-             ret[i] = data[p];
-          }
-          return ret;
-       }
-       
-       private boolean is33() {
-          byte[] last5 = get();
-          for(byte b : last5) {
-             if(b != 0x33) return false;
-          }
-          return true;
-       }
-       
-       private boolean is00() {
-          byte[] last5 = get();
-          for(byte b : last5) {
-             if(b != 0x00) return false;
-          }
-          return true;
-       }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A basic text extracting parser for the CADKey PRT (CAD Drawing)
+ * format. It outputs text from note entries.
+ */
+
+public class PRTParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 4659638314375035178L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = 
Collections.singleton(MediaType.application("x-prt"));
+    public static final String PRT_MIME_TYPE = "application/x-prt";
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * How long do we allow a text run to claim to be, before we
+     * decide we're confused and it's not really text after all?
+     */
+    private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
+    
+    /*
+     * Text types:
+     *   00 00 00 00 f0 [3b]f sz sz TEXT     *view name*
+     *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT  *view name*
+     *   (anything)  e0 3f sz sz TEXT    *view name*
+     *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT    
*note entries* 
+     *   
+     *  Note - all text is null terminated
+     */
+
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata,
+                      ParseContext context) throws IOException, SAXException, 
TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        Last5 l5 = new Last5();
+        int read;
+
+        // Try to get the creation date, which is YYYYMMDDhhmm
+        byte[] header = new byte[30];
+        IOUtils.readFully(stream, header);
+        byte[] date = new byte[12];
+        IOUtils.readFully(stream, date);
+
+        String dateStr = new String(date, US_ASCII);
+        if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
+            String formattedDate = dateStr.substring(0, 4) + "-" + 
dateStr.substring(4, 6) +
+                    "-" + dateStr.substring(6, 8) + "T" + dateStr.substring(8, 
10) + ":" +
+                    dateStr.substring(10, 12) + ":00";
+            metadata.set(TikaCoreProperties.CREATED, formattedDate);
+            // TODO Metadata.DATE is used as modified, should it be here?
+            metadata.set(Metadata.DATE, formattedDate);
+        }
+        metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+
+        // The description, if set, is the next up-to-500 bytes
+        byte[] desc = new byte[500];
+        IOUtils.readFully(stream, desc);
+        String description = extractText(desc, true);
+        if (description.length() > 0) {
+            metadata.set(TikaCoreProperties.DESCRIPTION, description);
+        }
+
+        // Now look for text
+        while ((read = stream.read()) > -1) {
+            if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
+                int nread = stream.read();
+                if (nread == 0x3f || nread == 0xbf) {
+                    // Looks promising, check back for a suitable value
+                    if (read == 0xe3 && nread == 0x3f) {
+                        if (l5.is33()) {
+                            // Bingo, note text
+                            handleNoteText(stream, xhtml);
+                        }
+                    } else if (l5.is00()) {
+                        // Likely view name
+                        handleViewName(read, nread, stream, xhtml, l5);
+                    }
+                }
+            } else {
+                l5.record(read);
+            }
+        }
+    }
+
+    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        // Ensure we have the right padding text
+        int read;
+        for (int i = 0; i < 10; i++) {
+            read = stream.read();
+            if (read >= 0 && read <= 0x0f) {
+                // Promising
+            } else {
+                // Wrong, false detection
+                return;
+            }
+        }
+        read = stream.read();
+        if (read != 0x1f) {
+            // Wrong, false detection
+            return;
+        }
+
+        int length = EndianUtils.readUShortLE(stream);
+        if (length <= MAX_SANE_TEXT_LENGTH) {
+            // Length sanity check passed
+            handleText(length, stream, xhtml);
+        }
+    }
+
+    private void handleViewName(int typeA, int typeB, InputStream stream,
+                                XHTMLContentHandler xhtml, Last5 l5)
+            throws IOException, SAXException, TikaException {
+        // Is it 8 byte zero padded?
+        int maybeLength = EndianUtils.readUShortLE(stream);
+        if (maybeLength == 0) {
+            // Check the next 6 bytes too
+            for (int i = 0; i < 6; i++) {
+                int read = stream.read();
+                if (read >= 0 && read <= 0x0f) {
+                    // Promising
+                } else {
+                    // Wrong, false detection
+                    return;
+                }
+            }
+
+            byte[] b2 = new byte[2];
+            IOUtils.readFully(stream, b2);
+            int length = EndianUtils.getUShortLE(b2);
+            if (length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
+                // Length sanity check passed
+                handleText(length, stream, xhtml);
+            } else {
+                // Was probably something else
+                l5.record(b2[0]);
+                l5.record(b2[1]);
+            }
+        } else if (maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
+            // Looks like it's straight into the text
+            handleText(maybeLength, stream, xhtml);
+        }
+    }
+
+    private void handleText(int length, InputStream stream, 
XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        byte[] str = new byte[length];
+        IOUtils.readFully(stream, str);
+        if (str[length - 1] != 0) {
+            // Not properly null terminated, must be wrong
+            return;
+        }
+
+        String text = extractText(str, false);
+
+        xhtml.startElement("p");
+        xhtml.characters(text);
+        xhtml.endElement("p");
+    }
+
+    /**
+     * Does our best to turn the bytes into text
+     */
+    private String extractText(byte[] data, boolean trim) throws TikaException 
{
+        // The text is always stored null terminated, but sometimes
+        //  may have extra null padding too
+        int length = data.length - 1;
+        if (trim) {
+            for (int i = 0; i < data.length; i++) {
+                if (data[i] == 0) {
+                    length = i;
+                    break;
+                }
+            }
+        }
+
+        // We believe that the text is basically stored as CP437
+        // That said, there are a few characters slightly wrong for that...
+        String text;
+        try {
+            text = new String(data, 0, length, "cp437");
+        } catch (UnsupportedEncodingException e) {
+            throw new TikaException("JVM Broken, core codepage CP437 
missing!");
+        }
+
+        // Fix up the known character issues
+        text = text.replace("\u03C6", "\u00D8");
+
+        // All done, as best as we can!
+        return text;
+    }
+
+    /**
+     * Provides a view on the previous 5 bytes
+     */
+    private static class Last5 {
+        byte[] data = new byte[5];
+        int pos = 0;
+
+        private void record(int b) {
+            data[pos] = (byte) b;
+            pos++;
+            if (pos >= data.length) {
+                pos = 0;
+            }
+        }
+
+        private byte[] get() {
+            byte[] ret = new byte[5];
+            for (int i = 0; i < ret.length; i++) {
+                int p = pos - i;
+                if (p < 0) {
+                    p += ret.length;
+                }
+                ret[i] = data[p];
+            }
+            return ret;
+        }
+
+        private boolean is33() {
+            byte[] last5 = get();
+            for (byte b : last5) {
+                if (b != 0x33) return false;
+            }
+            return true;
+        }
+
+        private boolean is00() {
+            byte[] last5 = get();
+            for (byte b : last5) {
+                if (b != 0x00) return false;
+            }
+            return true;
+        }
+    }
+}

Reply via email to