Author: grossws
Date: Wed Sep 30 16:38:24 2015
New Revision: 1706073
URL: http://svn.apache.org/viewvc?rev=1706073&view=rev
Log:
Reformat to use spaces instead of tabs, and use java.util.logging (JUL) for logging instead of printStackTrace()
Related to #TIKA-1752
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java?rev=1706073&r1=1706072&r2=1706073&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
Wed Sep 30 16:38:24 2015
@@ -24,136 +24,137 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.file.Path;
+import java.util.Objects;
+import java.util.logging.Level;
+import java.util.logging.Logger;
import org.apache.tika.mime.MediaType;
import static java.nio.charset.StandardCharsets.UTF_8;
public class NNExampleModelDetector extends TrainedModelDetector {
- private static final String EXAMPLE_NNMODEL_FILE =
"tika-example.nnmodel";
+ private static final String EXAMPLE_NNMODEL_FILE = "tika-example.nnmodel";
- private static final long serialVersionUID = 1L;
+ private static final long serialVersionUID = 1L;
- public NNExampleModelDetector() {
- super();
- }
-
- public NNExampleModelDetector(final Path modelFile) {
- loadDefaultModels(modelFile);
- }
-
- public NNExampleModelDetector(final File modelFile) {
- loadDefaultModels(modelFile);
- }
-
- @Override
- public void loadDefaultModels(InputStream modelStream) {
- BufferedReader bReader =
- new BufferedReader(new InputStreamReader(modelStream, UTF_8));
-
- NNTrainedModelBuilder nnBuilder = new NNTrainedModelBuilder();
- String line;
- try {
- while ((line = bReader.readLine()) != null) {
- line = line.trim();
- if (line.startsWith("#")) {
- readDescription(nnBuilder, line);
- } else {
- readNNParams(nnBuilder, line);
- // add this model into map of trained
models.
-
super.registerModels(nnBuilder.getType(), nnBuilder.build());
- }
-
- }
- } catch (IOException e) {
- throw new RuntimeException(
- "Unable to read the default media type
registry", e);
- }
- }
-
- /**
- * this method gets overwritten to register load neural network models
- */
- @Override
- public void loadDefaultModels(ClassLoader classLoader) {
- if (classLoader == null) {
- classLoader =
TrainedModelDetector.class.getClassLoader();
- }
-
- // This allows us to replicate class.getResource() when using
- // the classloader directly
- String classPrefix =
TrainedModelDetector.class.getPackage().getName()
- .replace('.', '/')
- + "/";
-
- // Get the core URL, and all the extensions URLs
- URL modelURL = classLoader.getResource(classPrefix
- + EXAMPLE_NNMODEL_FILE);
- try (InputStream stream = modelURL.openStream()) {
- loadDefaultModels(stream);
- } catch (IOException e) {
- throw new RuntimeException(
- "Unable to read the default media type
registry", e);
- }
-
- }
-
- /**
- * read the comments where the model configuration is written, e.g the
- * number of inputs, hiddens and output please ensure the first char in
the
- * given string is # In this example grb model file, there are 4
elements 1)
- * type 2) number of input units 3) number of hidden units. 4) number of
- * output units.
- *
- */
- private void readDescription(final NNTrainedModelBuilder builder,
- final String line) {
- int numInputs;
- int numHidden;
- int numOutputs;
- String[] sarr = line.split("\t");
-
- try {
- MediaType type = MediaType.parse(sarr[1]);
- numInputs = Integer.parseInt(sarr[2]);
- numHidden = Integer.parseInt(sarr[3]);
- numOutputs = Integer.parseInt(sarr[4]);
- builder.setNumOfInputs(numInputs);
- builder.setNumOfHidden(numHidden);
- builder.setNumOfOutputs(numOutputs);
- builder.setType(type);
- } catch (Exception e) {
- e.printStackTrace();
- throw new RuntimeException(
- "Unable to parse the model
configuration", e);
- }
- }
-
- /**
- * Read the next line for the model parameters and populate the build
which
- * later will be used to instantiate the instance of TrainedModel
- *
- * @param builder
- * @param line
- */
- private void readNNParams(final NNTrainedModelBuilder builder,
- final String line) {
- String[] sarr = line.split("\t");
- int n = sarr.length;
- float[] params = new float[n];
- try {
- int i = 0;
- for (String fstr : sarr) {
- params[i] = Float.parseFloat(fstr);
- i++;
- }
- builder.setParams(params);
- } catch (Exception e) {
- e.printStackTrace();
- throw new RuntimeException(
- "Unable to parse the model
configuration", e);
- }
-
- }
+ private static final Logger log =
Logger.getLogger(NNExampleModelDetector.class.getName());
+ public NNExampleModelDetector() {
+ super();
+ }
+
+ public NNExampleModelDetector(final Path modelFile) {
+ loadDefaultModels(modelFile);
+ }
+
+ public NNExampleModelDetector(final File modelFile) {
+ loadDefaultModels(modelFile);
+ }
+
+ @Override
+ public void loadDefaultModels(InputStream modelStream) {
+ BufferedReader bReader = new BufferedReader(new
InputStreamReader(modelStream, UTF_8));
+
+ NNTrainedModelBuilder nnBuilder = new NNTrainedModelBuilder();
+ String line;
+ try {
+ while ((line = bReader.readLine()) != null) {
+ line = line.trim();
+ if (line.startsWith("#")) {
+ readDescription(nnBuilder, line);
+ } else {
+ readNNParams(nnBuilder, line);
+ // add this model into map of trained models.
+ super.registerModels(nnBuilder.getType(),
nnBuilder.build());
+ }
+
+ }
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to read the default media type
registry", e);
+ }
+ }
+
+ /**
+ * this method gets overwritten to register load neural network models
+ */
+ @Override
+ public void loadDefaultModels(ClassLoader classLoader) {
+ if (classLoader == null) {
+ classLoader = TrainedModelDetector.class.getClassLoader();
+ }
+
+ // This allows us to replicate class.getResource() when using
+ // the classloader directly
+ String classPrefix = TrainedModelDetector.class.getPackage().getName()
+ .replace('.', '/')
+ + "/";
+
+ // Get the core URL, and all the extensions URLs
+ URL modelURL = classLoader.getResource(classPrefix +
EXAMPLE_NNMODEL_FILE);
+ Objects.requireNonNull(modelURL, "required resource " + classPrefix +
EXAMPLE_NNMODEL_FILE + " not found");
+ try (InputStream stream = modelURL.openStream()) {
+ loadDefaultModels(stream);
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to read the default media type
registry", e);
+ }
+
+ }
+
+ /**
+ * read the comments where the model configuration is written, e.g the
+ * number of inputs, hiddens and output please ensure the first char in the
+ * given string is # In this example grb model file, there are 4 elements
1)
+ * type 2) number of input units 3) number of hidden units. 4) number of
+ * output units.
+ */
+ private void readDescription(final NNTrainedModelBuilder builder,
+ final String line) {
+ int numInputs;
+ int numHidden;
+ int numOutputs;
+ String[] sarr = line.split("\t");
+
+ try {
+ MediaType type = MediaType.parse(sarr[1]);
+ numInputs = Integer.parseInt(sarr[2]);
+ numHidden = Integer.parseInt(sarr[3]);
+ numOutputs = Integer.parseInt(sarr[4]);
+ builder.setNumOfInputs(numInputs);
+ builder.setNumOfHidden(numHidden);
+ builder.setNumOfOutputs(numOutputs);
+ builder.setType(type);
+ } catch (Exception e) {
+ if (log.isLoggable(Level.WARNING)) {
+ log.log(Level.WARNING, "Unable to parse the model
configuration", e);
+ }
+ throw new RuntimeException("Unable to parse the model
configuration", e);
+ }
+ }
+
+ /**
+ * Read the next line for the model parameters and populate the build which
+ * later will be used to instantiate the instance of TrainedModel
+ *
+ * @param builder
+ * @param line
+ */
+ private void readNNParams(final NNTrainedModelBuilder builder,
+ final String line) {
+ String[] sarr = line.split("\t");
+ int n = sarr.length;
+ float[] params = new float[n];
+ try {
+ int i = 0;
+ for (String fstr : sarr) {
+ params[i] = Float.parseFloat(fstr);
+ i++;
+ }
+ builder.setParams(params);
+ } catch (Exception e) {
+ if (log.isLoggable(Level.WARNING)) {
+ log.log(Level.WARNING, "Unable to parse the model
configuration", e);
+ }
+ throw new RuntimeException("Unable to parse the model
configuration", e);
+ }
+ }
}
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java?rev=1706073&r1=1706072&r2=1706073&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
Wed Sep 30 16:38:24 2015
@@ -14,6 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.tika.detect;
import java.io.File;
@@ -29,150 +30,147 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
+import org.apache.tika.io.TemporaryResources;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.io.TemporaryResources;
import static java.nio.charset.StandardCharsets.UTF_8;
public abstract class TrainedModelDetector implements Detector {
- private final Map<MediaType, TrainedModel> MODEL_MAP = new
HashMap<MediaType, TrainedModel>();
-
- private static final long serialVersionUID = 1L;
+ private final Map<MediaType, TrainedModel> MODEL_MAP = new HashMap<>();
- public TrainedModelDetector() {
- loadDefaultModels(getClass().getClassLoader());
- }
-
- public int getMinLength() {
- return Integer.MAX_VALUE;
- }
-
- public MediaType detect(InputStream input, Metadata metadata)
- throws IOException {
- // convert to byte-histogram
- if (input != null) {
- input.mark(getMinLength());
- float[] histogram = readByteFrequencies(input);
- // writeHisto(histogram); //on testing purpose
- /*
- * iterate the map to find out the one that gives the
higher
- * prediction value.
- */
- Iterator<MediaType> iter =
MODEL_MAP.keySet().iterator();
- float threshold = 0.5f;// probability threshold, any
value below the
- //
threshold will be considered as
- //
MediaType.OCTET_STREAM
- float maxprob = threshold;
- MediaType maxType = MediaType.OCTET_STREAM;
- while (iter.hasNext()) {
- MediaType key = iter.next();
- TrainedModel model = MODEL_MAP.get(key);
- float prob = model.predict(histogram);
- if (maxprob < prob) {
- maxprob = prob;
- maxType = key;
- }
- }
- input.reset();
- return maxType;
- }
- return null;
- }
-
- /**
- * read the inputstream and build a byte frequence histogram
- *
- * @param input
- * @return
- * @throws IOException
- */
- protected float[] readByteFrequencies(final InputStream input)
- throws IOException {
-
- ReadableByteChannel inputChannel;
- try {
- inputChannel = Channels.newChannel(input);
- // long inSize = inputChannel.size();
- float histogram[] = new float[257];
- histogram[0] = 1;
-
- // create buffer with capacity of maxBufSize bytes
- ByteBuffer buf = ByteBuffer.allocate(1024 * 5);
- int bytesRead = inputChannel.read(buf); // read into
buffer.
-
- float max = -1;
- while (bytesRead != -1) {
-
- buf.flip(); // make buffer ready for read
-
- while (buf.hasRemaining()) {
- byte byt = buf.get();
- int idx = byt;
- idx++;
- if (byt < 0) {
- idx = 256 + idx;
- histogram[idx]++;
- } else {
- histogram[idx]++;
- }
- max = max < histogram[idx] ?
histogram[idx] : max;
- }
-
- buf.clear(); // make buffer ready for writing
- bytesRead = inputChannel.read(buf);
- }
-
- int i;
- for (i = 1; i < histogram.length; i++) {
- histogram[i] /= max;
- histogram[i] = (float) Math.sqrt(histogram[i]);
- }
-
- return histogram;
- } finally {
- // inputChannel.close();
- }
-
- }
-
- /**
- * for testing purposes; this method write the histogram vector to a
file.
- *
- * @param histogram
- * @throws IOException
- */
- private void writeHisto(final float[] histogram)
- throws IOException {
- Path histPath = new TemporaryResources().createTempFile();
- try (Writer writer = Files.newBufferedWriter(histPath, UTF_8)) {
- for (float bin : histogram) {
- writer.write(String.valueOf(bin) + "\t");
- // writer.write(i + "\t");
- }
- writer.write("\r\n");
- }
- }
-
- public void loadDefaultModels(Path modelFile) {
- try (InputStream in = Files.newInputStream(modelFile)) {
- loadDefaultModels(in);
- } catch (IOException e) {
- throw new RuntimeException(
- "Unable to read the default media type
registry", e);
- }
- }
-
- public void loadDefaultModels(File modelFile) {
- loadDefaultModels(modelFile.toPath());
- }
-
- public abstract void loadDefaultModels(final InputStream modelStream);
-
- public abstract void loadDefaultModels(final ClassLoader classLoader);
-
- protected void registerModels(MediaType type, TrainedModel model) {
- MODEL_MAP.put(type, model);
- }
+ private static final long serialVersionUID = 1L;
+ public TrainedModelDetector() {
+ loadDefaultModels(getClass().getClassLoader());
+ }
+
+ public int getMinLength() {
+ return Integer.MAX_VALUE;
+ }
+
+ public MediaType detect(InputStream input, Metadata metadata)
+ throws IOException {
+ // convert to byte-histogram
+ if (input != null) {
+ input.mark(getMinLength());
+ float[] histogram = readByteFrequencies(input);
+ // writeHisto(histogram); //on testing purpose
+ /*
+ * iterate the map to find out the one that gives the higher
+ * prediction value.
+ */
+ Iterator<MediaType> iter = MODEL_MAP.keySet().iterator();
+ float threshold = 0.5f;// probability threshold, any value below
the
+ // threshold will be considered as
+ // MediaType.OCTET_STREAM
+ float maxprob = threshold;
+ MediaType maxType = MediaType.OCTET_STREAM;
+ while (iter.hasNext()) {
+ MediaType key = iter.next();
+ TrainedModel model = MODEL_MAP.get(key);
+ float prob = model.predict(histogram);
+ if (maxprob < prob) {
+ maxprob = prob;
+ maxType = key;
+ }
+ }
+ input.reset();
+ return maxType;
+ }
+ return null;
+ }
+
+ /**
+ * Read the {@code inputstream} and build a byte frequency histogram
+ *
+ * @param input stream to read from
+ * @return byte frequencies array
+ * @throws IOException
+ */
+ protected float[] readByteFrequencies(final InputStream input)
+ throws IOException {
+ ReadableByteChannel inputChannel;
+ // TODO: any reason to avoid closing of input & inputChannel?
+ try {
+ inputChannel = Channels.newChannel(input);
+ // long inSize = inputChannel.size();
+ float histogram[] = new float[257];
+ histogram[0] = 1;
+
+ // create buffer with capacity of maxBufSize bytes
+ ByteBuffer buf = ByteBuffer.allocate(1024 * 5);
+ int bytesRead = inputChannel.read(buf); // read into buffer.
+
+ float max = -1;
+ while (bytesRead != -1) {
+
+ buf.flip(); // make buffer ready for read
+
+ while (buf.hasRemaining()) {
+ byte byt = buf.get();
+ int idx = byt;
+ idx++;
+ if (byt < 0) {
+ idx = 256 + idx;
+ histogram[idx]++;
+ } else {
+ histogram[idx]++;
+ }
+ max = max < histogram[idx] ? histogram[idx] : max;
+ }
+
+ buf.clear(); // make buffer ready for writing
+ bytesRead = inputChannel.read(buf);
+ }
+
+ int i;
+ for (i = 1; i < histogram.length; i++) {
+ histogram[i] /= max;
+ histogram[i] = (float) Math.sqrt(histogram[i]);
+ }
+
+ return histogram;
+ } finally {
+ // inputChannel.close();
+ }
+ }
+
+ /**
+ * for testing purposes; this method write the histogram vector to a file.
+ *
+ * @param histogram
+ * @throws IOException
+ */
+ private void writeHisto(final float[] histogram)
+ throws IOException {
+ Path histPath = new TemporaryResources().createTempFile();
+ try (Writer writer = Files.newBufferedWriter(histPath, UTF_8)) {
+ for (float bin : histogram) {
+ writer.write(String.valueOf(bin) + "\t");
+ // writer.write(i + "\t");
+ }
+ writer.write("\r\n");
+ }
+ }
+
+ public void loadDefaultModels(Path modelFile) {
+ try (InputStream in = Files.newInputStream(modelFile)) {
+ loadDefaultModels(in);
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to read the default media type
registry", e);
+ }
+ }
+
+ public void loadDefaultModels(File modelFile) {
+ loadDefaultModels(modelFile.toPath());
+ }
+
+ public abstract void loadDefaultModels(final InputStream modelStream);
+
+ public abstract void loadDefaultModels(final ClassLoader classLoader);
+
+ protected void registerModels(MediaType type, TrainedModel model) {
+ MODEL_MAP.put(type, model);
+ }
}