[ 
https://issues.apache.org/jira/browse/OPENNLP-1267?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16866722#comment-16866722
 ] 

ASF GitHub Bot commented on OPENNLP-1267:
-----------------------------------------

tballison commented on pull request #357: OPENNLP-1267 -- add a 
ProbingLanguageDetector that can stop early.
URL: https://github.com/apache/opennlp/pull/357#discussion_r294884283
 
 

 ##########
 File path: 
opennlp-tools/src/main/java/opennlp/tools/langdetect/ProbingLanguageDetectorME.java
 ##########
 @@ -0,0 +1,412 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.Map;
+
+import opennlp.tools.util.MutableInt;
+import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
+
+/**
+ * Implements learnable Language Detector.
+ * <p>
+ * Starts at the beginning of the charsequence and runs language
+ * detection on chunks of text.  If the end of the
+ * string is reached or there are {@link #minConsecImprovements}
+ * consecutive predictions for the best language and the confidence
+ * increases over those last predictions and if the difference
+ * in confidence between the highest confidence language
+ * and the second highest confidence language is greater than {@link #minDiff},
+ * the language detector will stop and report the results.
+ * </p>
+ * <p>
+ * The authors wish to thank Ken Krugler and
+ * <a href="https://github.com/kkrugler/yalder">Yalder</a>
+ * for the inspiration for many of the design
+ * components of this detector.
+ * </p>
+ *
+ */
+public class ProbingLanguageDetectorME extends LanguageDetectorME {
+
+  /**
+   * Default chunk size (in codepoints) to take from the
+   * initial String
+   */
+  public static final int DEFAULT_CHUNK_SIZE = 300;
+
+  /**
+   * Default minimum consecutive improvements in confidence.
+   * If the best language is the same over this many consecutive
+   * probes, and if the confidence did not go down over those probes,
+   * the detector stops early.
+   */
+  public static final int DEFAULT_MIN_CONSEC_IMPROVEMENTS = 2;
+
+  /**
+   * Default minimum difference in confidence between the language with
+   * the highest confidence and the language with the second highest confidence.
+   */
+  public static final double DEFAULT_MIN_DIFF = 0.10;
+
+  /**
+   * Default absolute maximum length of the String (in codepoints) to process
+   */
+  public static final int DEFAULT_MAX_LENGTH = 10000;
+
+  private static final String SPACE = " ";
+
+  //size at which to break strings for detection (in codepoints)
+  private int chunkSize = DEFAULT_CHUNK_SIZE;
+
+  //require that the "best" language be the same
+  //and that the confidence in that language increase over
+  //this number of probes.
+  private int minConsecImprovements = DEFAULT_MIN_CONSEC_IMPROVEMENTS;
+
+  //Minimum difference in confidence between the best candidate
+  //and the second best candidate
+  private double minDiff = DEFAULT_MIN_DIFF;
+
+  /**
+   * Absolute maximum length (in codepoints) that will be processed
+   */
+  private int maxLength = DEFAULT_MAX_LENGTH;
+
+  private CharSequenceNormalizer normalizer;
+
+  /**
+   * Initializes the current instance with a language detector model. Default feature
+   * generation is used.
+   *
+   * @param model the language detector model
+   */
+  public ProbingLanguageDetectorME(LanguageDetectorModel model) {
+    super(model);
+    CharSequenceNormalizer[] normalizers = new CharSequenceNormalizer[] {
+      new EmojiCharSequenceNormalizer(),
+      new UrlCharSequenceNormalizer(),
+      new TwitterCharSequenceNormalizer(),
+      new NumberCharSequenceNormalizer(),
+      new ShrinkCharSequenceNormalizer()
+    };
+
+    this.normalizer = new AggregateCharSequenceNormalizer(normalizers);
+  }
+
+  @Override
+  public Language[] predictLanguages(CharSequence content) {
+    //list of the languages that received the highest
+    //confidence over the last n chunk detections
+    LinkedList<Language[]> predictions = new LinkedList();
 
 Review comment:
   y.  Thank you.
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> Allow the LanguageDetector to stop before processing the full string
> --------------------------------------------------------------------
>
>                 Key: OPENNLP-1267
>                 URL: https://issues.apache.org/jira/browse/OPENNLP-1267
>             Project: OpenNLP
>          Issue Type: Improvement
>            Reporter: Tim Allison
>            Priority: Major
>
> On TIKA-2790, I found that Yalder is stopping after computing character 
> ngrams on roughly the first 60 characters.  That _likely_ explains its 
> impressive speed.  Let's make this "stopping short" feature available in 
> OpenNLP.
>  
> Ideally, the language detector wouldn't copy the full String, it wouldn't 
> normalize the full String, and it wouldn't compute ngrams on the full String.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to