I found a bug in the LangDetect implementation of language detection, where the
maxTotalChars property isn't doing what it's description says it does: Solr uses
the append() method solely in the LangDetect library, which checks the string
length of the text to be appended and not its entire contents [1].

I've got a patch (attached) that solves this issue and hoists out a few of the
utility methods in the Tika implementation and reuses them in the LangDetect
one, but I stumbled upon SOLR-3881 [2], where the methods (concatFields and
getExpectedSize specifically) were taken out of the parent class for reasons
that are sort of unclear from the comments.

Could I get some historical context on the issue and feedback on my patch?
Thanks

[1] 
https://github.com/shuyo/language-detection/blob/master/src/com/cybozu/labs/langdetect/Detector.java#L170
[2] https://issues.apache.org/jira/browse/SOLR-3881
diff --git 
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
 
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
index b1c3cba..f100ca8 100644
--- 
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
+++ 
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
@@ -47,39 +47,21 @@ public class LangDetectLanguageIdentifierUpdateProcessor 
extends LanguageIdentif
   @Override
   protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
     try {
-      Detector detector = DetectorFactory.create();
-      detector.setMaxTextLength(maxTotalChars);
-
-      for (String fieldName : inputFields) {
-        log.debug("Appending field " + fieldName);
-        if (doc.containsKey(fieldName)) {
-          Collection<Object> fieldValues = doc.getFieldValues(fieldName);
-          if (fieldValues != null) {
-            for (Object content : fieldValues) {
-              if (content instanceof String) {
-                String stringContent = (String) content;
-                if (stringContent.length() > maxFieldValueChars) {
-                  detector.append(stringContent.substring(0, 
maxFieldValueChars));
-                } else {
-                  detector.append(stringContent);
-                }
-                detector.append(" ");
-              } else {
-                log.warn("Field " + fieldName + " not a String value, not 
including in detection");
-              }
-            }
-          }
+      String content = concatFields(doc);
+      if (content.length() != 0) {
+        Detector detector = DetectorFactory.create();
+        detector.append(content);
+        ArrayList<Language> langlist = detector.getProbabilities();
+        ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
+        for (Language l: langlist) {
+          solrLangList.add(new DetectedLanguage(l.lang, l.prob));
         }
+        return solrLangList;
       }
-      ArrayList<Language> langlist = detector.getProbabilities();
-      ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
-      for (Language l: langlist) {
-        solrLangList.add(new DetectedLanguage(l.lang, l.prob));
-      }
-      return solrLangList;
     } catch (LangDetectException e) {
       log.debug("Could not determine language, returning empty list: ", e);
       return Collections.emptyList();
     }
+    return Collections.emptyList();
   }
 }
diff --git 
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
 
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
index 6b85c9b..05039cc 100644
--- 
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
+++ 
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
@@ -276,6 +276,58 @@ public abstract class LanguageIdentifierUpdateProcessor 
extends UpdateRequestPro
   }
 
   /**
+   * Calculate expected string size of a Solr document with a list of fields 
to use
+   * @param doc           solr input document
+   * @return expected size of string value
+   */
+  private int getExpectedSize(SolrInputDocument doc) {
+    int docSize = 0;
+    for (String field : inputFields) {
+      Collection<Object> contents = doc.getFieldValues(field);
+      for (Object content : contents) {
+        if (content instanceof String) {
+          docSize += Math.min(((String) content).length(), maxFieldValueChars);
+        }
+      }
+      docSize = Math.min(docSize, maxTotalChars);
+    }
+    return docSize;
+  }
+
+  /**
+   * Concatenates content from multiple fields, respecting maximum field 
length setting
+   */
+  protected String concatFields(SolrInputDocument doc) {
+    StringBuilder sb = new StringBuilder(getExpectedSize(doc));
+    for (String fieldName : inputFields) {
+      log.debug("Appending field " + fieldName);
+      if (doc.containsKey(fieldName)) {
+        Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+        if (fieldValues != null) {
+          for (Object content : fieldValues) {
+            if (content instanceof String) {
+              String stringContent = (String) content;
+              if (stringContent.length() > maxFieldValueChars) {
+                sb.append(stringContent.substring(0, maxFieldValueChars));
+              } else {
+                sb.append(stringContent);
+              }
+              sb.append(" ");
+              if (sb.length() > maxTotalChars) {
+                sb.setLength(maxTotalChars);
+                break;
+              }
+            } else {
+              log.warn("Field " + fieldName + " not a String value, not 
including in detection");
+            }
+          }
+        }
+      }
+    }
+    return sb.toString();
+  }
+    
+  /**
    * Decides the fallback language, either from content of fallback field or 
fallback value
    * @param doc the Solr document
    * @param fallbackFields an array of strings with field names containing 
fallback language codes
diff --git 
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
 
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
index 5d12174..229e08d 100644
--- 
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
+++ 
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
@@ -61,59 +61,4 @@ public class TikaLanguageIdentifierUpdateProcessor extends 
LanguageIdentifierUpd
     }
     return languages;
   }
-
-
-  /**
-   * Concatenates content from multiple fields
-   */
-  protected String concatFields(SolrInputDocument doc) {
-    StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
-    for (String fieldName : inputFields) {
-      log.debug("Appending field " + fieldName);
-      if (doc.containsKey(fieldName)) {
-        Collection<Object> fieldValues = doc.getFieldValues(fieldName);
-        if (fieldValues != null) {
-          for (Object content : fieldValues) {
-            if (content instanceof String) {
-              String stringContent = (String) content;
-              if (stringContent.length() > maxFieldValueChars) {
-                sb.append(stringContent.substring(0, maxFieldValueChars));
-              } else {
-                sb.append(stringContent);
-}
-              sb.append(" ");
-              if (sb.length() > maxTotalChars) {
-                sb.setLength(maxTotalChars);
-                break;
-              }
-            } else {
-              log.warn("Field " + fieldName + " not a String value, not 
including in detection");
-            }
-          }
-        }
-      }
-    }
-    return sb.toString();
-  }
-
-  /**
-   * Calculate expected string size.
-   *
-   * @param doc           solr input document
-   * @param fields        fields to select
-   * @return expected size of string value
-   */
-  private int getExpectedSize(SolrInputDocument doc, String[] fields) {
-    int docSize = 0;
-    for (String field : fields) {
-      Collection<Object> contents = doc.getFieldValues(field);
-      for (Object content : contents) {
-        if (content instanceof String) {
-          docSize += Math.min(((String) content).length(), maxFieldValueChars);
-        }
-      }
-      docSize = Math.min(docSize, maxTotalChars);
-    }
-    return docSize;
-  }
 }

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org
For additional commands, e-mail: dev-h...@lucene.apache.org

Reply via email to