I found a bug in the LangDetect implementation of language detection, where the maxTotalChars property isn't doing what it's description says it does: Solr uses the append() method solely in the LangDetect library, which checks the string length of the text to be appended and not its entire contents [1].
I've got a patch (attached) that solves this issue and hoists out a few of the utility methods in the Tika implementation and reuses them in the LangDetect one, but I stumbled upon SOLR-3881 [2], where the methods (concatFields and getExpectedSize specifically) were taken out of the parent class for reasons that are sort of unclear from the comments. Could I get some historical context on the issue and feedback on my patch? Thanks [1] https://github.com/shuyo/language-detection/blob/master/src/com/cybozu/labs/langdetect/Detector.java#L170 [2] https://issues.apache.org/jira/browse/SOLR-3881
diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java index b1c3cba..f100ca8 100644 --- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java @@ -47,39 +47,21 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif @Override protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) { try { - Detector detector = DetectorFactory.create(); - detector.setMaxTextLength(maxTotalChars); - - for (String fieldName : inputFields) { - log.debug("Appending field " + fieldName); - if (doc.containsKey(fieldName)) { - Collection<Object> fieldValues = doc.getFieldValues(fieldName); - if (fieldValues != null) { - for (Object content : fieldValues) { - if (content instanceof String) { - String stringContent = (String) content; - if (stringContent.length() > maxFieldValueChars) { - detector.append(stringContent.substring(0, maxFieldValueChars)); - } else { - detector.append(stringContent); - } - detector.append(" "); - } else { - log.warn("Field " + fieldName + " not a String value, not including in detection"); - } - } - } + String content = concatFields(doc); + if (content.length() != 0) { + Detector detector = DetectorFactory.create(); + detector.append(content); + ArrayList<Language> langlist = detector.getProbabilities(); + ArrayList<DetectedLanguage> solrLangList = new ArrayList<>(); + for (Language l: langlist) { + solrLangList.add(new DetectedLanguage(l.lang, l.prob)); } + return solrLangList; } - ArrayList<Language> langlist = detector.getProbabilities(); - ArrayList<DetectedLanguage> solrLangList = new ArrayList<>(); - for (Language l: langlist) { - solrLangList.add(new DetectedLanguage(l.lang, l.prob)); - } - return solrLangList; } catch (LangDetectException e) { log.debug("Could not determine language, returning empty list: ", e); return Collections.emptyList(); } + return Collections.emptyList(); } } diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java index 6b85c9b..05039cc 100644 --- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java @@ -276,6 +276,58 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro } /** + * Calculate expected string size of a Solr document with a list of fields to use + * @param doc solr input document + * @return expected size of string value + */ + private int getExpectedSize(SolrInputDocument doc) { + int docSize = 0; + for (String field : inputFields) { + Collection<Object> contents = doc.getFieldValues(field); + for (Object content : contents) { + if (content instanceof String) { + docSize += Math.min(((String) content).length(), maxFieldValueChars); + } + } + docSize = Math.min(docSize, maxTotalChars); + } + return docSize; + } + + /** + * Concatenates content from multiple fields, respecting maximum field length setting + */ + protected String concatFields(SolrInputDocument doc) { + StringBuilder sb = new StringBuilder(getExpectedSize(doc)); + for (String fieldName : inputFields) { + log.debug("Appending field " + fieldName); + if (doc.containsKey(fieldName)) { + Collection<Object> fieldValues = doc.getFieldValues(fieldName); + if (fieldValues != null) { + for (Object content : fieldValues) { + if (content instanceof String) { + String stringContent = (String) content; + if (stringContent.length() > maxFieldValueChars) { + sb.append(stringContent.substring(0, maxFieldValueChars)); + } else { + sb.append(stringContent); + } + sb.append(" "); + if (sb.length() > maxTotalChars) { + sb.setLength(maxTotalChars); + break; + } + } else { + log.warn("Field " + fieldName + " not a String value, not including in detection"); + } + } + } + } + } + return sb.toString(); + } + + /** * Decides the fallback language, either from content of fallback field or fallback value * @param doc the Solr document * @param fallbackFields an array of strings with field names containing fallback language codes diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java index 5d12174..229e08d 100644 --- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java @@ -61,59 +61,4 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd } return languages; } - - - /** - * Concatenates content from multiple fields - */ - protected String concatFields(SolrInputDocument doc) { - StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields)); - for (String fieldName : inputFields) { - log.debug("Appending field " + fieldName); - if (doc.containsKey(fieldName)) { - Collection<Object> fieldValues = doc.getFieldValues(fieldName); - if (fieldValues != null) { - for (Object content : fieldValues) { - if (content instanceof String) { - String stringContent = (String) content; - if (stringContent.length() > maxFieldValueChars) { - sb.append(stringContent.substring(0, maxFieldValueChars)); - } else { - sb.append(stringContent); -} - sb.append(" "); - if (sb.length() > maxTotalChars) { - sb.setLength(maxTotalChars); - break; - } - } else { - log.warn("Field " + fieldName + " not a String value, not including in detection"); - } - } - } - } - } - return sb.toString(); - } - - /** - * Calculate expected string size. - * - * @param doc solr input document - * @param fields fields to select - * @return expected size of string value - */ - private int getExpectedSize(SolrInputDocument doc, String[] fields) { - int docSize = 0; - for (String field : fields) { - Collection<Object> contents = doc.getFieldValues(field); - for (Object content : contents) { - if (content instanceof String) { - docSize += Math.min(((String) content).length(), maxFieldValueChars); - } - } - docSize = Math.min(docSize, maxTotalChars); - } - return docSize; - } }
--------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org