This is an automated email from the ASF dual-hosted git repository. dweiss pushed a commit to branch jira/solr-13105-toMerge in repository https://gitbox.apache.org/repos/asf/solr.git
commit 467b6772d10c673d6bb05ed97c39700fe1c74866 Author: Peter Gromov <[email protected]> AuthorDate: Fri Jan 15 09:35:25 2021 +0100 LUCENE-9665: Hunspell: support default encoding (#2203, Peter Gromov via Dawid Weiss) --- lucene/CHANGES.txt | 2 ++ .../src/java/org/apache/lucene/analysis/hunspell/Dictionary.java | 8 ++++---- .../test/org/apache/lucene/analysis/hunspell/TestDictionary.java | 3 +++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 84240fb..8702d5f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -84,6 +84,8 @@ API Changes Improvements +* LUCENE-9665: Hunspell: support default encoding (Peter Gromov) + * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). (Dawid Weiss) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 2a6017f..a4b2f6c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -98,6 +98,7 @@ public class Dictionary { // TODO: really for suffixes we should reverse the automaton and run them backwards private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; + static final String DEFAULT_ENCODING = StandardCharsets.ISO_8859_1.name(); FST<IntsRef> prefixes; FST<IntsRef> suffixes; @@ -642,10 +643,8 @@ public class Dictionary { * @param affix InputStream for reading the affix file * @return Encoding specified in the affix file * @throws IOException Can be thrown while reading from the InputStream - * @throws ParseException Thrown if the first non-empty non-comment line read from the file does - * not adhere to the format {@code SET <encoding>} */ - static String getDictionaryEncoding(InputStream affix) throws IOException, ParseException { + static String getDictionaryEncoding(InputStream affix) throws IOException { final StringBuilder encoding = new StringBuilder(); for (; ; ) { encoding.setLength(0); @@ -664,7 +663,7 @@ public class Dictionary { // this test only at the end as ineffective but would allow lines only containing spaces: encoding.toString().trim().length() == 0) { if (ch < 0) { - throw new ParseException("Unexpected end of affix file.", 0); + return DEFAULT_ENCODING; } continue; } @@ -673,6 +672,7 @@ public class Dictionary { int last = matcher.end(); return encoding.substring(last).trim(); } + return DEFAULT_ENCODING; } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index 89d607f..34852cf 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -267,6 +267,9 @@ public class TestDictionary extends LuceneTestCase { "UTF-8", Dictionary.getDictionaryEncoding( new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8)))); + assertEquals( + Dictionary.DEFAULT_ENCODING, + Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0]))); } public void testFlagWithCrazyWhitespace() throws Exception {
