Revision: 7229
http://languagetool.svn.sourceforge.net/languagetool/?rev=7229&view=rev
Author: milek_pl
Date: 2012-06-04 09:32:38 +0000 (Mon, 04 Jun 2012)
Log Message:
-----------
Javadoc change for HunspellRule, and some internal charset encoding code (in a
comment) for further testing on UTF-8 dictionaries
Modified Paths:
--------------
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/hunspell/Hunspell.java
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/hunspell/HunspellRule.java
Modified:
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/hunspell/Hunspell.java
===================================================================
---
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/hunspell/Hunspell.java
2012-06-03 22:08:18 UTC (rev 7228)
+++
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/hunspell/Hunspell.java
2012-06-04 09:32:38 UTC (rev 7229)
@@ -6,10 +6,18 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
+import morfologik.util.BufferUtils;
+
import com.sun.jna.Native;
import com.sun.jna.Pointer;
import com.sun.jna.ptr.PointerByReference;
@@ -245,7 +253,19 @@
private String encoding;
+ private final CharsetEncoder encoder;
+
/**
+ * Charset decoder for hunspell.
+ */
+ private final CharsetDecoder decoder;
+
+
+ ByteBuffer bytes = ByteBuffer.allocate(0);
+
+ CharBuffer charBuffer = CharBuffer.allocate(0);
+
+ /**
* Creates an instance of the dictionary.
* @param baseFileName the base name of the dictionary,
*/
@@ -262,6 +282,13 @@
hunspellDict = hsl.Hunspell_create(aff.toString(),
dic.toString());
encoding = hsl.Hunspell_get_dic_encoding(hunspellDict);
+
+ Charset charset = Charset.forName(encoding);
+
+ encoder = charset.newEncoder();
+ decoder = charset.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
// This will blow up if the encoding doesn't exist
stringToBytes("test");
@@ -284,7 +311,16 @@
*/
public boolean misspelled(String word) {
try {
- return hsl.Hunspell_spell(hunspellDict,
stringToBytes(word)) == 0;
+ if (hsl.Hunspell_spell(hunspellDict,
stringToBytes(word)) == 0) {
+ byte[] arr = stringToBytes(word);
+ for (int i = 0; i < arr.length; i++) {
+ System.err.print(arr[i]);
+ System.err.print(' ');
+ } System.err.println();
+
+ return true;
+ }
+ return false;
} catch (UnsupportedEncodingException e) {
return true; // this should probably never
happen.
}
@@ -296,7 +332,40 @@
*/
protected byte[] stringToBytes(String str)
throws UnsupportedEncodingException {
- return (str+"\u0000").getBytes(encoding);
+ return (str+"\u0000").getBytes(encoding);
+
+ /*
+ bytes.clear();
+ charBuffer.clear();
+
+ if ("UTF-8".equals(encoding)) {
+ charBuffer = BufferUtils.ensureCapacity(charBuffer,
str.length() + 4);
+ charBuffer.put((char)0xEF);
+ charBuffer.put((char)0xBB);
+ charBuffer.put((char)0xBF);
+
+ } else {
+ charBuffer = BufferUtils.ensureCapacity(charBuffer,
str.length() + 1);
+ }
+ for (int i = 0; i < str.length(); i++) {
+ char chr = str.charAt(i);
+ charBuffer.put(chr);
+ }
+ charBuffer.put('\u0000');
+ charBuffer.flip();
+ final int maxCapacity = (int) (charBuffer.remaining() * encoder
+ .maxBytesPerChar());
+ if (bytes.capacity() <= maxCapacity) {
+ bytes = ByteBuffer.allocate(maxCapacity);
+ }
+
+ charBuffer.mark();
+ encoder.reset();
+ encoder.encode(charBuffer, bytes, true);
+ bytes.flip();
+ charBuffer.reset();
+ return bytes.array();
+ */
}
/**
Modified:
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/hunspell/HunspellRule.java
===================================================================
---
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/hunspell/HunspellRule.java
2012-06-03 22:08:18 UTC (rev 7228)
+++
trunk/JLanguageTool/src/java/org/languagetool/rules/spelling/hunspell/HunspellRule.java
2012-06-04 09:32:38 UTC (rev 7229)
@@ -41,6 +41,9 @@
/**
* A hunspell-based spellchecking-rule.
*
+ * The default dictionary is set to the first country variant on the list - so
the order
+ in the Language class declaration is important!
+ *
* @author Marcin Miłkowski
*
*/
@@ -56,11 +59,6 @@
super(messages, language);
super.setCategory(new
Category(messages.getString("category_typo")));
- // TODO: currently, the default dictionary is now
- // set to the first country variant on the list - so the order
- // in the Language class declaration is important!
- // we might support country variants in the near future
-
final String langCountry = language.getShortName()
+ "_"
+ language.getCountryVariants()[0];
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and
threat landscape has changed and how IT managers can respond. Discussions
will include endpoint security, mobile security and the latest in malware
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs