Author: j16sdiz
Date: 2008-12-04 09:06:29 +0000 (Thu, 04 Dec 2008)
New Revision: 24033
Modified:
trunk/plugins/XMLSpider/XMLSpider.java
Log:
Fix bug 1714: index sites containing accented characters
Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java 2008-12-04 08:33:28 UTC (rev 24032)
+++ trunk/plugins/XMLSpider/XMLSpider.java 2008-12-04 09:06:29 UTC (rev 24033)
@@ -803,7 +803,7 @@
MessageDigest md;
md = MessageDigest.getInstance("MD5");
byte[] md5hash = new byte[32];
- md.update(text.getBytes("iso-8859-1"), 0, text.length());
+ md.update(text.getBytes("UTF-8"), 0, text.length());
md5hash = md.digest();
return convertToHex(md5hash);
}
@@ -1176,8 +1176,9 @@
else type = null;
/*
* determine the position of the word in the retrieved page
+ * FIXME - replace with a real tokenizor
*/
- String[] words = s.split("[^A-Za-z0-9]");
+ String[] words = s.split("[^\\p{L}\\{N}]");
Integer lastPosition = null;
lastPosition = (Integer)lastPositionById.get(id);
_______________________________________________
cvs mailing list
[email protected]
http://emu.freenetproject.org/cgi-bin/mailman/listinfo/cvs