Author: j16sdiz
Date: 2008-12-04 09:06:29 +0000 (Thu, 04 Dec 2008)
New Revision: 24033

Modified:
   trunk/plugins/XMLSpider/XMLSpider.java
Log:
Solve bug 1714: index sites with accented characters


Modified: trunk/plugins/XMLSpider/XMLSpider.java
===================================================================
--- trunk/plugins/XMLSpider/XMLSpider.java      2008-12-04 08:33:28 UTC (rev 24032)
+++ trunk/plugins/XMLSpider/XMLSpider.java      2008-12-04 09:06:29 UTC (rev 24033)
@@ -803,7 +803,7 @@
                MessageDigest md;
                md = MessageDigest.getInstance("MD5");
                byte[] md5hash = new byte[32];
-               md.update(text.getBytes("iso-8859-1"), 0, text.length());
+               md.update(text.getBytes("UTF-8"), 0, text.length());
                md5hash = md.digest();
                return convertToHex(md5hash);
        }
@@ -1176,8 +1176,9 @@
                        else type = null;
                        /*
                         * determine the position of the word in the retrieved page
+                        * FIXME - replace with a real tokenizer
                         */
-                       String[] words = s.split("[^A-Za-z0-9]");
+                       String[] words = s.split("[^\\p{L}\\{N}]");
                        Integer lastPosition = null;
                        lastPosition = (Integer)lastPositionById.get(id);
 

_______________________________________________
cvs mailing list
[email protected]
http://emu.freenetproject.org/cgi-bin/mailman/listinfo/cvs

Reply via email to