http://www.mediawiki.org/wiki/Special:Code/MediaWiki/65466

Revision: 65466
Author:   daniel
Date:     2010-04-23 16:03:12 +0000 (Fri, 23 Apr 2010)

Log Message:
-----------
NameHashTrial

Modified Paths:
--------------
    trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties
    
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java

Added Paths:
-----------
    
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java

Modified: trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties
===================================================================
--- trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties      2010-04-23 
15:16:24 UTC (rev 65465)
+++ trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties      2010-04-23 
16:03:12 UTC (rev 65466)
@@ -75,11 +75,19 @@
 # NOTE: when using this, allow for 116 bytes plus the average size of names 
per ID entry.
 #       So if you have an average name length of 12 and expect 1 million entries, 
 #       allow for about 1.3 gigabyte RAM to be used for ID caching.
-dbstore.idManager=false
+dbstore.idManager=true
 #dbstore.auxFileDir defaults to system temp dir
 #dbstore.auxFileDir="/tmp" 
 dbstore.idManager.bufferSize=16384 
 
+#idStoreParameters:
+# basic: string (default), utf8, or utf16
+# for utf8 and utf16: md5, sha1, or huffman (or nothing)
+# for utf8 and utf16: wrap8 (wrap to 8 bytes), fold64 (wrap to single long 
value)
+# for fold64: primitive (use gnu trove primitive hash)
+# "utf16+md5+fold64+primitive" uses about one third of the memory used by 
"string"
+dbstore.idManager.idStoreParameters="utf16+md5+fold64+primitive"
+
 ### CycleFinder #####################################
 dbstore.CycleFinder.levelWarningThreshold=32
 dbstore.CycleFinder.degreeWarningThreshold=1024

Modified: 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java
   2010-04-23 15:16:24 UTC (rev 65465)
+++ 
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java
   2010-04-23 16:03:12 UTC (rev 65466)
@@ -1,6 +1,5 @@
 package de.brightbyte.wikiword.builder;
 
-import java.io.File;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.net.URL;
@@ -8,12 +7,8 @@
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Map;
 import java.util.Set;
 
-import org.ardverk.collection.PatriciaTrie;
-import org.ardverk.collection.StringKeyAnalyzer;
-
 import de.brightbyte.data.BlockDigest;
 import de.brightbyte.data.ByteString;
 import de.brightbyte.data.Codec;
@@ -22,7 +17,6 @@
 import de.brightbyte.data.KeyValueStore;
 import de.brightbyte.data.LongIntLookup;
 import de.brightbyte.data.MapLookup;
-import de.brightbyte.data.XorFold32;
 import de.brightbyte.data.XorFold64;
 import de.brightbyte.data.XorWrap;
 import de.brightbyte.io.HuffmanDataCodec;
@@ -43,14 +37,62 @@
                        return new HashMap<String, V>();
                }*/
 
+       protected static Set<String> parseParams(String spec) {
+               String[] tt = spec.split("[,;|+/ &]+");
+               
+               Set<String> params = new HashSet<String>();
+               params.addAll(Arrays.asList(tt));
+               
+               return params;
+       }
+       
+       public static Functor<?, String> newHash(String params, String lang) {
+               return newHash(parseParams(params), lang);
+       }
+       
+       public static Functor<?, String> newHash(Set<String> params, String 
lang) {
+               //initial digest turns string into bytes (UTF-8 if requested, UTF-16 otherwise)
+               Functor<byte[], String> digest;
+               
+               try {
+                       if (params.contains("utf8")) digest = new 
Codec.Encoder<String, byte[]>(new CharsetCodec("UTF-8"));
+                       else digest = new Codec.Encoder<String, byte[]>(new 
CharsetCodec("UTF-16"));
+                       
+                       //apply md5 digest or huffman compression
+                       if (params.contains("md5")) digest = new 
Functor.Composite<byte[], byte[], String>(digest, new BlockDigest("MD5"));
+                       else if (params.contains("sha1")) digest = new 
Functor.Composite<byte[], byte[], String>(digest, new BlockDigest("SHA-1"));
+                       else if (params.contains("huff") || 
params.contains("huffman")) digest = new Functor.Composite<byte[], byte[], 
String>(digest, getHuffmanEncoder(lang));
+               } catch (UnsupportedEncodingException e) {
+                       throw new IllegalArgumentException(e);
+               } catch (NoSuchAlgorithmException e) {
+                       throw new IllegalArgumentException(e);
+               } catch (IOException e) {
+                       throw new RuntimeException(e);
+               }
+               
+               if (params.contains("fold64")) { //fold into Long
+                       Functor<Long, byte[]> fold;
+                       fold = XorFold64.instance;
+                       
+                       Functor<Long, String> convert = new 
Functor.Composite<Long, byte[], String>(digest, fold);
+                       return convert;
+               } else { //keep bytes, wrap in ByteArray
+                               if (params.contains("wrap8")) digest = new 
Functor.Composite<byte[], byte[], String>(digest, new XorWrap(8));
+                               else if (params.contains("wrap6")) digest = new 
Functor.Composite<byte[], byte[], String>(digest, new XorWrap(6));
+                               else if (params.contains("wrap4")) digest = new 
Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4));
+                               else if (params.contains("wrap4")) digest = new 
Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4));
+                               
+                               //create converter that includes wrapping the 
byte array in a ByteString
+                               Functor<ByteString, String> convert = new 
Functor.Composite<ByteString, byte[], String>(digest, ByteString.wrap);
+                               return convert;
+               }
+       }
+       
                public static KeyValueStore<String, Integer> newStore(String 
storeParams, String lang) {
                        KeyValueStore<String, Integer> store = null;
                        
-                       String[] tt = storeParams.split("[,;|+/ &]+");
+                       Set<String> params = parseParams(storeParams);
                        
-                       Set<String> params = new HashSet<String>();
-                       params.addAll(Arrays.asList(tt));
-                       
                        if (params.contains("none") || params.contains("null")) 
store = null;
                        else if (params.contains("string")) store = new 
MapLookup<String, Integer>(new HashMap<String, Integer>());
                        else if (params.contains("utf8") || 
params.contains("utf16")) {
@@ -74,10 +116,7 @@
                                }
                                
                                if (params.contains("fold64")) { //fold into 
Long
-                                       Functor<Long, byte[]> fold;
-                                       fold = XorFold64.instance;
-                                       
-                                       Functor<Long, String> convert = new 
Functor.Composite<Long, byte[], String>(digest, fold);
+                                       Functor<Long, String> convert = 
(Functor<Long, String>)newHash(params, lang); //XXX: ugly cast
 
                                        if (params.contains("primitive")) {
                                                LongIntLookup<Long> numStore = 
new LongIntLookup<Long>();
@@ -87,13 +126,9 @@
                                                store = new 
KeyDigestingValueStore<String, Long, Integer>(numStore, convert);
                                        }
                                } else { //keep bytes, wrap in ByteArray
-                                               if (params.contains("wrap8")) 
digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(8));
-                                               else if 
(params.contains("wrap6")) digest = new Functor.Composite<byte[], byte[], 
String>(digest, new XorWrap(6));
-                                               else if 
(params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[], 
String>(digest, new XorWrap(4));
-                                               else if 
(params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[], 
String>(digest, new XorWrap(4));
-                                               
+
                                                //create converter that 
includes wrapping the byte array in a ByteString
-                                               Functor<ByteString, String> 
convert = new Functor.Composite<ByteString, byte[], String>(digest, 
ByteString.wrap);
+                                               Functor<ByteString, String> 
convert = (Functor<ByteString, String>)newHash(params, lang); //XXX: ugly cast
                        
                                                //set up the store
                                                MapLookup<ByteString, Integer> 
byteStore = new MapLookup<ByteString, Integer>(new HashMap<ByteString, 
Integer>());

Added: 
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java
===================================================================
--- 
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java
                                (rev 0)
+++ 
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java
        2010-04-23 16:03:12 UTC (rev 65466)
@@ -0,0 +1,57 @@
+package de.brightbyte.wikiword.store.builder;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.security.NoSuchAlgorithmException;
+
+import de.brightbyte.data.Functor;
+import de.brightbyte.util.PersistenceException;
+import de.brightbyte.wikiword.builder.NameMaps;
+
+public class NameHashTrial {
+       public static void main(String[] args) throws IOException, 
PersistenceException, NoSuchAlgorithmException, InterruptedException {
+               String params = args[0];
+               int limit = Integer.parseInt(args[1]);
+               
+               Functor<?, String> hash = NameMaps.newHash(params, "en");
+               
+               InputStream rawIn = args.length>2  && !args[2].equals("-") ? 
new FileInputStream(args[2]) : System.in;
+               OutputStream rawOut = args.length>3 && !args[3].equals("-") ? 
new FileOutputStream(args[3]) : System.out;
+               
+               BufferedReader in = new BufferedReader(new 
InputStreamReader(rawIn, "UTF-8"));
+               PrintWriter out = new PrintWriter(new BufferedWriter(new 
OutputStreamWriter(rawOut, "UTF-8")));
+               
+               long start = System.nanoTime();
+               
+               System.out.println("Reading input...");
+               String s;
+               int c = 0;
+               while ((s = in.readLine()) != null) {
+                       c++;
+                       if (c>limit) break;
+
+                       Object h = hash.apply(s);
+                       
+                       out.println(h+"\t"+s);
+                       if (rawOut==System.out) out.flush();
+                       
+                       if (c % 10000 == 0) System.out.format(" at %d\n", c);
+               }
+               
+               if (rawOut!=System.out) out.close();
+               else out.flush();
+               
+               if (rawIn!=System.in) in.close();
+               
+               long t = System.nanoTime() - start;
+               System.out.format("Processed %d entries in %01.3f sec\n", c, 
t/1000000000.0);
+       }
+}



_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to