http://www.mediawiki.org/wiki/Special:Code/MediaWiki/65466
Revision: 65466
Author: daniel
Date: 2010-04-23 16:03:12 +0000 (Fri, 23 Apr 2010)
Log Message:
-----------
NameHashTrial
Modified Paths:
--------------
trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java
Added Paths:
-----------
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java
Modified: trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties
===================================================================
--- trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties 2010-04-23
15:16:24 UTC (rev 65465)
+++ trunk/WikiWord/WikiWordBuilder/debug-tweaks.properties 2010-04-23
16:03:12 UTC (rev 65466)
@@ -75,11 +75,19 @@
# NOTE: when using this, allow for 116 bytes plus the average size of names
per ID entry.
# So if you have an average name length of 12 and expect 1 million entries,
# allow for about 1.3 gigabyte RAM to be used for ID caching.
-dbstore.idManager=false
+dbstore.idManager=true
#dbstore.auxFileDir defaults to system temp dir
#dbstore.auxFileDir="/tmp"
dbstore.idManager.bufferSize=16384
+#idStoreParameters:
+# basic: string (default), utf8, or utf16
+# for utf8 and utf16: md5, sha1, or huffman (or nothing)
+# for utf8 and utf16: wrap8 (wrap to 8 bytes), fold64 (wrap to single long
value)
+# for fold64: primitive (use gnu trove primitive hash)
+# "utf16+md5+fold64+primitive" uses about one third of the memory used by
"string"
+dbstore.idManager.idStoreParameters="utf16+md5+fold64+primitive"
+
### CycleFinder #####################################
dbstore.CycleFinder.levelWarningThreshold=32
dbstore.CycleFinder.degreeWarningThreshold=1024
Modified:
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java
===================================================================
---
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java
2010-04-23 15:16:24 UTC (rev 65465)
+++
trunk/WikiWord/WikiWordBuilder/src/main/java/de/brightbyte/wikiword/builder/NameMaps.java
2010-04-23 16:03:12 UTC (rev 65466)
@@ -1,6 +1,5 @@
package de.brightbyte.wikiword.builder;
-import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
@@ -8,12 +7,8 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.Map;
import java.util.Set;
-import org.ardverk.collection.PatriciaTrie;
-import org.ardverk.collection.StringKeyAnalyzer;
-
import de.brightbyte.data.BlockDigest;
import de.brightbyte.data.ByteString;
import de.brightbyte.data.Codec;
@@ -22,7 +17,6 @@
import de.brightbyte.data.KeyValueStore;
import de.brightbyte.data.LongIntLookup;
import de.brightbyte.data.MapLookup;
-import de.brightbyte.data.XorFold32;
import de.brightbyte.data.XorFold64;
import de.brightbyte.data.XorWrap;
import de.brightbyte.io.HuffmanDataCodec;
@@ -43,14 +37,62 @@
return new HashMap<String, V>();
}*/
+ protected static Set<String> parseParams(String spec) {
+ String[] tt = spec.split("[,;|+/ &]+");
+
+ Set<String> params = new HashSet<String>();
+ params.addAll(Arrays.asList(tt));
+
+ return params;
+ }
+
+ public static Functor<?, String> newHash(String params, String lang) {
+ return newHash(parseParams(params), lang);
+ }
+
+ public static Functor<?, String> newHash(Set<String> params, String
lang) {
+ //initial digest turns string into UTF-8 bytes
+ Functor<byte[], String> digest;
+
+ try {
+ if (params.contains("utf8")) digest = new
Codec.Encoder<String, byte[]>(new CharsetCodec("UTF-8"));
+ else digest = new Codec.Encoder<String, byte[]>(new
CharsetCodec("UTF-16"));
+
+ //apply md5 digest or huffman compression
+ if (params.contains("md5")) digest = new
Functor.Composite<byte[], byte[], String>(digest, new BlockDigest("MD5"));
+ else if (params.contains("sha1")) digest = new
Functor.Composite<byte[], byte[], String>(digest, new BlockDigest("SHA-1"));
+ else if (params.contains("huff") ||
params.contains("huffman")) digest = new Functor.Composite<byte[], byte[],
String>(digest, getHuffmanEncoder(lang));
+ } catch (UnsupportedEncodingException e) {
+ throw new IllegalArgumentException(e);
+ } catch (NoSuchAlgorithmException e) {
+ throw new IllegalArgumentException(e);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ if (params.contains("fold64")) { //fold into Long
+ Functor<Long, byte[]> fold;
+ fold = XorFold64.instance;
+
+ Functor<Long, String> convert = new
Functor.Composite<Long, byte[], String>(digest, fold);
+ return convert;
+ } else { //keep bytes, wrap in ByteArray
+ if (params.contains("wrap8")) digest = new
Functor.Composite<byte[], byte[], String>(digest, new XorWrap(8));
+ else if (params.contains("wrap6")) digest = new
Functor.Composite<byte[], byte[], String>(digest, new XorWrap(6));
+ else if (params.contains("wrap4")) digest = new
Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4));
+ else if (params.contains("wrap4")) digest = new
Functor.Composite<byte[], byte[], String>(digest, new XorWrap(4));
+
+ //create converter that includes wrapping the
byte array in a ByteString
+ Functor<ByteString, String> convert = new
Functor.Composite<ByteString, byte[], String>(digest, ByteString.wrap);
+ return convert;
+ }
+ }
+
public static KeyValueStore<String, Integer> newStore(String
storeParams, String lang) {
KeyValueStore<String, Integer> store = null;
- String[] tt = storeParams.split("[,;|+/ &]+");
+ Set<String> params = parseParams(storeParams);
- Set<String> params = new HashSet<String>();
- params.addAll(Arrays.asList(tt));
-
if (params.contains("none") || params.contains("null"))
store = null;
else if (params.contains("string")) store = new
MapLookup<String, Integer>(new HashMap<String, Integer>());
else if (params.contains("utf8") ||
params.contains("utf16")) {
@@ -74,10 +116,7 @@
}
if (params.contains("fold64")) { //fold into
Long
- Functor<Long, byte[]> fold;
- fold = XorFold64.instance;
-
- Functor<Long, String> convert = new
Functor.Composite<Long, byte[], String>(digest, fold);
+ Functor<Long, String> convert =
(Functor<Long, String>)newHash(params, lang); //XXX: ugly cast
if (params.contains("primitive")) {
LongIntLookup<Long> numStore =
new LongIntLookup<Long>();
@@ -87,13 +126,9 @@
store = new
KeyDigestingValueStore<String, Long, Integer>(numStore, convert);
}
} else { //keep bytes, wrap in ByteArray
- if (params.contains("wrap8"))
digest = new Functor.Composite<byte[], byte[], String>(digest, new XorWrap(8));
- else if
(params.contains("wrap6")) digest = new Functor.Composite<byte[], byte[],
String>(digest, new XorWrap(6));
- else if
(params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[],
String>(digest, new XorWrap(4));
- else if
(params.contains("wrap4")) digest = new Functor.Composite<byte[], byte[],
String>(digest, new XorWrap(4));
-
+
//create converter that
includes wrapping the byte array in a ByteString
- Functor<ByteString, String>
convert = new Functor.Composite<ByteString, byte[], String>(digest,
ByteString.wrap);
+ Functor<ByteString, String>
convert = (Functor<ByteString, String>)newHash(params, lang); //XXX: ugly cast
//set up the store
MapLookup<ByteString, Integer>
byteStore = new MapLookup<ByteString, Integer>(new HashMap<ByteString,
Integer>());
Added:
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java
===================================================================
---
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java
(rev 0)
+++
trunk/WikiWord/WikiWordBuilder/src/test/java/de/brightbyte/wikiword/store/builder/NameHashTrial.java
2010-04-23 16:03:12 UTC (rev 65466)
@@ -0,0 +1,57 @@
+package de.brightbyte.wikiword.store.builder;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.security.NoSuchAlgorithmException;
+
+import de.brightbyte.data.Functor;
+import de.brightbyte.util.PersistenceException;
+import de.brightbyte.wikiword.builder.NameMaps;
+
+public class NameHashTrial {
+ public static void main(String[] args) throws IOException,
PersistenceException, NoSuchAlgorithmException, InterruptedException {
+ String params = args[0];
+ int limit = Integer.parseInt(args[1]);
+
+ Functor<?, String> hash = NameMaps.newHash(params, "en");
+
+ InputStream rawIn = args.length>2 && !args[2].equals("-") ?
new FileInputStream(args[2]) : System.in;
+ OutputStream rawOut = args.length>3 && !args[3].equals("-") ?
new FileOutputStream(args[3]) : System.out;
+
+ BufferedReader in = new BufferedReader(new
InputStreamReader(rawIn, "UTF-8"));
+ PrintWriter out = new PrintWriter(new BufferedWriter(new
OutputStreamWriter(rawOut, "UTF-8")));
+
+ long start = System.nanoTime();
+
+ System.out.println("Reading input...");
+ String s;
+ int c = 0;
+ while ((s = in.readLine()) != null) {
+ c++;
+ if (c>limit) break;
+
+ Object h = hash.apply(s);
+
+ out.println(h+"\t"+s);
+ if (rawOut==System.out) out.flush();
+
+ if (c % 10000 == 0) System.out.format(" at %d\n", c);
+ }
+
+ if (rawOut!=System.out) out.close();
+ else out.flush();
+
+ if (rawIn!=System.in) in.close();
+
+ long t = System.nanoTime() - start;
+ System.out.format("Processed %d entries in %01.3f sec\n", c,
t/1000000000.0);
+ }
+}
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs