Author: j16sdiz
Date: 2009-01-03 06:40:55 +0000 (Sat, 03 Jan 2009)
New Revision: 24889
Modified:
trunk/plugins/XMLSpider/IndexWriter.java
trunk/plugins/XMLSpider/db/PerstRoot.java
Log:
IndexWriter: don't load the complete term list; use less memory.
Modified: trunk/plugins/XMLSpider/IndexWriter.java
===================================================================
--- trunk/plugins/XMLSpider/IndexWriter.java 2009-01-03 06:40:43 UTC (rev
24888)
+++ trunk/plugins/XMLSpider/IndexWriter.java 2009-01-03 06:40:55 UTC (rev
24889)
@@ -31,14 +31,19 @@
import plugins.XMLSpider.db.PerstRoot;
import plugins.XMLSpider.db.Term;
import plugins.XMLSpider.db.TermPosition;
+import plugins.XMLSpider.org.garret.perst.IterableIterator;
import plugins.XMLSpider.org.garret.perst.Storage;
import plugins.XMLSpider.org.garret.perst.StorageFactory;
import freenet.support.Logger;
+import freenet.support.io.Closer;
/**
* Write index to disk file
*/
public class IndexWriter {
+ private static final String[] HEX = { "0", "1", "2", "3", "4", "5",
"6", "7", "8", "9", "a", "b", "c", "d", "e",
+ "f" };
+
//- Writing Index
public long tProducedIndex;
private Vector<String> indices;
@@ -63,7 +68,7 @@
return;
}
- makeSubIndices(perstRoot, config);
+ makeSubIndices(perstRoot);
makeMainIndex(config);
time_taken = System.currentTimeMillis() - time_taken;
@@ -206,109 +211,52 @@
*
* @throws Exception
*/
- private void makeSubIndices(PerstRoot perstRoot, Config config) throws
Exception {
+ private void makeSubIndices(PerstRoot perstRoot) throws Exception {
Logger.normal(this, "Generating index...");
List<Term> termList = perstRoot.getTermList();
int termCount = perstRoot.getTermCount();
indices = new Vector<String>();
- int prefix = (int) ((Math.log(termCount) -
Math.log(config.getIndexMaxEntries())) / Math.log(16)) - 1;
- if (prefix <= 0)
- prefix = 1;
match = 1;
- Vector<Term> list = new Vector<Term>();
- Term term0 = termList.get(0);
- String currentPrefix = term0.getMD5().substring(0, prefix);
-
- int i = 0;
- for (Term term : termList) {
- String key = term.getMD5();
- //create a list of the words to be added in the same
subindex
- if (key.startsWith(currentPrefix)) {
- i++;
- list.add(term);
- } else {
- //generate the appropriate subindex with the
current list
- generateSubIndex(config, prefix, list);
-
- // next list
- currentPrefix = key.substring(0, prefix);
- list = new Vector<Term>();
- list.add(term);
- }
- }
-
- generateSubIndex(config, prefix, list);
+ for (String hex : HEX)
+ generateSubIndex(perstRoot, hex);
}
- private void generateSubIndex(Config config, int p, List<Term> list)
throws Exception {
- /*
- * if the list is less than max allowed entries in a file then
directly generate the xml
- * otherwise split the list into further sublists and iterate
till the number of entries per
- * subindex is less than the allowed value
- */
+ private void generateSubIndex(PerstRoot perstRoot, String prefix)
throws Exception {
if (logMINOR)
- Logger.minor(this, "Generating subindex for " +
list.size() + " entries with prefix ("
- + list.get(0).getMD5().substring(0, p) + ")");
+ Logger.minor(this, "Generating subindex for (" + prefix
+ ")");
- try {
- if (list.size() == 0)
- return;
- if (list.size() < config.getIndexMaxEntries()) {
- generateXML(config, list, p);
- return;
- }
- } catch (TooBigIndexException e) {
- // Handle below
- }
+ if (generateXML(perstRoot, prefix))
+ return;
+
if (logMINOR)
- Logger.minor(this, "Too big subindex for " +
list.size() + " entries with prefix ("
- + list.get(0).getMD5().substring(0, p) + ")");
- //prefix needs to be incremented
- if (match <= p)
- match = p + 1;
- int prefix = p + 1;
- int i = 0;
- String str = list.get(i).getMD5();
- int index = 0;
- while (i < list.size()) {
- Term term = list.get(i);
- String key = term.getMD5();
- if ((key.substring(0, prefix)).equals(str.substring(0,
prefix))) {
- i++;
- } else {
- generateSubIndex(config, prefix,
list.subList(index, i));
- index = i;
- str = key;
- }
- }
- generateSubIndex(config, prefix, list.subList(index, i));
- }
+ Logger.minor(this, "Too big subindex for (" + prefix +
")");
- private static class TooBigIndexException extends Exception {
- private static final long serialVersionUID =
-6172560811504794914L;
+ for (String hex : HEX)
+ generateSubIndex(perstRoot, prefix + hex);
}
/**
* generates the xml index with the given list of words with prefix
number of matching bits in
* md5
*
- * @param list
- * list of the words to be added in the index
* @param prefix
- * number of matching bits of md5
- * @throws Exception
+ * prefix string
+ * @return successful
+ * @throws IOException
*/
- private void generateXML(Config config, List<Term> list, int prefix)
throws TooBigIndexException, Exception {
- String p = list.get(0).getMD5().substring(0, prefix);
- indices.add(p);
- File outputFile = new File(config.getIndexDir() + "index_" + p
+ ".xml");
+ private boolean generateXML(PerstRoot perstRoot, String prefix) throws
IOException {
+ Config config = perstRoot.getConfig();
+
+ File outputFile = new File(config.getIndexDir() + "index_" +
prefix + ".xml");
BufferedOutputStream fos = new BufferedOutputStream(new
FileOutputStream(outputFile));
- StreamResult resultStream;
- resultStream = new StreamResult(fos);
+ StreamResult resultStream = new StreamResult(fos);
+ IterableIterator<Term> termIterator =
perstRoot.getTermIterator(prefix, prefix + "g");
+
+ int count = 0;
try {
/* Initialize xml builder */
Document xmlDoc = null;
@@ -321,8 +269,7 @@
try {
xmlBuilder = xmlFactory.newDocumentBuilder();
} catch (javax.xml.parsers.ParserConfigurationException
e) {
- Logger.error(this, "Spider: Error while
initializing XML generator: " + e.toString(), e);
- return;
+ throw new RuntimeException("Spider: Error while
initializing XML generator", e);
}
impl = xmlBuilder.getDOMImplementation();
@@ -341,16 +288,14 @@
headerElement.appendChild(subHeaderElement);
Element filesElement = xmlDoc.createElement("files");
/* filesElement != fileElement */
- Element EntriesElement =
xmlDoc.createElement("entries");
- EntriesElement.setNodeValue(list.size() + "");
- EntriesElement.setAttribute("value", list.size() + "");
-
+
/* Adding word index */
Element keywordsElement =
xmlDoc.createElement("keywords");
Vector<Long> fileid = new Vector<Long>();
- for (int i = 0; i < list.size(); i++) {
+ for (Term term : termIterator) {
+ count++;
+
Element wordElement =
xmlDoc.createElement("word");
- Term term = list.get(i);
wordElement.setAttribute("v", term.getWord());
Set<Page> pages = term.getPages();
@@ -397,7 +342,11 @@
keywordsElement.appendChild(xmlDoc.createComment(term.getMD5()));
keywordsElement.appendChild(wordElement);
}
- rootElement.appendChild(EntriesElement);
+
+ Element entriesElement =
xmlDoc.createElement("entries");
+ entriesElement.setAttribute("value", count + "");
+
+ rootElement.appendChild(entriesElement);
rootElement.appendChild(headerElement);
rootElement.appendChild(filesElement);
rootElement.appendChild(keywordsElement);
@@ -410,9 +359,7 @@
try {
serializer = transformFactory.newTransformer();
} catch
(javax.xml.transform.TransformerConfigurationException e) {
- Logger.error(this, "Spider: Error while
serializing XML (transformFactory.newTransformer()): "
- + e.toString(), e);
- return;
+ throw new RuntimeException("Spider: Error while
serializing XML (transformFactory.newTransformer())", e);
}
serializer.setOutputProperty(OutputKeys.ENCODING,
"UTF-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
@@ -420,19 +367,21 @@
try {
serializer.transform(domSource, resultStream);
} catch (javax.xml.transform.TransformerException e) {
- Logger.error(this, "Spider: Error while
serializing XML (transform()): " + e.toString(), e);
- return;
+ throw new RuntimeException("Spider: Error while
serializing XML (transform())", e);
}
} finally {
- fos.close();
+ Closer.close(fos);
}
- if (outputFile.length() > config.getIndexSubindexMaxSize() &&
list.size() > 1) {
+
+ if (outputFile.length() > config.getIndexSubindexMaxSize() &&
count > 1) {
outputFile.delete();
- throw new TooBigIndexException();
+ return false;
}
if (logMINOR)
Logger.minor(this, "Spider: indexes regenerated.");
+ indices.add(prefix);
+ return true;
}
public static void main(String[] arg) throws Exception {
Modified: trunk/plugins/XMLSpider/db/PerstRoot.java
===================================================================
--- trunk/plugins/XMLSpider/db/PerstRoot.java 2009-01-03 06:40:43 UTC (rev
24888)
+++ trunk/plugins/XMLSpider/db/PerstRoot.java 2009-01-03 06:40:55 UTC (rev
24889)
@@ -4,6 +4,7 @@
import java.util.List;
import plugins.XMLSpider.org.garret.perst.FieldIndex;
+import plugins.XMLSpider.org.garret.perst.IterableIterator;
import plugins.XMLSpider.org.garret.perst.Key;
import plugins.XMLSpider.org.garret.perst.Persistent;
import plugins.XMLSpider.org.garret.perst.Storage;
@@ -63,10 +64,10 @@
}
}
- public Iterator<Term> getTermIterator() {
+ public IterableIterator<Term> getTermIterator(String from, String till)
{
md5Term.sharedLock();
try {
- return md5Term.iterator();
+ return md5Term.iterator(from, till, 0);
} finally {
md5Term.unlock();
}
_______________________________________________
cvs mailing list
[email protected]
http://emu.freenetproject.org/cgi-bin/mailman/listinfo/cvs