[Nutch Wiki] Trivial Update of PublicServers by Gal Nitzan
Dear Wiki user, You have subscribed to a wiki page or wiki category on Nutch Wiki for change notification. The following page has been changed by Gal Nitzan: http://wiki.apache.org/nutch/PublicServers -- * [http://www.ertech.ch/ ertech] uses nutch as its search engine. It is integrated with the CMS system aarcat from aarboard. * [http://www.expeditors.com Expeditors International] uses Nutch as its site search engine and on its Intranet. + + * [http://search.fileratings.com FileRatings Search] is a search engine of software product. * [http://www.gensphere.org/ GenSphere] - Genealogy Search Engine based on Nutch.
svn commit: r412399 - /lucene/nutch/trunk/conf/mime-types.xml
Author: jerome Date: Wed Jun 7 06:07:27 2006 New Revision: 412399 URL: http://svn.apache.org/viewvc?rev=412399&view=rev Log: NUTCH-275 : Remove the magic resolution for xml content-type Modified: lucene/nutch/trunk/conf/mime-types.xml Modified: lucene/nutch/trunk/conf/mime-types.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/mime-types.xml?rev=412399&r1=412398&r2=412399&view=diff == --- lucene/nutch/trunk/conf/mime-types.xml (original) +++ lucene/nutch/trunk/conf/mime-types.xml Wed Jun 7 06:07:27 2006 @@ -693,7 +693,7 @@ <mime-type name="text/xml" description="Extensible Markup Language File"> <ext>xml</ext><ext>xsl</ext> -<magic offset="0" value="&lt;?xml"/> +<!--<magic offset="0" value="&lt;?xml"/>--> </mime-type> <mime-type name="text/x-setext"
svn commit: r412577 - in /lucene/nutch/trunk/src/test/org/apache/nutch/util/mime: mime-types.txt test.xml
Author: jerome Date: Wed Jun 7 15:06:53 2006 New Revision: 412577 URL: http://svn.apache.org/viewvc?rev=412577&view=rev Log: NUTCH-275 : Remove unit test for magic based content type guessing for xml Removed: lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/test.xml Modified: lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt Modified: lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt?rev=412577&r1=412576&r2=412577&view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/util/mime/mime-types.txt Wed Jun 7 15:06:53 2006 @@ -136,7 +136,7 @@ text/tab-separated-values;tsv text/vnd.wap.wml;wml text/vnd.wap.wmlscript;wmls -text/xml;xml;test.xml +text/xml;xml text/xml;xsl text/x-setext;etx video/mpeg;mpg
svn commit: r412582 - /lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
Author: jerome Date: Wed Jun 7 15:19:08 2006 New Revision: 412582 URL: http://svn.apache.org/viewvc?rev=412582&view=rev Log: NUTCH-301 : CommonTerms are cached in the Configuration Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java?rev=412582&r1=412581&r2=412582&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java Wed Jun 7 15:19:08 2006 @@ -37,7 +37,10 @@ private static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.analysis.CommonGrams"); private static final char SEPARATOR = '-'; - private HashMap COMMON_TERMS = new HashMap(); + /** The key used to cache commonTerms in Configuration */ + private static final String KEY = CommonGrams.class.getName(); + + private HashMap commonTerms = new HashMap(); /** * The constructor. @@ -135,7 +138,13 @@ /** Construct using the provided config file. */ private void init(Configuration conf) { +// First, try to retrieve some commonTerms cached in configuration.
+commonTerms = (HashMap) conf.getObject(KEY); +if (commonTerms != null) { return; } + +// Otherwise, read the terms.file try { + commonTerms = new HashMap(); Reader reader = conf.getConfResourceAsReader (conf.get("analysis.common.terms.file")); BufferedReader in = new BufferedReader(reader); @@ -160,13 +169,14 @@ while ((token = ts.next()) != null) { gram = gram + SEPARATOR + token.termText(); } -HashSet table = (HashSet)COMMON_TERMS.get(field); +HashSet table = (HashSet)commonTerms.get(field); if (table == null) { table = new HashSet(); - COMMON_TERMS.put(field, table); + commonTerms.put(field, table); } table.add(gram); } + conf.setObject(KEY, commonTerms); } catch (IOException e) { throw new RuntimeException(e.toString()); } @@ -175,7 +185,7 @@ /** Construct a token filter that inserts n-grams for common terms. For use * while indexing documents. */ public TokenFilter getFilter(TokenStream ts, String field) { -return new Filter(ts, (HashSet)COMMON_TERMS.get(field)); +return new Filter(ts, (HashSet)commonTerms.get(field)); } /** Utility to convert an array of Query.Terms into a token stream. */