Author: siren Date: Mon Jun 5 13:12:48 2006 New Revision: 411904 URL: http://svn.apache.org/viewvc?rev=411904&view=rev Log: NUTCH-201 add support for subcollections
Added: lucene/nutch/trunk/conf/subcollections.xml.template lucene/nutch/trunk/src/plugin/subcollection/ lucene/nutch/trunk/src/plugin/subcollection/README.txt lucene/nutch/trunk/src/plugin/subcollection/build.xml lucene/nutch/trunk/src/plugin/subcollection/plugin.xml lucene/nutch/trunk/src/plugin/subcollection/src/ lucene/nutch/trunk/src/plugin/subcollection/src/java/ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java lucene/nutch/trunk/src/plugin/subcollection/src/test/ Modified: lucene/nutch/trunk/src/plugin/build.xml Added: lucene/nutch/trunk/conf/subcollections.xml.template URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/subcollections.xml.template?rev=411904&view=auto 
============================================================================== --- lucene/nutch/trunk/conf/subcollections.xml.template (added) +++ lucene/nutch/trunk/conf/subcollections.xml.template Mon Jun 5 13:12:48 2006 @@ -0,0 +1,12 @@ +<?xml version="1.0" encoding="UTF-8"?> +<subcollections> + <subcollection> + <name>nutch</name> + <id>nutch</id> + <whitelist> +http://lucene.apache.org/nutch/ +http://wiki.apache.org/nutch/ + </whitelist> + <blacklist /> + </subcollection> +</subcollections> Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=411904&r1=411903&r2=411904&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Mon Jun 5 13:12:48 2006 @@ -52,6 +52,7 @@ <ant dir="query-url" target="deploy"/> <ant dir="scoring-opic" target="deploy"/> <ant dir="summary-basic" target="deploy"/> + <ant dir="subcollection" target="deploy"/> <ant dir="summary-lucene" target="deploy"/> <ant dir="urlfilter-automaton" target="deploy"/> <ant dir="urlfilter-prefix" target="deploy"/> @@ -133,6 +134,7 @@ <ant dir="query-site" target="clean"/> <ant dir="query-url" target="clean"/> <ant dir="scoring-opic" target="clean"/> + <ant dir="subcollection" target="clean"/> <ant dir="summary-basic" target="clean"/> <ant dir="summary-lucene" target="clean"/> <ant dir="urlfilter-automaton" target="clean"/> @@ -140,5 +142,4 @@ <ant dir="urlfilter-regex" target="clean"/> <ant dir="urlfilter-suffix" target="clean"/> </target> - </project> Added: lucene/nutch/trunk/src/plugin/subcollection/README.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/README.txt?rev=411904&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/README.txt (added) +++ 
lucene/nutch/trunk/src/plugin/subcollection/README.txt Mon Jun 5 13:12:48 2006 @@ -0,0 +1,10 @@ +For a brief description of this plugin, see +src/java/org/apache/nutch/collection/package.html + +Basically: +You need to enable this plugin during both indexing and searching. + +After indexing, you can limit your searches to a certain +subcollection with the keyword subcollection, e.g. + +"subcollection:nutch hadoop" Added: lucene/nutch/trunk/src/plugin/subcollection/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/build.xml?rev=411904&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/build.xml (added) +++ lucene/nutch/trunk/src/plugin/subcollection/build.xml Mon Jun 5 13:12:48 2006 @@ -0,0 +1,7 @@ +<?xml version="1.0"?> + +<project name="subcollection" default="jar"> + + <import file="../build-plugin.xml"/> + +</project> Added: lucene/nutch/trunk/src/plugin/subcollection/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/plugin.xml?rev=411904&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/subcollection/plugin.xml Mon Jun 5 13:12:48 2006 @@ -0,0 +1,31 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="subcollection" + name="Subcollection indexing and query filter" + version="1.0.0" + provider-name="apache.org"> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <runtime> + <library name="subcollection.jar"/> + </runtime> + + <extension id="org.apache.nutch.searcher.subcollection.query" + name="Subcollection Query Filter" + point="org.apache.nutch.searcher.QueryFilter"> + <implementation id="SubcollectionQueryFilter" + class="org.apache.nutch.searcher.subcollection.SubcollectionQueryFilter" + raw-fields="subcollection"/> + </extension> + + <extension 
id="org.apache.nutch.indexer.subcollection.indexing" + name="Subcollection Indexing Filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="SubcollectionIndexingFilter" + class="org.apache.nutch.indexer.subcollection.SubcollectionIndexingFilter"/> + + </extension> +</plugin> Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=411904&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (added) +++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Mon Jun 5 13:12:48 2006 @@ -0,0 +1,212 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.collection; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.logging.Logger; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.nutch.util.DomUtil; +import org.apache.xerces.dom.DocumentImpl; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; + +public class CollectionManager extends Configured { + + public static final String DEFAULT_FILE_NAME = "subcollections.xml"; + + static final Logger LOG = org.apache.hadoop.util.LogFormatter.getLogger(CollectionManager.class + .getName()); + + transient Map collectionMap = new HashMap(); + + transient URL configfile; + + public CollectionManager(Configuration conf) { + super(conf); + init(); + } + + protected void init(){ + try { + LOG.info("initializing CollectionManager"); + // initialize known subcollections + configfile = getConf().getResource( + getConf().get("subcollections.config", DEFAULT_FILE_NAME)); + + InputStream input = getConf().getConfResourceAsInputStream( + getConf().get("subcollections.config", DEFAULT_FILE_NAME)); + Element collections = DomUtil.getDom(input); + + if (collections != null) { + NodeList nodeList = collections + .getElementsByTagName(Subcollection.TAG_COLLECTION); + + LOG.info("file has" + nodeList.getLength() + " elements"); + + for (int i = 0; i < nodeList.getLength(); i++) { + Element scElem = (Element) nodeList.item(i); + Subcollection subCol = new Subcollection(getConf()); + subCol.initialize(scElem); + collectionMap.put(subCol.name, subCol); + } + } else { + LOG.info("Cannot find collections"); + } + } catch (Exception e) { + LOG.info("Error occured:" + e); + e.printStackTrace(System.out); + } + } + + public 
static CollectionManager getCollectionManager(Configuration conf) { + String key = "collectionmanager"; + CollectionManager impl = (CollectionManager)conf.getObject(key); + if (impl == null) { + try { + LOG.info("Instantiating CollectionManager"); + impl=new CollectionManager(conf); + conf.setObject(key,impl); + } catch (Exception e) { + throw new RuntimeException("Couldn't create CollectionManager",e); + } + } + return impl; + } + + /** + * Returns named subcollection + * + * @param id + * @return Named SubCollection (or null if not existing) + */ + public Subcollection getSubColection(final String id) { + return (Subcollection) collectionMap.get(id); + } + + /** + * Delete named subcollection + * + * @param id + * Id of SubCollection to delete + */ + public void deleteSubCollection(final String id) throws IOException { + final Subcollection subCol = getSubColection(id); + if (subCol != null) { + collectionMap.remove(id); + } + } + + /** + * Create a new subcollection. + * + * @param name + * Name of SubCollection to create + * @return Created SubCollection or null if allready existed + */ + public Subcollection createSubCollection(final String id, final String name) { + Subcollection subCol = null; + + if (!collectionMap.containsKey(id)) { + subCol = new Subcollection(id, name, getConf()); + collectionMap.put(id, subCol); + } + + return subCol; + } + + /** + * Return names of collections url is part of + * + * @param url + * The url to test against Collections + * @return Space delimited string of collection names url is part of + */ + public String getSubCollections(final String url) { + String collections = ""; + final Iterator iterator = collectionMap.values().iterator(); + + while (iterator.hasNext()) { + final Subcollection subCol = (Subcollection) iterator.next(); + if (subCol.filter(url) != null) { + collections += " " + subCol.name; + } + } + LOG.fine("subcollections:" + collections); + + return collections; + } + + /** + * Returns all collections + * + * 
@return All collections CollectionManager knows about + */ + public Collection getAll() { + return collectionMap.values(); + } + + /** + * Save collections into file + * + * @throws Exception + */ + public void save() throws IOException { + try { + final FileOutputStream fos = new FileOutputStream(new File(configfile + .getFile())); + final Document doc = new DocumentImpl(); + final Element collections = doc + .createElement(Subcollection.TAG_COLLECTIONS); + final Iterator iterator = collectionMap.values().iterator(); + + while (iterator.hasNext()) { + final Subcollection subCol = (Subcollection) iterator.next(); + final Element collection = doc + .createElement(Subcollection.TAG_COLLECTION); + collections.appendChild(collection); + final Element name = doc.createElement(Subcollection.TAG_NAME); + name.setNodeValue(subCol.getName()); + collection.appendChild(name); + final Element whiteList = doc + .createElement(Subcollection.TAG_WHITELIST); + whiteList.setNodeValue(subCol.getWhiteListString()); + collection.appendChild(whiteList); + final Element blackList = doc + .createElement(Subcollection.TAG_BLACKLIST); + blackList.setNodeValue(subCol.getBlackListString()); + collection.appendChild(blackList); + } + + DomUtil.saveDom(fos, collections); + fos.flush(); + fos.close(); + } catch (FileNotFoundException e) { + throw new IOException(e.toString()); + } + } +} Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=411904&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (added) +++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Mon Jun 5 13:12:48 2006 @@ -0,0 +1,214 @@ +/* + * Copyright 
2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.collection; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.StringTokenizer; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.nutch.net.URLFilter; +import org.apache.xerces.util.DOMUtil; +import org.w3c.dom.Element; + +/** + * SubCollection represents a subset of index, you can define url patterns that + * will indicate that particular page (url) is part of SubCollection. 
+ */ +public class Subcollection extends Configured implements URLFilter{ + + public static final String TAG_COLLECTIONS="subcollections"; + public static final String TAG_COLLECTION="subcollection"; + public static final String TAG_WHITELIST="whitelist"; + public static final String TAG_BLACKLIST="blacklist"; + public static final String TAG_NAME="name"; + public static final String TAG_ID="id"; + + ArrayList blackList = new ArrayList(); + + ArrayList whiteList = new ArrayList(); + + /** + * SubCollection identifier + */ + String id; + + /** + * SubCollection name + */ + String name; + + /** + * SubCollection whitelist as String + */ + String wlString; + + /** + * SubCollection blacklist as String + */ + String blString; + + /** public Constructor + * + * @param id id of SubCollection + * @param name name of SubCollection + */ + public Subcollection(String id, String name, Configuration conf) { + this(conf); + this.id=id; + this.name = name; + } + + public Subcollection(Configuration conf){ + super(conf); + } + + /** + * @return Returns the name + */ + public String getName() { + return name; + } + + /** + * @return Returns the id + */ + public String getId() { + return id; + } + + /** + * Returns whitelist + * + * @return Whitelist entries + */ + public ArrayList getWhiteList() { + return whiteList; + } + + /** + * Returns whitelist String + * + * @return Whitelist String + */ + public String getWhiteListString() { + return wlString; + } + + /** + * Returns blacklist String + * + * @return Blacklist String + */ + public String getBlackListString() { + return blString; + } + + /** + * @param whiteList + * The whiteList to set. + */ + public void setWhiteList(ArrayList whiteList) { + this.whiteList = whiteList; + } + + /** + * Simple "indexOf" currentFilter for matching patterns. + * + * <pre> + * rules for evaluation are as follows: + * 1. if pattern matches in blacklist then url is rejected + * 2. if pattern matches in whitelist then url is allowed + * 3. 
url is rejected + * </pre> + * + * @see org.apache.nutch.net.URLFilter#filter(java.lang.String) + */ + public String filter(String urlString) { + // first the blacklist + Iterator i = blackList.iterator(); + while (i.hasNext()) { + String row = (String) i.next(); + if (urlString.indexOf(row) != -1) + return null; + } + + // then whitelist + i = whiteList.iterator(); + while (i.hasNext()) { + String row = (String) i.next(); + if (urlString.indexOf(row) != -1) + return urlString; + } + return null; + } + + /** + * Initialize SubCollection from dom element + * + * @param collection + */ + public void initialize(Element collection) { + this.name = DOMUtil.getChildText( + collection.getElementsByTagName(TAG_NAME).item(0)).trim(); + this.wlString = DOMUtil.getChildText( + collection.getElementsByTagName(TAG_WHITELIST).item(0)).trim(); + this.blString = DOMUtil.getChildText( + collection.getElementsByTagName(TAG_BLACKLIST).item(0)).trim(); + + parseList(this.whiteList, wlString); + parseList(this.blackList, blString); + } + + /** + * Create a list of patterns from chunk of text, patterns are separated with + * newline + * + * @param list + * @param text + */ + protected void parseList(ArrayList list, String text) { + list.clear(); + + StringTokenizer st = new StringTokenizer(text, "\n\r"); + + while (st.hasMoreElements()) { + String line = (String) st.nextElement(); + list.add(line.trim()); + } + } + + /** + * Set contents of blacklist from String + * + * @param list the blacklist contents + */ + public void setBlackList(String list) { + this.blString = list; + parseList(blackList, list); + } + + /** + * Set contents of whitelist from String + * + * @param list the whitelist contents + */ + public void setWhiteList(String list) { + this.wlString = list; + parseList(whiteList, list); + } +} Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html?rev=411904&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html (added) +++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html Mon Jun 5 13:12:48 2006 @@ -0,0 +1,36 @@ +<html> +<body> +<p> +A subcollection is a subset of an index. Subcollections are defined +by URL patterns in the form of a whitelist and a blacklist. To be part of a +subcollection, a page must match the whitelist and not the blacklist. +</p> +<p> +Subcollection definitions are read from a file subcollections.xml +and the format is as follows (imagine here that you are crawling all +the virtualhosts from apache.org and you want to tag pages with the +URL patterns "http://lucene.apache.org/nutch" and "http://wiki.apache.org/nutch/" +to be part of subcollection "nutch"; this allows you to later search +specifically from this subcollection) +</p> +<p/> +<p/> +<pre> +<?xml version="1.0" encoding="UTF-8"?> +<subcollections> + <subcollection> + <name>nutch</name> + <id>lucene</id> + <whitelist>http://lucene.apache.org/nutch</whitelist> + <whitelist>http://wiki.apache.org/nutch/</whitelist> + <blacklist /> + </subcollection> +</subcollections> +</pre> +</p> +<p>Regardless of this configuration, you can still crawl any urls +as long as they pass through your global url filters. 
(note that +you must also seed your urls in normal nutch way) +</p> +</body> +</html> Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=411904&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (added) +++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Mon Jun 5 13:12:48 2006 @@ -0,0 +1,74 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexer.subcollection; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.UTF8; +import org.apache.hadoop.util.LogFormatter; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.util.NutchConfiguration; + +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; + +import org.apache.nutch.collection.CollectionManager; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import java.util.logging.Logger; + +public class SubcollectionIndexingFilter extends Configured implements IndexingFilter { + + public SubcollectionIndexingFilter(){ + super(NutchConfiguration.create()); + } + + public SubcollectionIndexingFilter(Configuration conf) { + super(conf); + } + + /** + * Doc field name + */ + public static final String FIELD_NAME = "subcollection"; + + /** + * Logger + */ + public static final Logger LOG = LogFormatter + .getLogger(SubcollectionIndexingFilter.class.getName()); + + /** + * "Mark" document to be a part of subcollection + * + * @param doc + * @param url + */ + private void addSubCollectionField(Document doc, String url) { + String collname = CollectionManager.getCollectionManager(getConf()).getSubCollections(url); + doc.add(new Field(FIELD_NAME, collname, Field.Store.YES, Field.Index.TOKENIZED)); + } + + public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { + String sUrl = url.toString(); + addSubCollectionField(doc, sUrl); + return doc; + } +} Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java?rev=411904&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java (added) +++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java Mon Jun 5 13:12:48 2006 @@ -0,0 +1,37 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.searcher.subcollection; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.indexer.subcollection.SubcollectionIndexingFilter; +import org.apache.nutch.searcher.RawFieldQueryFilter; + +/** Handles "collection:" query clauses, causing them to search the "collection" field + * indexed by SubcollectionINdexingFilter. 
*/ +public class SubcollectionQueryFilter extends RawFieldQueryFilter { + public SubcollectionQueryFilter() { + super(SubcollectionIndexingFilter.FIELD_NAME); + } + + public void setConf(Configuration conf) { + // nothing to configure + } + + public Configuration getConf() { + // nothing configured + return null; + } +} Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java?rev=411904&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java (added) +++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java Mon Jun 5 13:12:48 2006 @@ -0,0 +1,93 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.util; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.UnsupportedEncodingException; + +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import org.apache.xerces.parsers.DOMParser; +import org.w3c.dom.Element; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +public class DomUtil { + + /** + * Returns parsed dom tree or null if any error + * + * @param is + * @return + */ + public static Element getDom(InputStream is) { + + Element element = null; + + DOMParser parser = new DOMParser(); + + InputSource input; + try { + input = new InputSource(is); + input.setEncoding("UTF-8"); + parser.parse(input); + element = (Element) parser.getDocument().getChildNodes().item(0); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (SAXException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return element; + } + + /** + * save dom into ouputstream + * + * @param os + * @param e + */ + public static void saveDom(OutputStream os, Element e) { + + DOMSource source = new DOMSource(e); + TransformerFactory transFactory = TransformerFactory.newInstance(); + Transformer transformer; + try { + transformer = transFactory.newTransformer(); + transformer.setOutputProperty("indent", "yes"); + StreamResult result = new StreamResult(os); + transformer.transform(source, result); + os.flush(); + } catch (UnsupportedEncodingException e1) { + e1.printStackTrace(); + } catch (IOException e1) { + e1.printStackTrace(); + } catch (TransformerConfigurationException e2) { + e2.printStackTrace(); + } catch (TransformerException ex) { + ex.printStackTrace(); + } 
+ } +} _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs