Author: ivol37 at gmail.com Date: Wed Jan 26 10:59:13 2011 New Revision: 708
Log: Added: sandbox/ivol/amdatu-searchandindex/nutch/ sandbox/ivol/amdatu-searchandindex/nutch/install_nutch.bat sandbox/ivol/amdatu-searchandindex/nutch/nutch-1.2.jar (contents, props changed) sandbox/ivol/amdatu-searchandindex/nutch/pom.xml sandbox/ivol/amdatu-searchandindex/nutch/src/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/osgi/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/osgi/Activator.java sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/ sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/common-terms.utf8 sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml Added: sandbox/ivol/amdatu-searchandindex/nutch/install_nutch.bat ============================================================================== --- (empty file) +++ sandbox/ivol/amdatu-searchandindex/nutch/install_nutch.bat Wed Jan 26 10:59:13 2011 @@ -0,0 +1 @@ +mvn install:install-file -Dfile=nutch-1.2.jar -DgroupId=org.apache.nutch -DartifactId=nutch -Dversion=1.2 -Dpackaging=jar Added: sandbox/ivol/amdatu-searchandindex/nutch/nutch-1.2.jar ============================================================================== Binary file. No diff available. 
Added: sandbox/ivol/amdatu-searchandindex/nutch/pom.xml ============================================================================== --- (empty file) +++ sandbox/ivol/amdatu-searchandindex/nutch/pom.xml Wed Jan 26 10:59:13 2011 @@ -0,0 +1,116 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.amdatu</groupId> + <artifactId>org.amdatu.searchandindex</artifactId> + <version>0.1.0-SNAPSHOT</version> + </parent> + <groupId>org.amdatu.searchandindex</groupId> + <artifactId>nutch</artifactId> + <packaging>bundle</packaging> + <name>Amdatu Search and Index - Nutch Service</name> + <version>${platform.version}</version> + + <dependencies> + <dependency> + <groupId>org.amdatu.searchandindex</groupId> + <artifactId>solr</artifactId> + <version>${platform.version}</version> + <scope>provided</scope> + <type>bundle</type> + </dependency> + <dependency> + <groupId>org.amdatu.core.config</groupId> + <artifactId>templates</artifactId> + <version>${platform.version}</version> + <scope>provided</scope> + <type>bundle</type> + </dependency> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch</artifactId> + <version>1.2</version> + <scope>compile</scope> + <type>jar</type> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-core</artifactId> + <version>0.20.2</version> + <scope>compile</scope> + <type>jar</type> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + <version>3.0.1</version> + <scope>compile</scope> + <type>jar</type> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-misc</artifactId> + <version>3.0.1</version> + <scope>compile</scope> + <type>jar</type> + 
</dependency> + <dependency> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + <version>2.1</version> + <scope>compile</scope> + <type>jar</type> + </dependency> + <dependency> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <version>1.0.4</version> + <scope>compile</scope> + <type>jar</type> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.felix</groupId> + <artifactId>maven-bundle-plugin</artifactId> + <extensions>true</extensions> + <configuration> + <instructions> + <Bundle-Activator>org.amdatu.searchandindex.nutch.osgi.Activator</Bundle-Activator> + <Bundle-SymbolicName>org.amdatu.searchandindex.nutch</Bundle-SymbolicName> + <Embed-Dependency>*;scope=compile</Embed-Dependency> + <Import-Package> + *;resolution:=optional + </Import-Package> + <Bundle-ClassPath> + conf, + . + </Bundle-ClassPath> + </instructions> + </configuration> + </plugin> + <plugin> + <artifactId>maven-antrun-plugin</artifactId> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>run</goal> + </goals> + <configuration> + <tasks> + <copy + file="${project.build.directory}/${project.build.finalName}.jar" + todir="D:\Amdatu-svn\trunk\amdatu-release\target\org.amdatu.amdatu-release-0.1.0-SNAPSHOT-dev\deploy" overwrite="true" /> + </tasks> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> Added: sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java ============================================================================== --- (empty file) +++ sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java Wed Jan 26 10:59:13 2011 @@ -0,0 +1,115 @@ +package org.amdatu.searchandindex.nutch.impl; + + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.Date; +import java.util.Enumeration; + 
+import org.amdatu.core.config.templates.ConfigTemplateManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.searcher.Hit; +import org.apache.nutch.searcher.HitDetails; +import org.apache.nutch.searcher.Hits; +import org.apache.nutch.searcher.NutchBean; +import org.apache.nutch.searcher.Query; +import org.apache.nutch.util.NutchConfiguration; +import org.osgi.framework.BundleContext; +import org.osgi.service.log.LogService; + +public class NutchIndex { + // Services injected by the Felix dependency manager + private volatile LogService m_logService; + private volatile BundleContext m_bundleContext; + private volatile ConfigTemplateManager m_configTemplateManager; + + private File m_workDir; + + public void init() { + try { + m_logService.log(LogService.LOG_INFO, "Initializing Solr configuration"); + + File workBaseDir = new File(System.getProperty("user.dir"), "work"); + m_workDir = new File(workBaseDir, "nutch"); + m_workDir.mkdirs(); + + // Find all entries in our 'conf' directory. 
+ final Enumeration<URL> resources = m_bundleContext.getBundle().findEntries("conf", "*.*", true); + if (resources != null) { + while (resources.hasMoreElements()) { + final URL resource = resources.nextElement(); + File coreConfFile = new File(m_workDir, resource.getFile().replace("conf/", "")); + if (!coreConfFile.exists()) { + // Only write this file if it does not yet exist + m_configTemplateManager.writeConfiguration(resource, coreConfFile); + } + } + } + + System.setProperty("nutch.site.plugin.folders", new File(m_workDir, "plugins").getAbsolutePath()); + System.setProperty("nutch.site.searcher.dir", new File(m_workDir, "crawl").getAbsolutePath()); + + } catch (IOException e) { + m_logService.log(LogService.LOG_ERROR, "Could not replace configuration entries in storage-conf.xml", e); + } + } + + public void start() { + // Save the current context classloader first + final ClassLoader contextClassLoader = Thread.currentThread().getContextClassLoader(); + try { + + // overwrite the context classloader, restore it later + final ClassLoader classLoader = this.getClass().getClassLoader(); + Thread.currentThread().setContextClassLoader(classLoader); + + try { + System.setProperty("hadoop.log.dir", m_workDir.getAbsolutePath()); + + // define a keyword for the search + String nutchSearchString = "smart"; + + // configure nutch + Configuration nutchConf = NutchConfiguration.create(); + NutchBean nutchBean = new NutchBean(nutchConf); + + // build the query + Query nutchQuery = Query.parse(nutchSearchString, nutchConf); + // optionally specify the maximum number of hits (default is 10) + // nutchQuery.getParams().setNumHits(100); + // nutchQuery.getParams().setMaxHitsPerDup(100); + Hits nutchHits = nutchBean.search(nutchQuery); + + // display the number of hits + System.out.println("Found " + nutchHits.getLength() + " hits.\n"); + + // get the details about each hit (includes title, URL, a summary + // and the date when this was fetched) + for (int i = 0; i < 
nutchHits.getLength(); i++) { + Hit hit = nutchHits.getHit(i); + HitDetails details = nutchBean.getDetails(hit); + String title = details.getValue("title"); + String url = details.getValue("url"); + String summary = nutchBean.getSummary(details, nutchQuery) + .toString(); + System.out.println("Title is: " + title); + System.out.println("(" + url + ")"); + Date date = new Date(nutchBean.getFetchDate(details)); + System.out.println("Date Fetched: " + date); + System.out.println(summary + "\n"); + System.out.println("----------------------------------------"); + } + + // as usual, don't forget to close the resources + nutchBean.close(); + } catch (Throwable e) { + e.printStackTrace(); + } + } finally { + // Restore classloader + Thread.currentThread().setContextClassLoader(contextClassLoader); + } + } + +} Added: sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/osgi/Activator.java ============================================================================== --- (empty file) +++ sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/osgi/Activator.java Wed Jan 26 10:59:13 2011 @@ -0,0 +1,50 @@ +/* + Copyright (C) 2010 Amdatu.org + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +package org.amdatu.searchandindex.nutch.osgi; + +import org.amdatu.core.config.templates.ConfigTemplateManager; +import org.amdatu.searchandindex.nutch.impl.NutchIndex; +import org.amdatu.searchandindex.solr.SolrRestService; +import org.apache.felix.dm.DependencyActivatorBase; +import org.apache.felix.dm.DependencyManager; +import org.osgi.framework.BundleContext; +import org.osgi.service.log.LogService; + +/** + * This is the OSGi activator for the Nutch index bundle. + * + * @author ivol + */ +public class Activator extends DependencyActivatorBase { + /** + * The resource identifier for this bundle. Resources are only considered to be 'ours' when they are prefixed with this + * id. + */ + public static final String RESOURCE_ID = "index"; + + public void init(BundleContext context, DependencyManager manager) throws Exception { + // Create and register the NutchIndex component with its required service dependencies. + manager.add(createComponent() + .setImplementation(NutchIndex.class) + .add(createServiceDependency().setService(LogService.class).setRequired(true)) + .add(createServiceDependency().setService(ConfigTemplateManager.class).setRequired(true)) + .add(createServiceDependency().setService(SolrRestService.class).setRequired(true))); + } + + public void destroy(BundleContext context, DependencyManager manager) throws Exception { + } +} Added: sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/common-terms.utf8 ============================================================================== --- (empty file) +++ sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/common-terms.utf8 Wed Jan 26 10:59:13 2011 @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Common terms and phrases which will be indexed in n-grams +# in order to optimize search. +content:a +content:and +content:for +content:in +content:of +content:the +content:to +url:com +url:http +url:http-www +url:www Added: sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml ============================================================================== --- (empty file) +++ sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml Wed Jan 26 10:59:13 2011 @@ -0,0 +1,16 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> + +<configuration> + <property> + <name>plugin.folders</name> + <value>${nutch.site.plugin.folders}</value> + <description /> + </property> + + <property> + <name>searcher.dir</name> + <value>${nutch.site.searcher.dir}</value> + <description /> + </property> +</configuration> \ No newline at end of file
