Author: ivol37 at gmail.com
Date: Wed Jan 26 12:25:34 2011
New Revision: 709
Log:
Added:
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/regex-urlfilter.txt
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/lib/
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/lib/plugins.zip
(contents, props changed)
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/seed.txt
Modified:
sandbox/ivol/amdatu-searchandindex/nutch/pom.xml
sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml
Modified: sandbox/ivol/amdatu-searchandindex/nutch/pom.xml
==============================================================================
--- sandbox/ivol/amdatu-searchandindex/nutch/pom.xml (original)
+++ sandbox/ivol/amdatu-searchandindex/nutch/pom.xml Wed Jan 26 12:25:34 2011
@@ -70,6 +70,13 @@
<scope>compile</scope>
<type>jar</type>
</dependency>
+ <dependency>
+ <groupId>commons-cli</groupId>
+ <artifactId>commons-cli</artifactId>
+ <version>1.2</version>
+ <scope>compile</scope>
+ <type>jar</type>
+ </dependency>
</dependencies>
<build>
Modified:
sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java
==============================================================================
---
sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java
(original)
+++
sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java
Wed Jan 26 12:25:34 2011
@@ -1,24 +1,28 @@
package org.amdatu.searchandindex.nutch.impl;
+import java.io.BufferedOutputStream;
import java.io.File;
+import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.OutputStream;
import java.net.URL;
-import java.util.Date;
+import java.net.URLConnection;
import java.util.Enumeration;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
import org.amdatu.core.config.templates.ConfigTemplateManager;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.searcher.Hit;
-import org.apache.nutch.searcher.HitDetails;
-import org.apache.nutch.searcher.Hits;
-import org.apache.nutch.searcher.NutchBean;
-import org.apache.nutch.searcher.Query;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.Injector;
import org.apache.nutch.util.NutchConfiguration;
import org.osgi.framework.BundleContext;
import org.osgi.service.log.LogService;
public class NutchIndex {
+ // Private constant for buffer size.
+ private static final int BUFFER_SIZE = 40 * 1024; // Blocks of 40 Kb each
+
// Services injected by the Felix dependency manager
private volatile LogService m_logService;
private volatile BundleContext m_bundleContext;
@@ -47,6 +51,22 @@
}
}
+ File pluginsDir = new File(m_workDir, "plugins");
+ if (!pluginsDir.exists()) {
+ // Unzip plugins.zip to this directory
+ pluginsDir.mkdir();
+ URL pluginsZip =
m_bundleContext.getBundle().getResource("lib/plugins.zip");
+ unzip(pluginsZip, pluginsDir);
+
+ }
+
+ // Prepare seeds
+ File urlsDir = new File(m_workDir, "urls");
+ urlsDir.mkdir();
+ File seedFile = new File(urlsDir, "seed.txt");
+ URL seed = m_bundleContext.getBundle().getResource("seed.txt");
+ m_configTemplateManager.writeConfiguration(seed, seedFile);
+
System.setProperty("nutch.site.plugin.folders", new
File(m_workDir, "plugins").getAbsolutePath());
System.setProperty("nutch.site.searcher.dir", new File(m_workDir,
"crawl").getAbsolutePath());
@@ -67,42 +87,10 @@
try {
System.setProperty("hadoop.log.dir",
m_workDir.getAbsolutePath());
- // define a keyword for the search
- String nutchSearchString = "smart";
+ // Inject URL into crawl db
+ inject();
- // configure nutch
- Configuration nutchConf = NutchConfiguration.create();
- NutchBean nutchBean = new NutchBean(nutchConf);
-
- // build the query
- Query nutchQuery = Query.parse(nutchSearchString, nutchConf);
- // optionally specify the maximum number of hits (default is
10)
- // nutchQuery.getParams().setNumHits(100);
- // nutchQuery.getParams().setMaxHitsPerDup(100);
- Hits nutchHits = nutchBean.search(nutchQuery);
-
- // display the number of hits
- System.out.println("Found " + nutchHits.getLength() + "
hits.\n");
-
- // get the details about each hit (includes title, URL, a
summary
- // and the date when this was fetched)
- for (int i = 0; i < nutchHits.getLength(); i++) {
- Hit hit = nutchHits.getHit(i);
- HitDetails details = nutchBean.getDetails(hit);
- String title = details.getValue("title");
- String url = details.getValue("url");
- String summary = nutchBean.getSummary(details, nutchQuery)
- .toString();
- System.out.println("Title is: " + title);
- System.out.println("(" + url + ")");
- Date date = new Date(nutchBean.getFetchDate(details));
- System.out.println("Date Fetched: " + date);
- System.out.println(summary + "\n");
-
System.out.println("----------------------------------------");
- }
- // as usually, don't forget to close the resources
- nutchBean.close();
} catch (Throwable e) {
e.printStackTrace();
}
@@ -112,4 +100,92 @@
}
}
+ /**
+ * Unzips the file to the specified target directory.
+ * @param zipFile The zipfile to unzip
+ * @param targetDir The directory in which the resulting files should be stored
+ * @exception IOException If not all files could be saved
+ */
+ public static void unzip(URL zipFile, File targetDir) throws IOException {
+ // Create the buffer for streaming the contents
+
+ // Create the zip input stream
+ ZipInputStream zip = null;
+ try {
+ URLConnection urlConnection = zipFile.openConnection();
+
+ zip = new ZipInputStream(urlConnection.getInputStream());
+ // Process each entry
+ ZipEntry entry;
+ while ((entry = zip.getNextEntry()) != null) {
+ // Get and normalize the path
+ String path = entry.getName();
+ path = path.replace('/', File.separatorChar);
+ path = path.replace('\\', File.separatorChar);
+ File target = new File(targetDir, path);
+ // Check whether the target is a file or directory
+ if (entry.isDirectory()) {
+ target.mkdirs();
+ } else {
+ writeZipEntryToFile(zip, target);
+ }
+
+ // Set the last modified to the date specified in the zip
+ long time = entry.getTime();
+ if (time != -1) {
+ target.setLastModified(time);
+ }
+ }
+ } finally {
+ if (zip != null) {
+ zip.close();
+ }
+ }
+ }
+
+ /**
+ * Writes the content of the zip entry to the specified file.
+ * @param zip The zip input stream, positioned at the entry to write
+ * @param target The file to write to
+ * @throws IOException In case an I/O exception occurs
+ */
+ private static void writeZipEntryToFile(ZipInputStream zip, File target)
throws IOException {
+ // Create the parent directory
+ File parent = target.getParentFile();
+ if (!parent.exists()) {
+ parent.mkdirs();
+ }
+ // Stream the contents to the target file
+ int bytes;
+ byte[] buffer = new byte[BUFFER_SIZE];
+ OutputStream out = null;
+ try {
+ out = new BufferedOutputStream(new FileOutputStream(target),
BUFFER_SIZE);
+ while ((bytes = zip.read(buffer, 0, buffer.length)) != -1) {
+ out.write(buffer, 0, bytes);
+ }
+ } catch (IOException ex) {
+ throw ex;
+ } finally {
+ if (out != null) {
+ out.close();
+ }
+ }
+ }
+
+ private void inject() {
+ String[] args = new String[]{"crawl/crawldb", "urls"};
+ try {
+ // Doing this you will find out that Nutch is not supported on
Windows. You will need
+ // to install Cygwin to be able to use it.
+ int res = ToolRunner.run(NutchConfiguration.create(), new
Injector(), args);
+ }
+ catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+
+ }
+
+
}
Modified:
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml
==============================================================================
---
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml
(original)
+++
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml
Wed Jan 26 12:25:34 2011
@@ -2,15 +2,28 @@
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
- <property>
- <name>plugin.folders</name>
- <value>${nutch.site.plugin.folders}</value>
- <description />
- </property>
+ <property>
+ <name>plugin.folders</name>
+ <value>${nutch.site.plugin.folders}</value>
+ <description />
+ </property>
+ <property>
+ <name>searcher.dir</name>
+ <value>${nutch.site.searcher.dir}</value>
+ <description />
+ </property>
+
+ <property>
+ <name>http.agent.name</name>
+ <value>nutch-solr-integration</value>
+ </property>
+ <property>
+ <name>generate.max.per.host</name>
+ <value>100</value>
+ </property>
+ <property>
+ <name>plugin.includes</name>
+
<value>protocol-http|urlfilter-regex|parse-html|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+ </property>
- <property>
- <name>searcher.dir</name>
- <value>${nutch.site.searcher.dir}</value>
- <description />
- </property>
</configuration>
\ No newline at end of file
Added:
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/regex-urlfilter.txt
==============================================================================
--- (empty file)
+++
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/regex-urlfilter.txt
Wed Jan 26 12:25:34 2011
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# The default url filter.
+# Better for whole-internet crawling.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'. The first matching pattern in the file
+# determines whether a URL is included or ignored. If no pattern
+# matches, the URL is ignored.
+
+-^(https|telnet|file|ftp|mailto):
+
+# skip some suffixes
+-\.(swf|SWF|doc|DOC|mp3|MP3|WMV|wmv|txt|TXT|rtf|RTF|avi|AVI|m3u|M3U|flv|FLV|WAV|wav|mp4|MP4|avi|AVI|rss|RSS|xml|XML|pdf|PDF|js|JS|gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# allow urls in the lucidimagination.com domain
++^http://([a-z0-9\-A-Z]*\.)*lucidimagination.com/
+
+# deny anything else
+-.
\ No newline at end of file
Added:
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/lib/plugins.zip
==============================================================================
Binary file. No diff available.
Added: sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/seed.txt
==============================================================================
--- (empty file)
+++ sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/seed.txt
Wed Jan 26 12:25:34 2011
@@ -0,0 +1 @@
+http://www.gxsoftware.com/index-nl.htm
\ No newline at end of file