[Amdatu-commits] SVN r709 - in sandbox/ivol/amdatu-searchandindex/nutch: . src/main/java/org/amdatu/searchandindex/nutch/impl src/main/resources src/main/resources/conf src/main/resources/lib

[email protected] Wed, 26 Jan 2011 12:25:34 +0100

Author: ivol37 at gmail.com
Date: Wed Jan 26 12:25:34 2011
New Revision: 709


Log:


Added:
   
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/regex-urlfilter.txt
   sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/lib/
   sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/lib/plugins.zip  
 (contents, props changed)
   sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/seed.txt
Modified:
   sandbox/ivol/amdatu-searchandindex/nutch/pom.xml
   
sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java
   
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml

Modified: sandbox/ivol/amdatu-searchandindex/nutch/pom.xml
==============================================================================
--- sandbox/ivol/amdatu-searchandindex/nutch/pom.xml    (original)
+++ sandbox/ivol/amdatu-searchandindex/nutch/pom.xml    Wed Jan 26 12:25:34 2011
@@ -70,6 +70,13 @@
       <scope>compile</scope>
       <type>jar</type>
     </dependency>
+    <dependency>
+      <groupId>commons-cli</groupId>
+      <artifactId>commons-cli</artifactId>
+      <version>1.2</version>
+      <scope>compile</scope>
+      <type>jar</type>
+    </dependency>
   </dependencies>
 
   <build>

Modified: 
sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java
==============================================================================
--- 
sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java
 (original)
+++ 
sandbox/ivol/amdatu-searchandindex/nutch/src/main/java/org/amdatu/searchandindex/nutch/impl/NutchIndex.java
 Wed Jan 26 12:25:34 2011
@@ -1,24 +1,28 @@
 package org.amdatu.searchandindex.nutch.impl;
 
 
+import java.io.BufferedOutputStream;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.OutputStream;
 import java.net.URL;
-import java.util.Date;
+import java.net.URLConnection;
 import java.util.Enumeration;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
 
 import org.amdatu.core.config.templates.ConfigTemplateManager;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.searcher.Hit;
-import org.apache.nutch.searcher.HitDetails;
-import org.apache.nutch.searcher.Hits;
-import org.apache.nutch.searcher.NutchBean;
-import org.apache.nutch.searcher.Query;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.Injector;
 import org.apache.nutch.util.NutchConfiguration;
 import org.osgi.framework.BundleContext;
 import org.osgi.service.log.LogService;
 
 public class NutchIndex {
+    // Private constant for buffer size.
+    private static final int BUFFER_SIZE = 40 * 1024; // Blocks of 40 Kb each
+
     // Services injected by the Felix dependency manager
     private volatile LogService m_logService;
     private volatile BundleContext m_bundleContext;
@@ -47,6 +51,22 @@
                 }
             }
 
+            File pluginsDir = new File(m_workDir, "plugins");
+            if (!pluginsDir.exists()) {
+                // Unzip plugins.zip to this directory
+                pluginsDir.mkdir();
+                URL pluginsZip = 
m_bundleContext.getBundle().getResource("lib/plugins.zip");
+                unzip(pluginsZip, pluginsDir);
+
+            }
+
+            // Prepare seeds
+            File urlsDir = new File(m_workDir, "urls");
+            urlsDir.mkdir();
+            File seedFile = new File(urlsDir, "seed.txt");
+            URL seed = m_bundleContext.getBundle().getResource("seed.txt");
+            m_configTemplateManager.writeConfiguration(seed, seedFile);
+
             System.setProperty("nutch.site.plugin.folders", new 
File(m_workDir, "plugins").getAbsolutePath());
             System.setProperty("nutch.site.searcher.dir", new File(m_workDir, 
"crawl").getAbsolutePath());
 
@@ -67,42 +87,10 @@
             try {
                 System.setProperty("hadoop.log.dir", 
m_workDir.getAbsolutePath());
 
-                // define a keyword for the search
-                String nutchSearchString = "smart";
+                // Inject URL into crawl db
+                inject();
 
-                // configure nutch
-                Configuration nutchConf = NutchConfiguration.create();
-                NutchBean nutchBean = new NutchBean(nutchConf);
-
-                // build the query
-                Query nutchQuery = Query.parse(nutchSearchString, nutchConf);
-                // optionally specify the maximum number of hits (default is 
10)
-                // nutchQuery.getParams().setNumHits(100);
-                // nutchQuery.getParams().setMaxHitsPerDup(100);
-                Hits nutchHits = nutchBean.search(nutchQuery);
-
-                // display the number of hits
-                System.out.println("Found " + nutchHits.getLength() + " 
hits.\n");
-
-                // get the details about each hit (includes title, URL, a 
summary
-                // and the date when this was fetched)
-                for (int i = 0; i < nutchHits.getLength(); i++) {
-                    Hit hit = nutchHits.getHit(i);
-                    HitDetails details = nutchBean.getDetails(hit);
-                    String title = details.getValue("title");
-                    String url = details.getValue("url");
-                    String summary = nutchBean.getSummary(details, nutchQuery)
-                    .toString();
-                    System.out.println("Title is: " + title);
-                    System.out.println("(" + url + ")");
-                    Date date = new Date(nutchBean.getFetchDate(details));
-                    System.out.println("Date Fetched: " + date);
-                    System.out.println(summary + "\n");
-                    
System.out.println("----------------------------------------");
-                }
 
-                // as usually, don't forget to close the resources
-                nutchBean.close();
             } catch (Throwable e) {
                 e.printStackTrace();
             }
@@ -112,4 +100,92 @@
         }
     }
 
+    /**
+     * Unzips the file to the specified target directory.
+     * @param zipFile The zipfile to unzip
+     * @param targetDir The directory in which the resulting files should be 
stored
+     * @exception IOException If not all files could be saved
+     */
+    public static void unzip(URL zipFile, File targetDir) throws IOException {
+        // Create the buffer for streaming the contents
+
+        // Create the zip input stream
+        ZipInputStream zip = null;
+        try {
+            URLConnection urlConnection = zipFile.openConnection();
+
+            zip = new ZipInputStream(urlConnection.getInputStream());
+            // Process each entry
+            ZipEntry entry;
+            while ((entry = zip.getNextEntry()) != null) {
+                // Get and normalize the path
+                String path = entry.getName();
+                path = path.replace('/', File.separatorChar);
+                path = path.replace('\\', File.separatorChar);
+                File target = new File(targetDir, path);
+                // Check whether the target is a file or directory
+                if (entry.isDirectory()) {
+                    target.mkdirs();
+                } else {
+                    writeZipEntryToFile(zip, target);
+                }
+
+                // Set the last modified to the date specified in the zip
+                long time = entry.getTime();
+                if (time != -1) {
+                    target.setLastModified(time);
+                }
+            }
+        } finally {
+            if (zip != null) {
+                zip.close();
+            }
+        }
+    }
+
+    /**
+     * Writes the content of the zip entry to the specified file.
+     * @param zip The zip input stream, positioned at the entry to write
+     * @param target The file to write to
+     * @throws IOException In case an I/O exception occurs
+     */
+    private static void writeZipEntryToFile(ZipInputStream zip, File target) 
throws IOException {
+        // Create the parent directory
+        File parent = target.getParentFile();
+        if (!parent.exists()) {
+            parent.mkdirs();
+        }
+        // Stream the contents to the target file
+        int bytes;
+        byte[] buffer = new byte[BUFFER_SIZE];
+        OutputStream out = null;
+        try {
+            out = new BufferedOutputStream(new FileOutputStream(target), 
BUFFER_SIZE);
+            while ((bytes = zip.read(buffer, 0, buffer.length)) != -1) {
+                out.write(buffer, 0, bytes);
+            }
+        } catch (IOException ex) {
+            throw ex;
+        } finally {
+            if (out != null) {
+                out.close();
+            }
+        }
+    }
+
+    private void inject() {
+        String[] args = new String[]{"crawl/crawldb", "urls"};
+        try {
+            // Doing this you will find out that Nutch is not supported on 
Windows. You will need
+            // to install Cygwin to be able to use it.
+            int res = ToolRunner.run(NutchConfiguration.create(), new 
Injector(), args);
+        }
+        catch (Exception e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+        }
+
+    }
+
+
 }

Modified: 
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml
==============================================================================
--- 
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml 
    (original)
+++ 
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/nutch-site.xml 
    Wed Jan 26 12:25:34 2011
@@ -2,15 +2,28 @@
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 
 <configuration>
-        <property>
-                <name>plugin.folders</name>
-                <value>${nutch.site.plugin.folders}</value>
-                <description />
-        </property>
+  <property>
+    <name>plugin.folders</name>
+    <value>${nutch.site.plugin.folders}</value>
+    <description />
+  </property>
+  <property>
+    <name>searcher.dir</name>
+    <value>${nutch.site.searcher.dir}</value>
+    <description />
+  </property>
+
+  <property>
+    <name>http.agent.name</name>
+    <value>nutch-solr-integration</value>
+  </property>
+  <property>
+    <name>generate.max.per.host</name>
+    <value>100</value>
+  </property>
+  <property>
+    <name>plugin.includes</name>
+    
<value>protocol-http|urlfilter-regex|parse-html|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+  </property>
 
-        <property>
-                <name>searcher.dir</name>
-                <value>${nutch.site.searcher.dir}</value>
-                <description />
-        </property>
 </configuration>
\ No newline at end of file

Added: 
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/regex-urlfilter.txt
==============================================================================
--- (empty file)
+++ 
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/conf/regex-urlfilter.txt
        Wed Jan 26 12:25:34 2011
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# The default url filter.
+# Better for whole-internet crawling.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+-^(https|telnet|file|ftp|mailto):
+
+# skip some suffixes
+-\.(swf|SWF|doc|DOC|mp3|MP3|WMV|wmv|txt|TXT|rtf|RTF|avi|AVI|m3u|M3U|flv|FLV|WAV|wav|mp4|MP4|avi|AVI|rss|RSS|xml|XML|pdf|PDF|js|JS|gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# allow urls in the lucidimagination.com domain
++^http://([a-z0-9\-A-Z]*\.)*lucidimagination.com/
+
+# deny anything else
+-.
\ No newline at end of file

Added: 
sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/lib/plugins.zip
==============================================================================
Binary file. No diff available.

Added: sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/seed.txt
==============================================================================
--- (empty file)
+++ sandbox/ivol/amdatu-searchandindex/nutch/src/main/resources/seed.txt        
Wed Jan 26 12:25:34 2011
@@ -0,0 +1 @@
+http://www.gxsoftware.com/index-nl.htm
\ No newline at end of file

[Amdatu-commits] SVN r709 - in sandbox/ivol/amdatu-searchandindex/nutch: . src/main/java/org/amdatu/searchandindex/nutch/impl src/main/resources src/main/resources/conf src/main/resources/lib

Reply via email to