Hello,
I'm attempting to use Nutch (2.2.1, one node, local HBase backend, crawl script
edited to push to Elasticsearch instead of Solr) to crawl a filesystem, but I
want to only crawl a specific directory and I want to exclude specific
subdirectories. To do this, I'm trying to use regex-urlfilter.txt to set
directories to (in|ex)clude. When I run the Nutch crawl script with this
configuration, it indexes the seed directory only, and stops. As far as I could
determine on my own, this set of regexes should work. What have I gotten wrong?
Thanks,
David
--- Seed URL ---
file:///home/user/IT%20DOCUMENTATION/
--- regex-urlfilter.txt ---
# skip file: ftp: and mailto: urls
#-^(file|ftp|mailto):
# skip http: ftp: and mailto: urls
-^(http|https|ftp|mailto):
# skip image and other suffixes we can't yet parse
# for a more extensive coverage use the urlfilter-suffix plugin
#-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
-\.(ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|mpg|MPG|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|js|JS|lnk|jar|class)$
# skip URLs containing certain characters as probable queries, etc.
-[?*!@=]
# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
-.*(/[^/]+)/[^/]+\1/[^/]+\1/
# accept anything else
#+.
# customized filter version
-^file:///home/user/IT(%20|\s)DOCUMENTATION/excludeme
+^file:///home/user/IT(%20|\s)DOCUMENTATION/
-.
--- nutch-site.xml ---
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>storage.data.store.class</name>
<value>org.apache.gora.hbase.store.HBaseStore</value>
<description>Default class for storing data</description>
</property>
<property>
<name>plugin.folders</name>
<value>/home/user/Nutch/plugins</value>
</property>
<property>
<name>http.agent.name</name>
<value>Mybot</value>
</property>
<property>
<name>http.robots.agents</name>
<value>Mybot,MybotDev,*</value>
</property>
<property>
<name>elastic.index</name>
<value>drive</value>
</property>
<property>
<name>plugin.includes</name>
<value>protocol-file|protocol-http|urlfilter-regex|url-filter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
</property>
<property>
<name>file.content.limit</name>
<value>-1</value>
</property>
<property>
<name>file.crawl.parent</name>
<value>false</value>
</property>
<property>
<name>fetcher.server.delay</name>
<value>0.25</value>
</property>
<property>
<name>db.fetch.interval.default</name>
<value>604800</value>
<description>Crawl each file every week by default.</description>
</property>
<property>
<name>urlfilter.order</name>
<value>org.apache.nutch.urlfilter.regex.RegexURLFilter</value>
</property>
</configuration>