Author: mattmann
Date: Wed Jul  1 04:22:01 2015
New Revision: 1688549

URL: http://svn.apache.org/r1688549
Log:
Updates to make tests pass related to NUTCH-2038: Naive Bayes classifier-based
HTML parse filter (for filtering outlinks). This closes #42.

Added:
    nutch/trunk/conf/naivebayes-train.txt.template
    nutch/trunk/conf/naivebayes-wordlist.txt.template
Modified:
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/default.properties
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml

Added: nutch/trunk/conf/naivebayes-train.txt.template
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/naivebayes-train.txt.template?rev=1688549&view=auto
==============================================================================
--- nutch/trunk/conf/naivebayes-train.txt.template (added)
+++ nutch/trunk/conf/naivebayes-train.txt.template Wed Jul  1 04:22:01 2015
@@ -0,0 +1,6 @@
+1      Subject energy finance conference presentations available fyi  you can 
now retrieve most all the speaker presentations of the 2001 energy finance 
conference  feb 22  23  from our website at http    cefer  bus  utexas  edu  
with the exception of presentations made by john mccormack  peter nance  
sailesh ramamurtie  and ehud ronn  anoop kapoor  which i hope to still receive  
sincerely  angela               angela dorsey assistant director center for 
energy finance education  research the university of texas at austin department 
of finance  cba 6  222 austin  tx 78712 angela  dorsey  bus  utexas  edu        
      
+1      Subject re  meter  6009  lundell ranch c  p  gato creek fyi from  
robert cotten  ect 11  17  2000 10  56 am to  vance l taylor  hou  ect  ect cc  
pat clynes  corp  enron  enron  o  neal d winfree  hou  ect  ect subject  meter 
 6009  lundell ranch c  p  gato creek vance  it appears the actual volumes have 
been significantly higher than nominations at the subject meter the past 
several months  the following represents activity during the months of june 
through september  gas month total nom mmbtu total actual mmbtu 06  2000 19  
680 116  040 07  2000 19  933 128  755 08  2000 19  530 136  845 09  2000 18  
540 159  935 deal  135708  calpine natural gas company  is the only activity at 
this meter  should we adjust the nomination to more closely resemble the actual 
volume  please advise  thanks  bob
+0      Subject fw  malowney promotion from tim belden one more promo doc       
original message      from  foster  chris h   mailto  chris  h  foster  enron  
com  sent  wednesday  july 18  2001 1  44 pm to  tbelden  nwlink  com cc  
fitzpatrick  amy subject  malowney promotion tim  here is a write  up on 
malowney  i tried contacting him today so he could review it  but he has not 
called me back  nevertheless  i think i got the most of it  let me know if this 
meets your needs   john malowneypromo  doc
+1      Subject re  personal information needs to be updated janet  please 
submit this name change to the tpc as soon as possible  thanks  hgm susan 
wimberley  ect 11  07  2000 02  45 pm to  hector mcloughlin  corp  enron  enron 
cc  dfarmer  enron  com  enron subject  re  personal information needs to be 
updated once this is fixed jerry d to farmer  j daren
+0      Subject re  confidential sophie  i think it  s a fair deal  vince 
sophie kingsley 08  30  2000 11  49 am to  dale surbey  lon  ect  ect cc  vince 
j kaminski  hou  ect  ect  michele small  lon  ect  ect subject  re  
confidential both  thanks for your comments and comparisons  it is good to get 
context  based on your commensley 29  08  2000 20  32 to  dale surbey  lon  ect 
 ect cc  subject  confidential sorry dale  long day  here are the proposed 
numbers 2 year exec o 62  000 basic  currently o 55 k  ol 0 k each year kickers 
 50  000 worth of options to vest 1  3 1  3 1  3 let me know what you think  
regards sophie
+1      Subject west power trading administrative assistant opening a position 
has become available as an administrative assistant working in west power 
trading reporting to debra davidson  you will be responsible for the following 
complex administrative duties   compose memos  reports and other correspondence 
from a brief outline   sketchy  draft or verbal instruction   greet external 
clients   code invoices  process complex expense reports  and manage employee 
ril 23  2001  if you have any questions  please feel free to see amy or debra

Added: nutch/trunk/conf/naivebayes-wordlist.txt.template
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/naivebayes-wordlist.txt.template?rev=1688549&view=auto
==============================================================================
--- nutch/trunk/conf/naivebayes-wordlist.txt.template (added)
+++ nutch/trunk/conf/naivebayes-wordlist.txt.template Wed Jul  1 04:22:01 2015
@@ -0,0 +1,4 @@
+nutch
+funny
+happy
+search
\ No newline at end of file

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1688549&r1=1688548&r2=1688549&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Jul  1 04:22:01 2015
@@ -1224,7 +1224,7 @@ CAUTION: Set the parser.timeout to -1 or
 
 <property>
   <name>parsefilter.naivebayes.wordlist</name>
-  <value>wordlist.txt</value>
+  <value>naivebayes-wordlist.txt</value>
   <description>Put the name of the file you want to be used as a list of 
   important words to be matched in the url for the model filter. The format should be one word per line.
   </description>

Modified: nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1688549&r1=1688548&r2=1688549&view=diff
==============================================================================
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Wed Jul  1 04:22:01 2015
@@ -140,6 +140,7 @@ plugins.parse=\
 #
 plugins.parsefilter=\
    org.apache.nutch.parse.headings*:\
+   org.apache.nutch.parsefilter.naivebayes*:\
    org.apache.nutch.parse.metatags*
 
 #

Modified: nutch/trunk/ivy/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1688549&r1=1688548&r2=1688549&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Wed Jul  1 04:22:01 2015
@@ -80,6 +80,11 @@
                 <dependency org="com.fasterxml.jackson.dataformat" 
name="jackson-dataformat-cbor" rev="2.5.1" />
                 <dependency org="com.fasterxml.jackson.jaxrs" 
name="jackson-jaxrs-json-provider" rev="2.5.1" />        
 
+                 <dependency org="org.apache.mahout" name="mahout-math" 
rev="0.8" />
+                 <dependency org="org.apache.mahout" name="mahout-core" 
rev="0.8" />
+                 <dependency org="org.apache.lucene" name="lucene-core" 
rev="4.3.0" />
+                 <dependency org="org.apache.lucene" 
name="lucene-analyzers-common" rev="4.3.0" />
+              
                <!--Configuration: test -->
 
                <!--artifacts needed for testing -->
@@ -104,6 +109,8 @@
                <exclude module="jmxtools" />
                <exclude module="jms" />
                <exclude module="jmxri" />
+                <exclude org="com.thoughtworks.xstream"/>
+                <exclude org="org.apache.mrunit"/>
 
        </dependencies>
 

Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml?rev=1688549&r1=1688548&r2=1688549&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml Wed Jul  1 04:22:01 2015
@@ -36,12 +36,6 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.mahout" name="mahout-math" rev="0.8" />
-                <dependency org="org.apache.mahout" name="mahout-core" 
rev="0.8" />
-                <dependency org="org.apache.lucene" name="lucene-core" 
rev="4.3.0" />
-                <dependency org="org.apache.lucene" 
name="lucene-analyzers-common" rev="4.3.0" />
-     <exclude org="com.thoughtworks.xstream"/>
-    <exclude org="org.apache.mrunit"/>           
   </dependencies>
   
 </ivy-module>


Reply via email to