This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a change to branch 2.x
in repository https://gitbox.apache.org/repos/asf/nutch.git.
from 365077c Merge branch 'kaidul-NUTCH-2393' into 2.x
add f41735c NUTCH-2389 jsoup-extractor with parse filter, indexing filter
and unit testing implemented
add fe6997f NUTCH-2389 jsoup-extractor/ivy.xml commited
add 17bd8f6 NUTCH-2389 Unit test implemented but not passed
add 52e785d NUTCH-2389 package name changed
add 82ff292 NUTCH-2389 JsoupDocumentReader parsing bug fixed
add 39ef777 NUTCH-2389 Unit test passed, xml parsing issue fixed
add 66cbd7f NUTCH-2389 IndexingFilter Utf8 conversion issue solved
add 60abbc4 NUTCH-2389 Class name corrected in jsoup-extractor.xml
add 6e11dee NUTCH-2389 Unnecessary header import removed
add 1ede445 NUTCH-2389 Missing license information added
add b2130b4 NUTCH-2389 Diamond sign declaration removed and make Java 1.6
compatible
new 5f6c383 Merge pull request #192 from kaidul/NUTCH-2389
The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
build.xml | 5 +
conf/jsoup-extractor-example.xml | 88 ++++++++++
conf/jsoup-extractor.xml | 53 ++++++
conf/nutch-default.xml | 9 ++
src/plugin/build.xml | 3 +
.../plugin/jsoup-extractor/build.xml | 20 +--
.../{index-metadata => jsoup-extractor}/ivy.xml | 3 +-
src/plugin/jsoup-extractor/plugin.xml | 56 +++++++
.../nutch/core/jsoup/extractor/JsoupDocument.java | 127 +++++++++++++++
.../core/jsoup/extractor/JsoupDocumentReader.java | 179 +++++++++++++++++++++
.../jsoup/extractor/JsoupExtractorConstants.java | 36 +++++
.../jsoup/extractor/normalizer/Normalizable.java} | 8 +-
.../normalizer/SimpleStringNormalizer.java} | 24 ++-
.../jsoup/extractor/normalizer}/package-info.java | 4 +-
.../nutch/core/jsoup/extractor}/package-info.java | 4 +-
.../jsoup/extractor/JsoupIndexingFilter.java} | 68 ++++----
.../indexer/jsoup/extractor}/package-info.java | 4 +-
.../parse/jsoup/extractor/JsoupHtmlParser.java | 118 ++++++++++++++
.../nutch/parse/jsoup/extractor}/package-info.java | 4 +-
.../parse/jsoup/extractor/TestJsoupHtmlParser.java | 102 ++++++++++++
.../jsoup/extractor/ViewCountNormalizer.java} | 21 +--
21 files changed, 856 insertions(+), 80 deletions(-)
create mode 100644 conf/jsoup-extractor-example.xml
create mode 100644 conf/jsoup-extractor.xml
copy ivy/ivy-configurations.xml => src/plugin/jsoup-extractor/build.xml (65%)
copy src/plugin/{index-metadata => jsoup-extractor}/ivy.xml (95%)
create mode 100644 src/plugin/jsoup-extractor/plugin.xml
create mode 100644
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocument.java
create mode 100644
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
create mode 100644
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupExtractorConstants.java
copy src/{java/org/apache/nutch/host/package-info.java =>
plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer/Normalizable.java}
(85%)
copy src/{java/org/apache/nutch/crawl/InjectType.java =>
plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer/SimpleStringNormalizer.java}
(70%)
copy src/{java/org/apache/nutch/host =>
plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer}/package-info.java
(89%)
copy src/{java/org/apache/nutch/api =>
plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor}/package-info.java
(85%)
copy
src/plugin/{tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java =>
jsoup-extractor/src/java/org/apache/nutch/indexer/jsoup/extractor/JsoupIndexingFilter.java}
(55%)
copy src/{java/org/apache/nutch/host =>
plugin/jsoup-extractor/src/java/org/apache/nutch/indexer/jsoup/extractor}/package-info.java
(89%)
create mode 100644
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/JsoupHtmlParser.java
copy src/{java/org/apache/nutch/host =>
plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor}/package-info.java
(87%)
create mode 100644
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java
copy src/{java/org/apache/nutch/crawl/InjectType.java =>
plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/ViewCountNormalizer.java}
(70%)
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].