[
https://issues.apache.org/jira/browse/NUTCH-1472?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13465617#comment-13465617
]
zhaixuepan commented on NUTCH-1472:
-----------------------------------
Hi Lewis. The configuration and environment are as follows:
gora.properties
gora.cassandrastore.servers=localhost:9160
nutch-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>http.agent.name</name>
<value>zhaitest</value>
<description>HTTP ‘User-Agent’ request header. </description>
</property>
<property>
<name>http.agent.description</name>
<value>zhaitest</value>
<description>Further description of our bot- this text is used
in the
User-Agent header.
</description>
</property>
<property>
<name>http.agent.url</name>
<value>http://www.zhaixuepan.cn</value>
<description>A URL to advertise in the User-Agent header.
</description>
</property>
<property>
<name>http.agent.email</name>
<value>[email protected]</value>
<description>An email address to advertise in the HTTP ‘From’
request header and User-Agent header.
</description>
</property>
<property>
<name>plugin.folders</name>
<value>./src/plugin</value>
<description>Directories where nutch plugins are located. Each
element
may be a relative or absolute path. If absolute, it is
used
as is. If
relative, it is searched for on the classpath.
</description>
</property>
<property>
<name>storage.data.store.class</name>
<value>org.apache.gora.cassandra.store.CassandraStore</value>
<description>data
</description>
</property>
<property>
<name>parser.character.encoding.default</name>
<value>utf-8</value>
<description>The character encoding to fall back to when no
other
information
is available
</description>
</property>
<property>
<name>encodingdetector.charset.min.confidence</name>
<value>100</value>
<description>An integer between 0-100 indicating minimum
confidence
value
for charset auto-detection. Any negative value disables
auto-detection.
</description>
</property>
</configuration>
Run config: org.apache.nutch.crawl.Crawler urls -solr http://localhost:8080/solr/ -depth 1 -topN 5
VM arguments: -Dhadoop.log.dir=logs -Dhadoop.log.file=hadoop.log -Xms64m -Xmx512m
URLs config (seed list): http://auto.qq.com/
pom.xml
<?xml version="1.0"?>
<!-- Licensed to the Apache Software Foundation (ASF) under one or more
contributor
license agreements. See the NOTICE file distributed with this work for
additional
information regarding copyright ownership. The ASF licenses this file
to
You under the Apache License, Version 2.0 (the "License"); you may not
use
this file except in compliance with the License. You may obtain a copy
of
the License at http://www.apache.org/licenses/LICENSE-2.0 Unless
required
by applicable law or agreed to in writing, software distributed under
the
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS
OF ANY KIND, either express or implied. See the License for the
specific
language governing permissions and limitations under the License. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.apache.nutch</groupId>
<artifactId>nutch</artifactId>
<packaging>jar</packaging>
<version>2.0</version>
<name>Apache Nutch</name>
<url>http://nutch.apache.org</url>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
</license>
</licenses>
<scm>
<url>http://svn.apache.org/viewvc/nutch</url>
<connection>http://svn.apache.org/viewvc/nutch</connection>
</scm>
<developers>
<developer>
<id>ab</id>
<name>Andrzej Bialecki</name>
<email>[email protected]</email>
</developer>
<developer>
<id>mattmann</id>
<name>Chris A. Mattmann</name>
<email>[email protected]</email>
</developer>
<developer>
<id>kubes</id>
<name>Dennis Kubes</name>
<email>[email protected]</email>
</developer>
<developer>
<id>dogacan</id>
<name>Dogacan Güney</name>
<email>[email protected]</email>
</developer>
<developer>
<id>jnioche</id>
<name>Julien Nioche</name>
<email>[email protected]</email>
</developer>
<developer>
<id>siren</id>
<name>Sami Siren</name>
<email>[email protected]</email>
</developer>
<developer>
<id>markus</id>
<name>Markus Jelsma</name>
<email>[email protected]</email>
</developer>
<developer>
<id>alexis</id>
<name>Alexis Detlegrode</name>
<email>[email protected]</email>
</developer>
<developer>
<id>lewismc</id>
<name>Lewis John McGibbney</name>
<email>[email protected]</email>
</developer>
</developers>
<!-- add -->
<repositories>
<repository>
<id>maven-restlet</id>
<name>Public online Restlet repository</name>
<url>http://maven.restlet.org</url>
</repository>
<repository>
<id>sonatype</id>
<name>Sonatype Groups</name>
<url>https://oss.sonatype.org/content/groups/public/</url>
</repository>
</repositories>
<!-- add end -->
<build>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<testResources>
<testResource>
<directory>src/testresources</directory>
</testResource>
<testResource>
<directory>src/testprocess</directory>
</testResource>
</testResources>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
<dependencies>
<!-- add -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.17</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>0.19.9</version>
</dependency>
<dependency>
<groupId>net.sourceforge.nekohtml</groupId>
<artifactId>nekohtml</artifactId>
<version>1.9.15</version>
</dependency>
<dependency>
<groupId>org.ccil.cowan.tagsoup</groupId>
<artifactId>tagsoup</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>org.hsqldb</groupId>
<artifactId>hsqldb</artifactId>
<version>2.2.8</version>
</dependency>
<dependency>
<groupId>org.apache.gora</groupId>
<artifactId>gora-cassandra</artifactId>
<version>0.2.1</version>
<exclusions>
<exclusion>
<artifactId>avro</artifactId>
<groupId>org.apache.hadoop</groupId>
</exclusion>
<exclusion>
<artifactId>avro</artifactId>
<groupId>org.apache.cassandra.deps</groupId>
</exclusion>
</exclusions>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<version>1.4.0</version>
</dependency>
<!-- add end -->
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
<version>3.4.0</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.6.1</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.4</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.1</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.3</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>1.0.3</version>
<exclusions>
<exclusion>
<artifactId>hsqldb</artifactId>
<groupId>hsqldb</groupId>
</exclusion>
<exclusion>
<artifactId>ant</artifactId>
<groupId>ant</groupId>
</exclusion>
</exclusions>
<optional>true</optional>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>4.0.1</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.1</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.gora</groupId>
<artifactId>gora-core</artifactId>
<!-- <version>0.2</version> -->
<version>0.2.1</version>
<exclusions>
<exclusion>
<artifactId>jackson-mapper-asl</artifactId>
<groupId>org.codehaus.jackson</groupId>
</exclusion>
<exclusion>
<artifactId>avro</artifactId>
<groupId>org.apache.hadoop</groupId>
</exclusion>
</exclusions>
<optional>true</optional>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<!-- <version>1.2.15</version> -->
<version>1.2.16</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
<version>2.9.1</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>xerces</groupId>
<artifactId>xmlParserAPIs</artifactId>
<version>2.6.2</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>oro</groupId>
<artifactId>oro</artifactId>
<version>2.0.8</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.jdom</groupId>
<artifactId>jdom</artifactId>
<version>1.1</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>11.0.2</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<!-- <version>3.8.1</version> -->
<version>4.8.2</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-test</artifactId>
<version>1.0.3</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty</artifactId>
<version>6.1.26</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty-util</artifactId>
<version>6.1.26</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty-client</artifactId>
<version>6.1.26</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.jdom</groupId>
<artifactId>jdom</artifactId>
<version>1.1</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.apache.gora</groupId>
<artifactId>gora-sql</artifactId>
<version>0.1.1-incubating</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.restlet.jse</groupId>
<artifactId>org.restlet</artifactId>
<version>2.0.5</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.restlet.jse</groupId>
<artifactId>org.restlet.ext.jackson</artifactId>
<version>2.0.5</version>
<exclusions>
<exclusion>
<artifactId>jackson-mapper-asl</artifactId>
<groupId>org.codehaus.jackson</groupId>
</exclusion>
</exclusions>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
<version>1.4.3</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.oracle</groupId>
<artifactId>ojdbc14</artifactId>
<version>10.2.0.4.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-core-asl</artifactId>
<version>1.3.4</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
</dependencies>
</project>
> InvalidRequestException(why:(String didn't validate.) [webpage][f][ts]
> failed validation)
> ------------------------------------------------------------------------------------------
>
> Key: NUTCH-1472
> URL: https://issues.apache.org/jira/browse/NUTCH-1472
> Project: Nutch
> Issue Type: Bug
> Affects Versions: 2.1
> Reporter: zhaixuepan
>
> me.prettyprint.hector.api.exceptions.HInvalidRequestException:
> InvalidRequestException(why:(String didn't validate.) [webpage][f][ts] failed
> validation)
> at
> me.prettyprint.cassandra.service.ExceptionsTranslatorImpl.translate(ExceptionsTranslatorImpl.java:45)
> at
> me.prettyprint.cassandra.connection.HConnectionManager.operateWithFailover(HConnectionManager.java:264)
> at
> me.prettyprint.cassandra.model.ExecutingKeyspace.doExecuteOperation(ExecutingKeyspace.java:97)
> at
> me.prettyprint.cassandra.model.MutatorImpl.execute(MutatorImpl.java:243)
> at
> me.prettyprint.cassandra.model.MutatorImpl.insert(MutatorImpl.java:69)
> at
> org.apache.gora.cassandra.store.HectorUtils.insertColumn(HectorUtils.java:47)
> at
> org.apache.gora.cassandra.store.CassandraClient.addColumn(CassandraClient.java:169)
> at
> org.apache.gora.cassandra.store.CassandraStore.addOrUpdateField(CassandraStore.java:341)
> at
> org.apache.gora.cassandra.store.CassandraStore.flush(CassandraStore.java:228)
> at
> org.apache.gora.cassandra.store.CassandraStore.close(CassandraStore.java:95)
> at
> org.apache.gora.mapreduce.GoraRecordWriter.close(GoraRecordWriter.java:55)
> at
> org.apache.hadoop.mapred.MapTask$NewDirectOutputCollector.close(MapTask.java:651)
> at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:766)
> at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
> at
> org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:212)
> Caused by: InvalidRequestException(why:(String didn't validate.)
> [webpage][f][ts] failed validation)
> at
> org.apache.cassandra.thrift.Cassandra$batch_mutate_result.read(Cassandra.java:20253)
> at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:78)
> at
> org.apache.cassandra.thrift.Cassandra$Client.recv_batch_mutate(Cassandra.java:922)
> at
> org.apache.cassandra.thrift.Cassandra$Client.batch_mutate(Cassandra.java:908)
> at
> me.prettyprint.cassandra.model.MutatorImpl$3.execute(MutatorImpl.java:246)
> at
> me.prettyprint.cassandra.model.MutatorImpl$3.execute(MutatorImpl.java:243)
> at
> me.prettyprint.cassandra.service.Operation.executeAndSetResult(Operation.java:103)
> at
> me.prettyprint.cassandra.connection.HConnectionManager.operateWithFailover(HConnectionManager.java:258)
> ... 13 more
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators.
For more information on JIRA, see: http://www.atlassian.com/software/jira