[ 
https://issues.apache.org/jira/browse/NUTCH-1472?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13465617#comment-13465617
 ] 

zhaixuepan commented on NUTCH-1472:
-----------------------------------

Hi Lewis. The configuration and environment are as follows:
gora.properties
gora.cassandrastore.servers=localhost:9160

nutch-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
        <property>
                <name>http.agent.name</name>
                <value>zhaitest</value>
                <description>HTTP ‘User-Agent’ request header. </description>
        </property>
        <property>
                <name>http.agent.description</name>
                <value>zhaitest</value>
                <description>Further description of our bot- this text is used 
in the
                        User-Agent header.
                </description>
        </property>
        <property>
                <name>http.agent.url</name>
                <value>http://www.zhaixuepan.cn</value>
                <description>A URL to advertise in the User-Agent header.
                </description>
        </property>
        <property>
                <name>http.agent.email</name>
                <value>[email protected]</value>
                <description>An email address to advertise in the HTTP ‘From’ request
                        header and User-Agent header.
                </description>
        </property>
        <property>
                <name>plugin.folders</name>
                <value>./src/plugin</value>
                <description>Directories where nutch plugins are located. Each
                        element
                        may be a relative or absolute path. If absolute, it is 
used
                        as is. If
                        relative, it is searched for on the classpath.
                </description>
        </property>
        <property>
                <name>storage.data.store.class</name>
                <value>org.apache.gora.cassandra.store.CassandraStore</value>
                <description>data
                </description>
        </property>

        <property>
                <name>parser.character.encoding.default</name>
                <value>utf-8</value>
                <description>The character encoding to fall back to when no 
other
                        information
                        is available
                </description>
        </property>
        <property>
                <name>encodingdetector.charset.min.confidence</name>
                <value>100</value>
                <description>An integer between 0-100 indicating minimum confidence
                        value
                        for charset auto-detection. Any negative value disables
                        auto-detection.
                </description>
        </property>

</configuration>


run   config: org.apache.nutch.crawl.Crawler urls -solr http://localhost:8080/solr/ -depth 1 -topN 5
vm arguments: -Dhadoop.log.dir=logs -Dhadoop.log.file=hadoop.log -Xms64m -Xmx512m
urls  config: http://auto.qq.com/

pom.xml
<?xml version="1.0"?>
<!-- Licensed to the Apache Software Foundation (ASF) under one or more 
contributor 
        license agreements. See the NOTICE file distributed with this work for 
additional 
        information regarding copyright ownership. The ASF licenses this file 
to 
        You under the Apache License, Version 2.0 (the "License"); you may not 
use 
        this file except in compliance with the License. You may obtain a copy 
of 
        the License at http://www.apache.org/licenses/LICENSE-2.0 Unless 
required 
        by applicable law or agreed to in writing, software distributed under 
the 
        License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
CONDITIONS 
        OF ANY KIND, either express or implied. See the License for the 
specific 
        language governing permissions and limitations under the License. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

        <modelVersion>4.0.0</modelVersion>
        <groupId>org.apache.nutch</groupId>
        <artifactId>nutch</artifactId>
        <packaging>jar</packaging>
        <version>2.0</version>
        <name>Apache Nutch</name>
        <url>http://nutch.apache.org</url>
        <licenses>
                <license>
                        <name>The Apache Software License, Version 2.0</name>
                        
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
                        <distribution>repo</distribution>
                </license>
        </licenses>
        <scm>
                <url>http://svn.apache.org/viewvc/nutch</url>
                <connection>http://svn.apache.org/viewvc/nutch</connection>
        </scm>
        <developers>
                <developer>
                        <id>ab</id>
                        <name>Andrzej Bialecki</name>
                        <email>[email protected]</email>
                </developer>
                <developer>
                        <id>mattmann</id>
                        <name>Chris A. Mattmann</name>
                        <email>[email protected]</email>
                </developer>
                <developer>
                        <id>kubes</id>
                        <name>Dennis Kubes</name>
                        <email>[email protected]</email>
                </developer>
                <developer>
                        <id>dogacan</id>
                        <name>Dogacan Güney</name>
                        <email>[email protected]</email>
                </developer>
                <developer>
                        <id>jnioche</id>
                        <name>Julien Nioche</name>
                        <email>[email protected]</email>
                </developer>
                <developer>
                        <id>siren</id>
                        <name>Sami Siren</name>
                        <email>[email protected]</email>
                </developer>
                <developer>
                        <id>markus</id>
                        <name>Markus Jelsma</name>
                        <email>[email protected]</email>
                </developer>
                <developer>
                        <id>alexis</id>
                        <name>Alexis Detlegrode</name>
                        <email>[email protected]</email>
                </developer>
                <developer>
                        <id>lewismc</id>
                        <name>Lewis John McGibbney</name>
                        <email>[email protected]</email>
                </developer>
        </developers>
        <!-- add -->
        <repositories>
                <repository>
                        <id>maven-restlet</id>
                        <name>Public online Restlet repository</name>
                        <url>http://maven.restlet.org</url>
                </repository>
                <repository>
                        <id>sonatype</id>
                        <name>Sonatype Groups</name>
                        
<url>https://oss.sonatype.org/content/groups/public/</url>
                </repository>
        </repositories>
        <!-- add end -->
        <build>
                <sourceDirectory>src/java</sourceDirectory>
                <testSourceDirectory>src/test</testSourceDirectory>
                <testResources>
                        <testResource>
                                <directory>src/testresources</directory>
                        </testResource>
                        <testResource>
                                <directory>src/testprocess</directory>
                        </testResource>
                </testResources>
                <pluginManagement>
                        <plugins>
                                <plugin>
                                        
<groupId>org.apache.maven.plugins</groupId>
                                        
<artifactId>maven-compiler-plugin</artifactId>
                                        <configuration>
                                                <source>1.6</source>
                                                <target>1.6</target>
                                        </configuration>
                                </plugin>
                        </plugins>
                </pluginManagement>
        </build>
        <dependencies>
                <!-- add -->
                <dependency>
                        <groupId>mysql</groupId>
                        <artifactId>mysql-connector-java</artifactId>
                        <version>5.1.17</version>
                </dependency>
                <dependency>
                        <groupId>org.elasticsearch</groupId>
                        <artifactId>elasticsearch</artifactId>
                        <version>0.19.9</version>
                </dependency>
                <dependency>
                        <groupId>net.sourceforge.nekohtml</groupId>
                        <artifactId>nekohtml</artifactId>
                        <version>1.9.15</version>
                </dependency>
                <dependency>
                        <groupId>org.ccil.cowan.tagsoup</groupId>
                        <artifactId>tagsoup</artifactId>
                        <version>1.2</version>
                </dependency>
                <dependency>
                        <groupId>org.hsqldb</groupId>
                        <artifactId>hsqldb</artifactId>
                        <version>2.2.8</version>
                </dependency>
                <dependency>
                        <groupId>org.apache.gora</groupId>
                        <artifactId>gora-cassandra</artifactId>
                        <version>0.2.1</version>
                        <exclusions>
                                <exclusion>
                                        <artifactId>avro</artifactId>
                                        <groupId>org.apache.hadoop</groupId>
                                </exclusion>
                                <exclusion>
                                        <artifactId>avro</artifactId>
                                        
<groupId>org.apache.cassandra.deps</groupId>
                                </exclusion>
                        </exclusions>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.apache.avro</groupId>
                        <artifactId>avro</artifactId>
                        <version>1.4.0</version>
                </dependency>
                <!-- add end -->
                <dependency>
                        <groupId>org.apache.solr</groupId>
                        <artifactId>solr-solrj</artifactId>
                        <version>3.4.0</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.slf4j</groupId>
                        <artifactId>slf4j-log4j12</artifactId>
                        <version>1.6.1</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>commons-lang</groupId>
                        <artifactId>commons-lang</artifactId>
                        <version>2.4</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>commons-collections</groupId>
                        <artifactId>commons-collections</artifactId>
                        <version>3.1</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>commons-httpclient</groupId>
                        <artifactId>commons-httpclient</artifactId>
                        <version>3.1</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>commons-codec</groupId>
                        <artifactId>commons-codec</artifactId>
                        <version>1.3</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.apache.hadoop</groupId>
                        <artifactId>hadoop-core</artifactId>
                        <version>1.0.3</version>
                        <exclusions>
                                <exclusion>
                                        <artifactId>hsqldb</artifactId>
                                        <groupId>hsqldb</groupId>
                                </exclusion>
                                <exclusion>
                                        <artifactId>ant</artifactId>
                                        <groupId>ant</groupId>
                                </exclusion>
                        </exclusions>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>com.ibm.icu</groupId>
                        <artifactId>icu4j</artifactId>
                        <version>4.0.1</version>
                        <scope>compile</scope>
                </dependency>
                <dependency>
                        <groupId>org.apache.tika</groupId>
                        <artifactId>tika-core</artifactId>
                        <version>1.1</version>
                        <scope>compile</scope>
                </dependency>
                <dependency>
                        <groupId>org.apache.gora</groupId>
                        <artifactId>gora-core</artifactId>
                        <!-- <version>0.2</version> -->
                        <version>0.2.1</version>
                        <exclusions>
                                <exclusion>
                                        
<artifactId>jackson-mapper-asl</artifactId>
                                        <groupId>org.codehaus.jackson</groupId>
                                </exclusion>
                                <exclusion>
                                        <artifactId>avro</artifactId>
                                        <groupId>org.apache.hadoop</groupId>
                                </exclusion>
                        </exclusions>
                        <optional>true</optional>
                </dependency>

                <dependency>
                        <groupId>log4j</groupId>
                        <artifactId>log4j</artifactId>
                        <!-- <version>1.2.15</version> -->
                        <version>1.2.16</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>xerces</groupId>
                        <artifactId>xercesImpl</artifactId>
                        <version>2.9.1</version>
                        <scope>compile</scope>
                </dependency>
                <dependency>
                        <groupId>xerces</groupId>
                        <artifactId>xmlParserAPIs</artifactId>
                        <version>2.6.2</version>
                        <scope>compile</scope>
                </dependency>
                <dependency>
                        <groupId>oro</groupId>
                        <artifactId>oro</artifactId>
                        <version>2.0.8</version>
                        <scope>compile</scope>
                </dependency>
                <dependency>
                        <groupId>org.jdom</groupId>
                        <artifactId>jdom</artifactId>
                        <version>1.1</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>com.google.guava</groupId>
                        <artifactId>guava</artifactId>
                        <version>11.0.2</version>
                        <scope>compile</scope>
                </dependency>
                <dependency>
                        <groupId>junit</groupId>
                        <artifactId>junit</artifactId>
                        <!-- <version>3.8.1</version> -->
                        <version>4.8.2</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.apache.hadoop</groupId>
                        <artifactId>hadoop-test</artifactId>
                        <version>1.0.3</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.mortbay.jetty</groupId>
                        <artifactId>jetty</artifactId>
                        <version>6.1.26</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.mortbay.jetty</groupId>
                        <artifactId>jetty-util</artifactId>
                        <version>6.1.26</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.mortbay.jetty</groupId>
                        <artifactId>jetty-client</artifactId>
                        <version>6.1.26</version>
                        <scope>compile</scope>
                </dependency>
                <dependency>
                        <groupId>org.jdom</groupId>
                        <artifactId>jdom</artifactId>
                        <version>1.1</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.apache.gora</groupId>
                        <artifactId>gora-sql</artifactId>
                        <version>0.1.1-incubating</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.restlet.jse</groupId>
                        <artifactId>org.restlet</artifactId>
                        <version>2.0.5</version>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.restlet.jse</groupId>
                        <artifactId>org.restlet.ext.jackson</artifactId>
                        <version>2.0.5</version>
                        <exclusions>
                                <exclusion>
                                        
<artifactId>jackson-mapper-asl</artifactId>
                                        <groupId>org.codehaus.jackson</groupId>
                                </exclusion>
                        </exclusions>
                        <optional>true</optional>
                </dependency>
                <dependency>
                        <groupId>org.codehaus.jackson</groupId>
                        <artifactId>jackson-mapper-asl</artifactId>
                        <version>1.4.3</version>
                        <type>jar</type>
                        <scope>compile</scope>
                </dependency>

                <dependency>
                        <groupId>com.oracle</groupId>
                        <artifactId>ojdbc14</artifactId>
                        <version>10.2.0.4.0</version>
                        <scope>compile</scope>
                </dependency>
                <dependency>
                        <groupId>org.codehaus.jackson</groupId>
                        <artifactId>jackson-core-asl</artifactId>
                        <version>1.3.4</version>
                        <type>jar</type>
                        <scope>compile</scope>
                </dependency>
        </dependencies>
</project>



                
>  InvalidRequestException(why:(String didn't validate.) [webpage][f][ts] 
> failed validation)
> ------------------------------------------------------------------------------------------
>
>                 Key: NUTCH-1472
>                 URL: https://issues.apache.org/jira/browse/NUTCH-1472
>             Project: Nutch
>          Issue Type: Bug
>    Affects Versions: 2.1
>            Reporter: zhaixuepan
>
> me.prettyprint.hector.api.exceptions.HInvalidRequestException: 
> InvalidRequestException(why:(String didn't validate.) [webpage][f][ts] failed 
> validation)
>       at 
> me.prettyprint.cassandra.service.ExceptionsTranslatorImpl.translate(ExceptionsTranslatorImpl.java:45)
>       at 
> me.prettyprint.cassandra.connection.HConnectionManager.operateWithFailover(HConnectionManager.java:264)
>       at 
> me.prettyprint.cassandra.model.ExecutingKeyspace.doExecuteOperation(ExecutingKeyspace.java:97)
>       at 
> me.prettyprint.cassandra.model.MutatorImpl.execute(MutatorImpl.java:243)
>       at 
> me.prettyprint.cassandra.model.MutatorImpl.insert(MutatorImpl.java:69)
>       at 
> org.apache.gora.cassandra.store.HectorUtils.insertColumn(HectorUtils.java:47)
>       at 
> org.apache.gora.cassandra.store.CassandraClient.addColumn(CassandraClient.java:169)
>       at 
> org.apache.gora.cassandra.store.CassandraStore.addOrUpdateField(CassandraStore.java:341)
>       at 
> org.apache.gora.cassandra.store.CassandraStore.flush(CassandraStore.java:228)
>       at 
> org.apache.gora.cassandra.store.CassandraStore.close(CassandraStore.java:95)
>       at 
> org.apache.gora.mapreduce.GoraRecordWriter.close(GoraRecordWriter.java:55)
>       at 
> org.apache.hadoop.mapred.MapTask$NewDirectOutputCollector.close(MapTask.java:651)
>       at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:766)
>       at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
>       at 
> org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:212)
> Caused by: InvalidRequestException(why:(String didn't validate.) 
> [webpage][f][ts] failed validation)
>       at 
> org.apache.cassandra.thrift.Cassandra$batch_mutate_result.read(Cassandra.java:20253)
>       at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:78)
>       at 
> org.apache.cassandra.thrift.Cassandra$Client.recv_batch_mutate(Cassandra.java:922)
>       at 
> org.apache.cassandra.thrift.Cassandra$Client.batch_mutate(Cassandra.java:908)
>       at 
> me.prettyprint.cassandra.model.MutatorImpl$3.execute(MutatorImpl.java:246)
>       at 
> me.prettyprint.cassandra.model.MutatorImpl$3.execute(MutatorImpl.java:243)
>       at 
> me.prettyprint.cassandra.service.Operation.executeAndSetResult(Operation.java:103)
>       at 
> me.prettyprint.cassandra.connection.HConnectionManager.operateWithFailover(HConnectionManager.java:258)
>       ... 13 more

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators.
For more information on JIRA, see: http://www.atlassian.com/software/jira

Reply via email to