I would like to modify the simple word count program so that it produces a text
file from given HTML files, by extracting only the text content between <title>
and </title> and between <text> and </text>. When I try to modify the map and
reduce tasks accordingly, it seems that I cannot overwrite IntWritable.
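To make the goal concrete, here is a made-up sample (the file name and contents
are invented, and I use the uppercase tags my code actually searches for). An
input file like

    doc01.html:
    <TITLE>Hadoop basics</TITLE>
    <TEXT>MapReduce splits work across machines.</TEXT>

should produce one output line keyed by the file name:

    doc01.html    Hadoop basics MapReduce splits work across machines.

The error I get is: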
10/10/16 09:07:18 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId=
10/10/16 09:07:18 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
10/10/16 09:07:18 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
10/10/16 09:07:18 INFO mapred.FileInputFormat: Total input paths to process : 20
10/10/16 09:07:19 INFO mapred.JobClient: Running job: job_local_0001
10/10/16 09:07:19 INFO mapred.FileInputFormat: Total input paths to process : 20
10/10/16 09:07:19 INFO mapred.MapTask: numReduceTasks: 1
10/10/16 09:07:19 INFO mapred.MapTask: io.sort.mb = 100
10/10/16 09:07:19 INFO mapred.MapTask: data buffer = 79691776/99614720
10/10/16 09:07:19 INFO mapred.MapTask: record buffer = 262144/327680
10/10/16 09:07:19 INFO mapred.MapTask: Starting flush of map output
10/10/16 09:07:20 INFO mapred.JobClient: map 0% reduce 0%
10/10/16 09:07:21 WARN mapred.LocalJobRunner: job_local_0001
java.lang.ClassCastException: org.apache.hadoop.io.Text cannot be cast to org.apache.hadoop.io.IntWritable   <<<--------------------
    at WordProcess$Reduce.reduce(WordProcess.java:44)
    at WordProcess$Reduce.reduce(WordProcess.java:1)
    at org.apache.hadoop.mapred.Task$OldCombinerRunner.combine(Task.java:1151)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1265)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:1129)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:359)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:307)
    at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:177)
10/10/16 09:07:22 INFO mapred.JobClient: Job complete: job_local_0001
10/10/16 09:07:22 INFO mapred.JobClient: Counters: 0
Exception in thread "main" java.io.IOException: Job failed!   <--------------------------------------
    at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1252)
    at WordProcess.main(WordProcess.java:88)
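If I read the trace right, the combine step is feeding my reduce method the Text
values my mapper emits, and the compiler-generated cast to IntWritable fails. Just
to illustrate my guess, here is the same cast written out explicitly, as a
standalone throwaway snippet that has nothing to do with the job itself:

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;

    public class CastCheck {
        public static void main(String[] args) {
            Object o = new Text("hello");        // what the mapper actually emits
            IntWritable i = (IntWritable) o;     // throws the same ClassCastException
            System.out.println(i);
        }
    }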
My code is:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class WordProcess {

    // Emits (file name, line) so that the reducer sees all lines of one file together.
    public static class Map extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {
        private Text id = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            // Key each line by the name of the file it came from.
            FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
            String fileName = fileSplit.getPath().getName();
            id.set(fileName);
            output.collect(id, value);
        }
    }

    // Note: this still declares IntWritable values, although the mapper emits Text.
    public static class Reduce extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, Text> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            String str = "";
            String substr1, substr2;
            Text text = new Text();
            // Concatenate all lines of the file into one string.
            while (values.hasNext()) {
                String s = values.next().toString();  // the cast to IntWritable that fails in the trace happens here
                str = str.concat(s);
            }
            // Locate the tags and extract the content between them
            // (no guard yet for files where a tag is missing).
            int x1 = str.indexOf("<TITLE>");
            int y1 = str.indexOf("</TITLE>");
            substr1 = str.substring(x1 + 7, y1);      // skip past "<TITLE>" (7 chars)
            int x2 = str.indexOf("<TEXT>");
            int y2 = str.indexOf("</TEXT>");
            substr2 = str.substring(x2 + 6, y2);      // skip past "<TEXT>" (6 chars)
            str = substr1 + " " + substr2;
            text.set(str);
            output.collect(key, text);
            System.out.println(key + "," + text);
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordProcess.class);
        conf.setJobName("wordprocess");
        conf.setOutputKeyClass(Text.class);
        // conf.setOutputValueClass(IntWritable.class);   // the original word count setting
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        // Delete the output directory if it already exists.
        FileSystem.get(conf).delete(new Path(args[1]), true);
        JobClient.runJob(conf);
    }
}
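In case it helps anyone answer: my current guess (unverified) is that, since the
mapper now emits Text values, the reducer has to declare Text as its input value
type too, roughly like this:

    public static class Reduce extends MapReduceBase
            implements Reducer<Text, Text, Text, Text> {   // Text values, matching the mapper
        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            // Concatenate all lines of the file, with no IntWritable cast anywhere.
            StringBuilder buf = new StringBuilder();
            while (values.hasNext()) {
                buf.append(values.next().toString());
            }
            String str = buf.toString();
            // Same tag extraction as in my version above.
            int x1 = str.indexOf("<TITLE>"), y1 = str.indexOf("</TITLE>");
            int x2 = str.indexOf("<TEXT>"), y2 = str.indexOf("</TEXT>");
            output.collect(key, new Text(str.substring(x1 + 7, y1) + " " + str.substring(x2 + 6, y2)));
        }
    }

I also suspect I should drop conf.setCombinerClass(Reduce.class) from main(),
since the trace shows the cast failing inside the combine step, and this reduce
logic only makes sense once the reducer sees every line of a file at once. Is
that the right direction?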
Does anyone have experience with this problem? Please tell me how to fix it.
Thanks in advance,
Best regards,
Tri Doan
1429 Laramie Apt 3, Manhattan
KS 66502
USA