Hi,
The code below parses an XML file. The output of the code is correct, but the
job takes a very long time to complete:
it took 20 hours to parse a 2 MB file.
Kindly suggest what changes could be made to improve the performance.
package xml;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
public class ReadXmlMR
{
static Logger log = Logger.getLogger(ReadXmlMR.class.getName());
public static String fileName = new String();
public static Document dom;
public void configure(JobConf job) {
fileName = job.get("map.input.file");
}
public static class Map extends
Mapper<LongWritable,Text,Text,Text>
{
public void map(LongWritable key, Text
value,Context context ) throws IOException, InterruptedException
{
try {
FileSplit
fileSplit = (FileSplit)context.getInputSplit();
Configuration
conf = context.getConfiguration();
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
FSDataInputStream fstream1;
Path file =
fileSplit.getPath();
FileSystem fs =
file.getFileSystem(conf);
fstream1 =
fs.open(fileSplit.getPath());
DocumentBuilder
db = dbf.newDocumentBuilder();
dom =
db.parse(fstream1);
Element docEle
= null;
docEle =
dom.getDocumentElement();
XPath xpath =
XPathFactory.newInstance().newXPath();
Object result =
xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
NodeList nodes
= (NodeList) result;
for (int n = 2;
n < nodes.getLength(); n++)
{
Text colvalue=new Text("");
Text nodename= new Text("");
nodename = new Text(nodes.item(n).getNodeName());
try{colvalue = new
Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
context.write(nodename, colvalue);
}
} catch
(ParserConfigurationException e) {
// TODO
Auto-generated catch block
e.printStackTrace();
} catch
(SAXException e) {
// TODO
Auto-generated catch block
e.printStackTrace();
} catch
(XPathExpressionException e) {
// TODO
Auto-generated catch block
e.printStackTrace();
}
}
}
public static void main(String[] args) throws Exception
{
Configuration conf = new Configuration();
Job job = new Job(conf, "XmlParsing");
job.setJarByClass(ReadXmlMR.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.submit();
job.waitForCompletion(true);
}
}
Regards,
Chhaya Vishwakarma
________________________________
The contents of this e-mail and any attachment(s) may contain confidential or
privileged information for the intended recipient(s). Unintended recipients are
prohibited from taking action on the basis of information in this e-mail and
using or disseminating the information, and must notify the sender and delete
it from their system. L&T Infotech will not accept responsibility or liability
for the accuracy or completeness of, or the presence of any virus or disabling
code in this e-mail.