Re: XML parsing in Hadoop

Adam Kawa Fri, 29 Nov 2013 02:12:58 -0800

Alternatively, you can try an input format called WholeFileInputFormat
(nicely explained in "Hadoop: The Definitive Guide" by Tom White), where
you process a whole file as a single record in one map() call. Refer to the
book for a code example.



2013/11/28 Devaraj K <[email protected]>

> Hi,
>
> Here the map() function is called for every (key, value) pair (i.e. for
> every line of the split in your job, because of TextInputFormat). The XML
> parsing code you have written in map() is therefore executed for every
> line of your input, which is what causes the problem.
>
> You can write a custom InputFormat to read the XML file instead of
> parsing it in map(), or you could place the parsing code in the run()
> method by overriding Mapper.run(Context context).
>
>
> On Thu, Nov 28, 2013 at 12:15 PM, Chhaya Vishwakarma <
> [email protected]> wrote:
>
>>  Hi,
>>
>>
>>
>>
>>
>> The code below parses an XML file. The output of the code is correct,
>> but the job takes a long time to complete.
>>
>> It took 20 hours to parse a 2 MB file.
>>
>> Kindly suggest what changes could be made to improve the performance.
>>
>>
>>
>>
>>
>>
>>
>> package xml;
>>
>>
>>
>> import java.io.FileInputStream;
>>
>> import java.io.FileNotFoundException;
>>
>> import java.io.IOException;
>>
>> import java.util.*;
>>
>>
>>
>> import javax.xml.parsers.DocumentBuilder;
>>
>> import javax.xml.parsers.DocumentBuilderFactory;
>>
>> import javax.xml.parsers.ParserConfigurationException;
>>
>> import javax.xml.xpath.XPath;
>>
>> import javax.xml.xpath.XPathConstants;
>>
>> import javax.xml.xpath.XPathExpressionException;
>>
>> import javax.xml.xpath.XPathFactory;
>>
>>
>>
>> import org.apache.hadoop.fs.FSDataInputStream;
>>
>> import org.apache.hadoop.fs.FSInputStream;
>>
>> import org.apache.hadoop.fs.FileSystem;
>>
>> import org.apache.hadoop.fs.Path;
>>
>>
>>
>> import org.apache.hadoop.conf.*;
>>
>> import org.apache.hadoop.io.*;
>>
>>
>>
>> import org.apache.hadoop.mapred.JobConf;
>>
>> import org.apache.hadoop.mapreduce.*;
>>
>> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>>
>> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>>
>> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>>
>> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>>
>> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>>
>>
>>
>>
>>
>> import org.apache.log4j.Logger;
>>
>> import org.w3c.dom.Document;
>>
>> import org.w3c.dom.Element;
>>
>> import org.w3c.dom.NodeList;
>>
>> import org.xml.sax.SAXException;
>>
>>
>>
>>
>>
>> public class ReadXmlMR
>>
>> {
>> // Hadoop job whose mapper DOM-parses the input split's file and writes (node name, first-child value) pairs.
>>                 static Logger log =
>> Logger.getLogger(ReadXmlMR.class.getName());
>>
>>                  public static String fileName = new String();
>> // Most recently parsed document; assigned inside Map.map().
>>                  public static Document dom;
>> // Copies the "map.input.file" job property into fileName; nothing in this class calls it.
>>                  public void configure(JobConf job) {
>>
>>          fileName = job.get("map.input.file");
>>
>> }
>>
>> // Mapper over (LongWritable, Text) input records. Each map() call ignores its key
>> // and value, re-opens the file behind the current split and parses it in full,
>> // so the complete set of output pairs is written once per input record.
>>
>>                 public static class Map extends
>> Mapper<LongWritable,Text,Text,Text>
>>
>>                {
>>
>>
>>
>>                                 public void map(LongWritable key, Text
>> value,Context context ) throws IOException, InterruptedException
>>
>>                                 {
>>
>>                                                 try {
>>
>>                                                                 FileSplit
>> fileSplit = (FileSplit)context.getInputSplit();
>>
>>
>> Configuration conf = context.getConfiguration();
>>
>>
>>
>>
>> DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>>
>>
>>
>>
>> FSDataInputStream fstream1;
>>
>>                                                                 Path file
>> = fileSplit.getPath();
>>
>>                                                 FileSystem fs =
>> file.getFileSystem(conf);
>>
>>                                                 fstream1 =
>> fs.open(fileSplit.getPath());
>>
>> // Parse the entire file into a DOM tree; fstream1 is never closed in this method.
>> DocumentBuilder db = dbf.newDocumentBuilder();
>>
>>                                                                 dom =
>> db.parse(fstream1);
>> // docEle is assigned below but not read again in this method.
>>                                                                 Element
>> docEle = null;
>>
>>                                                                 docEle =
>> dom.getDocumentElement();
>>
>> // The XPath expression "//*" selects every element node in the document.
>>
>>                                                                 XPath
>> xpath = XPathFactory.newInstance().newXPath();
>>
>>
>>
>>                                                                 Object
>> result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
>>
>>
>>
>>                                                                 NodeList
>> nodes = (NodeList) result;
>>
>>
>> // Iteration starts at index 2, so the first two matched nodes are skipped.
>>
>>
>>                                                                 for (int
>> n = 2; n < nodes.getLength(); n++)
>>
>>
>>
>>                                                                 {
>>
>>
>> Text colvalue=new Text("");
>>
>>
>> Text nodename= new Text("");
>>
>>
>>
>>
>> nodename = new Text(nodes.item(n).getNodeName());
>>
>> // Any exception here (e.g. the node has no child) is swallowed, leaving colvalue empty.
>> try{colvalue = new
>> Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
>>
>> // String.equalsIgnoreCase(null) returns false, so this branch never runs.
>> if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
>>
>>
>> context.write(nodename, colvalue);
>>
>>                                                                 }
>>
>>
>> // Parser, SAX and XPath failures are only printed; map() then returns normally.
>>
>>
>>                                                                 } catch
>> (ParserConfigurationException e) {
>>
>>                                                                 // TODO
>> Auto-generated catch block
>>
>>
>> e.printStackTrace();
>>
>>                                                                 } catch
>> (SAXException e) {
>>
>>                                                                 // TODO
>> Auto-generated catch block
>>
>>
>> e.printStackTrace();
>>
>>
>>
>>                                                                 } catch
>> (XPathExpressionException e) {
>>
>>                                                                 // TODO
>> Auto-generated catch block
>>
>>
>> e.printStackTrace();
>>
>>                                                                 }
>>
>>
>>
>>                                                                 }
>>
>>
>>
>>                                 }
>>
>>
>> // Configures and runs the job: args[0] is the input path and args[1] the
>> // output path; Text is used for both output key and output value.
>>
>>
>>
>>                 public static void main(String[] args) throws Exception
>>
>>
>>
>>                 {
>>
>>
>>
>>                 Configuration conf = new Configuration();
>>
>>
>>
>>         Job job = new Job(conf, "XmlParsing");
>>
>>         job.setJarByClass(ReadXmlMR.class);
>>
>>                 job.setOutputKeyClass(Text.class);
>>
>>                 job.setOutputValueClass(Text.class);
>>
>>
>>
>>
>>
>>                 job.setMapperClass(Map.class);
>>
>>
>> // TextInputFormat delivers one record per input line, so Map.map() runs once per line.
>>
>>
>>                 job.setInputFormatClass(TextInputFormat.class);
>>
>>                 job.setOutputFormatClass(TextOutputFormat.class);
>>
>>
>>
>>                 FileInputFormat.addInputPath(job, new Path(args[0]));
>>
>>                 FileOutputFormat.setOutputPath(job, new Path(args[1]));
>>
>>
>> // NOTE(review): submit() followed by waitForCompletion() looks redundant - confirm.
>>
>>
>>                 job.submit();
>>
>>
>>
>>                 job.waitForCompletion(true);
>>
>>
>>
>>
>>
>>                 }
>>
>>
>>
>> }
>>
>>
>>
>>
>>
>>
>>
>> Regards,
>>
>> Chhaya Vishwakarma
>>
>>
>>
>> ------------------------------
>> The contents of this e-mail and any attachment(s) may contain
>> confidential or privileged information for the intended recipient(s).
>> Unintended recipients are prohibited from taking action on the basis of
>> information in this e-mail and using or disseminating the information, and
>> must notify the sender and delete it from their system. L&T Infotech will
>> not accept responsibility or liability for the accuracy or completeness of,
>> or the presence of any virus or disabling code in this e-mail"
>>
>
>
>
> --
>
>
> Thanks
> Devaraj K
>

Re: XML parsing in Hadoop

Reply via email to