Repository: crunch Updated Branches: refs/heads/master 5dccb5330 -> 958d011a4
CRUNCH-491: Add an Xml File Source Project: http://git-wip-us.apache.org/repos/asf/crunch/repo Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/958d011a Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/958d011a Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/958d011a Branch: refs/heads/master Commit: 958d011a4c4c704bd2e12dfbb8f7216342c7fb1f Parents: 5dccb53 Author: tzolov <[email protected]> Authored: Mon Jan 26 01:50:38 2015 +0100 Committer: tzolov <[email protected]> Committed: Wed Feb 4 01:14:15 2015 +0100 ---------------------------------------------------------------------- .../apache/crunch/io/text/xml/XmlSourceIT.java | Bin 0 -> 3325 bytes .../crunch/io/text/xml/XmlInputFormat.java | 193 ++++++++++++ .../apache/crunch/io/text/xml/XmlSource.java | 71 +++++ .../src/main/resources/xmlSourceSample1.xml | 291 +++++++++++++++++++ .../src/main/resources/xmlSourceSample2.xml | Bin 0 -> 248 bytes 5 files changed, 555 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/crunch/blob/958d011a/crunch-core/src/it/java/org/apache/crunch/io/text/xml/XmlSourceIT.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/it/java/org/apache/crunch/io/text/xml/XmlSourceIT.java b/crunch-core/src/it/java/org/apache/crunch/io/text/xml/XmlSourceIT.java new file mode 100644 index 0000000..4b4b9e1 Binary files /dev/null and b/crunch-core/src/it/java/org/apache/crunch/io/text/xml/XmlSourceIT.java differ http://git-wip-us.apache.org/repos/asf/crunch/blob/958d011a/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlInputFormat.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlInputFormat.java b/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlInputFormat.java new file mode 100644 index 0000000..58157fe --- /dev/null +++ b/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlInputFormat.java @@ -0,0 +1,193 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.crunch.io.text.xml; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Charsets; +import com.google.common.primitives.Chars; + +/** + * Reads records that are delimited by a specific begin/end tag. + * + * The {@link XmlInputFormat} extends the Mahout's XmlInputFormat implementation providing encoding support + */ +public class XmlInputFormat extends TextInputFormat { + + private static final Logger log = LoggerFactory.getLogger(XmlInputFormat.class); + + public static final String START_TAG_KEY = "xmlinput.start"; + public static final String END_TAG_KEY = "xmlinput.end"; + public static final String ENCODING = "xml.encoding"; + + @Override + public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) { + try { + return new XmlRecordReader((FileSplit) split, context.getConfiguration()); + } catch (IOException ioe) { + log.warn("Error while creating XmlRecordReader", ioe); + return null; + } + } + + /** + * XMLRecordReader class to read through a given xml document to output xml blocks as records as specified by the + * start tag and end tag + */ + public static class XmlRecordReader extends RecordReader<LongWritable, Text> { + + private static final String DEFAULT_ENCODING = Charsets.UTF_8.name(); + + private final char[] startTag; + private final char[] endTag; + private final long start; + private final long end; + + private LongWritable currentKey; + private Text currentValue; + private final DataOutputBuffer outBuffer; + private final BufferedReader inReader; + private final OutputStreamWriter outWriter; + private final String inputEncoding; + private int readByteCounter = 0; + + public XmlRecordReader(FileSplit split, Configuration conf) throws IOException { + inputEncoding = conf.get(ENCODING, DEFAULT_ENCODING); + startTag = new String(conf.get(START_TAG_KEY).getBytes(inputEncoding), inputEncoding).toCharArray(); + endTag = new String(conf.get(END_TAG_KEY).getBytes(inputEncoding), inputEncoding).toCharArray(); + + // open the file and seek to the start of the split + start = split.getStart(); + end = start + split.getLength(); + Path file = split.getPath(); + FileSystem fs = file.getFileSystem(conf); + FSDataInputStream fsin = fs.open(split.getPath()); + fsin.seek(start); + inReader = new BufferedReader(new InputStreamReader(fsin, Charset.forName(inputEncoding))); + outBuffer = new DataOutputBuffer(); + outWriter = new OutputStreamWriter(outBuffer, inputEncoding); + } + + private boolean next(LongWritable key, Text value) throws IOException { + + if (readByteCounter < end && readUntilMatch(startTag, false)) { + try { + outWriter.write(startTag); + + if (readUntilMatch(endTag, true)) { + key.set(readByteCounter); + outWriter.flush(); + value.set(toUTF8(outBuffer.getData()), 0, outBuffer.getLength()); + return true; + } + } finally { + outWriter.flush(); + outBuffer.reset(); + } + } + return false; + } + + private byte[] toUTF8(byte[] in) throws UnsupportedEncodingException { + return new String(in, inputEncoding).getBytes(Charsets.UTF_8); + } + + @Override + public void close() throws IOException { + inReader.close(); + } + + @Override + public float getProgress() throws IOException { + return (readByteCounter - start) / (float) (end - start); + } + + private boolean readUntilMatch(char[] match, boolean withinBlock) throws IOException { + int i = 0; + while (true) { + int nextInCharacter = inReader.read(); + + readByteCounter = +Chars.toByteArray((char) nextInCharacter).length; + + // end of file: + if (nextInCharacter == -1) { + return false; + } + // save to buffer: + if (withinBlock) { + outWriter.write(nextInCharacter); + } + + // check if we're matching: + if (nextInCharacter == match[i]) { + i++; + if (i >= match.length) { + return true; + } + } else { + i = 0; + } + // see if we've passed the stop point + if (!withinBlock && i == 0 && readByteCounter >= end) { + return false; + } + } + } + + @Override + public LongWritable getCurrentKey() throws IOException, InterruptedException { + return currentKey; + } + + @Override + public Text getCurrentValue() throws IOException, InterruptedException { + return currentValue; + } + + @Override + public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { + } + + @Override + public boolean nextKeyValue() throws IOException, InterruptedException { + currentKey = new LongWritable(); + currentValue = new Text(); + return next(currentKey, currentValue); + } + } +} http://git-wip-us.apache.org/repos/asf/crunch/blob/958d011a/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlSource.java ---------------------------------------------------------------------- diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlSource.java b/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlSource.java new file mode 100644 index 0000000..2e434e7 --- /dev/null +++ b/crunch-core/src/main/java/org/apache/crunch/io/text/xml/XmlSource.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.crunch.io.text.xml; + +import org.apache.crunch.io.FormatBundle; +import org.apache.crunch.io.impl.FileSourceImpl; +import org.apache.crunch.types.writable.Writables; +import org.apache.hadoop.fs.Path; + +import com.google.common.base.Charsets; + +/** + * Large XML documents composed of repetitive XML elements can be broken into chunks delimited by element's start and + * end tag. The {@link XmlSource2} process XML files and extract out the XML between the pre-configured start / end + * tags. Developer should process the content between the tags. + * + * The {@link XmlSource} does not parse the input XML files and is not aware of the XML semantics. It just splits the + * input file in chunks defined by the start/end tags. Nested XML elements are not supported. + */ +public class XmlSource extends FileSourceImpl<String> { + + /** + * Create new XML data loader using the UTF-8 encoding. + * + * @param inputPath + * Input XML file location + * @param tagStart + * Elements's start tag + * @param tagEnd + * Elements's end tag + */ + public XmlSource(String inputPath, String tagStart, String tagEnd) { + this(inputPath, tagStart, tagEnd, Charsets.UTF_8.name()); + } + + /** + * Create new XML data loader using the specified encoding. + * + * @param inputPath + * Input XML file location + * @param tagStart + * Elements's start tag + * @param tagEnd + * Elements's end tag + * @param encoding + * Input file encoding + */ + public XmlSource(String inputPath, String tagStart, String tagEnd, String encoding) { + super(new Path(inputPath), + Writables.strings(), + FormatBundle.forInput(XmlInputFormat.class) + .set(XmlInputFormat.START_TAG_KEY, tagStart) + .set(XmlInputFormat.END_TAG_KEY, tagEnd) + .set(XmlInputFormat.ENCODING, encoding)); + } +} http://git-wip-us.apache.org/repos/asf/crunch/blob/958d011a/crunch-test/src/main/resources/xmlSourceSample1.xml ---------------------------------------------------------------------- diff --git a/crunch-test/src/main/resources/xmlSourceSample1.xml b/crunch-test/src/main/resources/xmlSourceSample1.xml new file mode 100644 index 0000000..8734c1e --- /dev/null +++ b/crunch-test/src/main/resources/xmlSourceSample1.xml @@ -0,0 +1,291 @@ +<?xml version="1.0" encoding="UTF-8"?> +<CATALOG> + <PLANT> + <COMMON>Bloodroot</COMMON> + <BOTANICAL>Sanguinaria canadensis</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$2.44</PRICE> + <AVAILABILITY>031599</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Columbine</COMMON> + <BOTANICAL>Aquilegia canadensis</BOTANICAL> + <ZONE>3</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$9.37</PRICE> + <AVAILABILITY>030699</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Marsh Marigold</COMMON> + <BOTANICAL>Caltha palustris</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Mostly Sunny</LIGHT> + <PRICE>$6.81</PRICE> + <AVAILABILITY>051799</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Cowslip</COMMON> + <BOTANICAL>Caltha palustris</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$9.90</PRICE> + <AVAILABILITY>030699</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Dutchman's-Breeches</COMMON> + <BOTANICAL>Dicentra cucullaria</BOTANICAL> + <ZONE>3</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$6.44</PRICE> + <AVAILABILITY>012099</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Ginger, Wild</COMMON> + <BOTANICAL>Asarum canadense</BOTANICAL> + <ZONE>3</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$9.03</PRICE> + <AVAILABILITY>041899</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Hepatica</COMMON> + <BOTANICAL>Hepatica americana</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$4.45</PRICE> + <AVAILABILITY>012699</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Liverleaf</COMMON> + <BOTANICAL>Hepatica americana</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$3.99</PRICE> + <AVAILABILITY>010299</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Jack-In-The-Pulpit</COMMON> + <BOTANICAL>Arisaema triphyllum</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$3.23</PRICE> + <AVAILABILITY>020199</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Mayapple</COMMON> + <BOTANICAL>Podophyllum peltatum</BOTANICAL> + <ZONE>3</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$2.98</PRICE> + <AVAILABILITY>060599</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Phlox, Woodland</COMMON> + <BOTANICAL>Phlox divaricata</BOTANICAL> + <ZONE>3</ZONE> + <LIGHT>Sun or Shade</LIGHT> + <PRICE>$2.80</PRICE> + <AVAILABILITY>012299</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Phlox, Blue</COMMON> + <BOTANICAL>Phlox divaricata</BOTANICAL> + <ZONE>3</ZONE> + <LIGHT>Sun or Shade</LIGHT> + <PRICE>$5.59</PRICE> + <AVAILABILITY>021699</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Spring-Beauty</COMMON> + <BOTANICAL>Claytonia Virginica</BOTANICAL> + <ZONE>7</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$6.59</PRICE> + <AVAILABILITY>020199</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Trillium</COMMON> + <BOTANICAL>Trillium grandiflorum</BOTANICAL> + <ZONE>5</ZONE> + <LIGHT>Sun or Shade</LIGHT> + <PRICE>$3.90</PRICE> + <AVAILABILITY>042999</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Wake Robin</COMMON> + <BOTANICAL>Trillium grandiflorum</BOTANICAL> + <ZONE>5</ZONE> + <LIGHT>Sun or Shade</LIGHT> + <PRICE>$3.20</PRICE> + <AVAILABILITY>022199</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Violet, Dog-Tooth</COMMON> + <BOTANICAL>Erythronium americanum</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$9.04</PRICE> + <AVAILABILITY>020199</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Trout Lily</COMMON> + <BOTANICAL>Erythronium americanum</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$6.94</PRICE> + <AVAILABILITY>032499</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Adder's-Tongue</COMMON> + <BOTANICAL>Erythronium americanum</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$9.58</PRICE> + <AVAILABILITY>041399</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Anemone</COMMON> + <BOTANICAL>Anemone blanda</BOTANICAL> + <ZONE>6</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$8.86</PRICE> + <AVAILABILITY>122698</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Grecian Windflower</COMMON> + <BOTANICAL>Anemone blanda</BOTANICAL> + <ZONE>6</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$9.16</PRICE> + <AVAILABILITY>071099</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Bee Balm</COMMON> + <BOTANICAL>Monarda didyma</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$4.59</PRICE> + <AVAILABILITY>050399</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Bergamot</COMMON> + <BOTANICAL>Monarda didyma</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$7.16</PRICE> + <AVAILABILITY>042799</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Black-Eyed Susan</COMMON> + <BOTANICAL>Rudbeckia hirta</BOTANICAL> + <ZONE>Annual</ZONE> + <LIGHT>Sunny</LIGHT> + <PRICE>$9.80</PRICE> + <AVAILABILITY>061899</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Buttercup</COMMON> + <BOTANICAL>Ranunculus</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$2.57</PRICE> + <AVAILABILITY>061099</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Crowfoot</COMMON> + <BOTANICAL>Ranunculus</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$9.34</PRICE> + <AVAILABILITY>040399</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Butterfly Weed</COMMON> + <BOTANICAL>Asclepias tuberosa</BOTANICAL> + <ZONE>Annual</ZONE> + <LIGHT>Sunny</LIGHT> + <PRICE>$2.78</PRICE> + <AVAILABILITY>063099</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Cinquefoil</COMMON> + <BOTANICAL>Potentilla</BOTANICAL> + <ZONE>Annual</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$7.06</PRICE> + <AVAILABILITY>052599</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Primrose</COMMON> + <BOTANICAL>Oenothera</BOTANICAL> + <ZONE>3 - 5</ZONE> + <LIGHT>Sunny</LIGHT> + <PRICE>$6.56</PRICE> + <AVAILABILITY>013099</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Gentian</COMMON> + <BOTANICAL>Gentiana</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Sun or Shade</LIGHT> + <PRICE>$7.81</PRICE> + <AVAILABILITY>051899</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Blue Gentian</COMMON> + <BOTANICAL>Gentiana</BOTANICAL> + <ZONE>4</ZONE> + <LIGHT>Sun or Shade</LIGHT> + <PRICE>$8.56</PRICE> + <AVAILABILITY>050299</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Jacob's Ladder</COMMON> + <BOTANICAL>Polemonium caeruleum</BOTANICAL> + <ZONE>Annual</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$9.26</PRICE> + <AVAILABILITY>022199</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Greek Valerian</COMMON> + <BOTANICAL>Polemonium caeruleum</BOTANICAL> + <ZONE>Annual</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$4.36</PRICE> + <AVAILABILITY>071499</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>California Poppy</COMMON> + <BOTANICAL>Eschscholzia californica</BOTANICAL> + <ZONE>Annual</ZONE> + <LIGHT>Sun</LIGHT> + <PRICE>$7.89</PRICE> + <AVAILABILITY>032799</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Shooting Star</COMMON> + <BOTANICAL>Dodecatheon</BOTANICAL> + <ZONE>Annual</ZONE> + <LIGHT>Mostly Shady</LIGHT> + <PRICE>$8.60</PRICE> + <AVAILABILITY>051399</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Snakeroot</COMMON> + <BOTANICAL>Cimicifuga</BOTANICAL> + <ZONE>Annual</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$5.63</PRICE> + <AVAILABILITY>071199</AVAILABILITY> + </PLANT> + <PLANT> + <COMMON>Cardinal Flower</COMMON> + <BOTANICAL>Lobelia cardinalis</BOTANICAL> + <ZONE>2</ZONE> + <LIGHT>Shade</LIGHT> + <PRICE>$3.02</PRICE> + <AVAILABILITY>022299</AVAILABILITY> + </PLANT> +</CATALOG> http://git-wip-us.apache.org/repos/asf/crunch/blob/958d011a/crunch-test/src/main/resources/xmlSourceSample2.xml ---------------------------------------------------------------------- diff --git a/crunch-test/src/main/resources/xmlSourceSample2.xml b/crunch-test/src/main/resources/xmlSourceSample2.xml new file mode 100644 index 0000000..7d90532 Binary files /dev/null and b/crunch-test/src/main/resources/xmlSourceSample2.xml differ
