[
https://issues.apache.org/jira/browse/PIG-3304?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Ahmed Eldawy updated PIG-3304:
------------------------------
Status: Open (was: Patch Available)
diff --git java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
index 589a545..9daa1a4 100644
--- java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
+++ java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
@@ -212,10 +212,16 @@ class XMLLoaderBufferedPositionedInputStream extends
BufferedPositionedInputStre
//startTag[tmp.length+1] = (byte)'>';
+ // Used to detect tags that are closed inline
+ byte[] inlineCloseTag = {'/', '>'};
ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
int idxTagChar = 0;
int idxStartTagChar = 0;
+ int idxInlineCloseTagChar = 0;
+ // A flag to indicate that we are currently inside the tag to be matched
+ // Initially set to true as skipToTag has been called earlier
+ boolean insideMatchTag = true;
boolean startTagMatched = false;
/*
* Read till an end tag is found.It need not check for any condition
since it
@@ -247,10 +253,18 @@ class XMLLoaderBufferedPositionedInputStream extends
BufferedPositionedInputStre
if (b == startTag[idxStartTagChar]){
++idxStartTagChar;
- if(idxStartTagChar == startTag.length)
- startTagMatched = true ; // Set the flag as true if start tag
matches
- }else
- idxStartTagChar = 0;
+ if(idxStartTagChar == startTag.length) {
+ startTagMatched = true ; // Set the flag as true if start tag
matches
+ // We are currently inside the tag to be matched
+ insideMatchTag = true;
+ }
+ } else {
+ if (idxStartTagChar > 1) {
+ // Partial start-tag match of another element; test before the reset below
+ insideMatchTag = false;
+ }
+ idxStartTagChar = 0;
+ }
@@ -268,6 +282,23 @@ class XMLLoaderBufferedPositionedInputStream extends
BufferedPositionedInputStre
} else
idxTagChar = 0;
+ if (b == inlineCloseTag[idxInlineCloseTagChar]) {
+ idxInlineCloseTagChar++;
+ if (idxInlineCloseTagChar == inlineCloseTag.length) {
+ idxInlineCloseTagChar = 0;
+ if (insideMatchTag) {
+ if(nestedTags==0) // Break the loop if there were no nested
tags
+ break;
+ else{
+ --nestedTags; // Else decrement the count
+ idxInlineCloseTagChar = 0; // Reset the index
+ }
+ }
+ }
+ } else {
+ idxInlineCloseTagChar = 0;
+ }
+
}
catch (IOException e) {
this.setReadable(false);
@@ -339,7 +370,7 @@ class XMLLoaderBufferedPositionedInputStream extends
BufferedPositionedInputStre
break;
case S_MATCH_PREFIX:
// tag match iff next character is whitespaces or close tag mark
- if (b == ' ' || b == '\t' || b == '>') {
+ if (Character.isWhitespace(b) || b == '/' || b == '>') {
matchBuf.write((byte)(b));
state = S_MATCH_TAG;
} else {
@@ -355,7 +386,7 @@ class XMLLoaderBufferedPositionedInputStream extends
BufferedPositionedInputStre
default:
throw new IllegalArgumentException("Invalid state: " + state);
}
- if (state == S_MATCH_TAG && b == '>') {
+ if (state == S_MATCH_TAG && (b == '>' || Character.isWhitespace(b)))
{
break;
}
if (state != S_MATCH_TAG && this.getPosition() > limit) {
@@ -406,6 +437,12 @@ class XMLLoaderBufferedPositionedInputStream extends
BufferedPositionedInputStre
byte[] collectTag(String tagName, long limit) throws IOException {
ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
byte[] beginTag = skipToTag(tagName, limit);
+
+ // Check if the tag is closed inline
+ if (beginTag.length > 2 && beginTag[beginTag.length - 2] == '/' &&
+ beginTag[beginTag.length-1] == '>') {
+ return beginTag;
+ }
// No need to search for the end tag if the start tag is not found
if(beginTag.length > 0 ){
diff --git
java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
index 4adc9cd..f83f0d9 100644
--- java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
+++ java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
@@ -75,6 +75,15 @@ public class TestXMLLoader extends TestCase {
nestedTags.add(new String[] { "</events>"});
}
+ public static ArrayList<String[]> inlineClosedTags = new
ArrayList<String[]>();
+ static {
+ inlineClosedTags.add(new String[] { "<events>"});
+ inlineClosedTags.add(new String[] { "<event id='3423'/>"});
+ inlineClosedTags.add(new String[] { "<event/>"});
+ inlineClosedTags.add(new String[] { "<event><event/></event>"});
+ inlineClosedTags.add(new String[] { "</events>"});
+ }
+
public void testShouldReturn0TupleCountIfSearchTagIsNotFound () throws
Exception
{
String filename = TestHelper.createTempFile(data, "");
@@ -333,5 +342,26 @@ public class TestXMLLoader extends TestCase {
assertEquals(3, tupleCount);
}
-
+
+ public void testXMLLoaderShouldWorkWithInlineClosedTags() throws Exception {
+ String filename = TestHelper.createTempFile(inlineClosedTags, "");
+ PigServer pig = new PigServer(LOCAL);
+ filename = filename.replace("\\", "\\\\");
+ patternString = patternString.replace("\\", "\\\\");
+ String query = "A = LOAD 'file:" + filename + "' USING
org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
+ pig.registerQuery(query);
+ Iterator<?> it = pig.openIterator("A");
+ int tupleCount = 0;
+ while (it.hasNext()) {
+ Tuple tuple = (Tuple) it.next();
+ if (tuple == null)
+ break;
+ else {
+ if (tuple.size() > 0) {
+ tupleCount++;
+ }
+ }
+ }
+ assertEquals(3, tupleCount);
+ }
}
> XMLLoader in piggybank does not work with inline closed tags
> ------------------------------------------------------------
>
> Key: PIG-3304
> URL: https://issues.apache.org/jira/browse/PIG-3304
> Project: Pig
> Issue Type: Bug
> Components: piggybank
> Affects Versions: 0.11.1
> Reporter: Ahmed Eldawy
> Labels: patch
> Attachments: xmlloader_inline_close_tag.patch
>
>
> The XMLLoader fails to return elements whose tags are closed inline
> (self-closing tags), such as <event id="342"/>
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira