[ 
https://issues.apache.org/jira/browse/PIG-3304?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Ahmed Eldawy updated PIG-3304:
------------------------------

    Status: Open  (was: Patch Available)

diff --git java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java 
java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
index 589a545..9daa1a4 100644
--- java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
+++ java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
@@ -212,10 +212,16 @@ class XMLLoaderBufferedPositionedInputStream extends 
BufferedPositionedInputStre
       //startTag[tmp.length+1] = (byte)'>';
       
       
+      // Used to detect tags that are closed inline
+      byte[] inlineCloseTag = {'/', '>'};
 
       ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
       int idxTagChar = 0;
       int idxStartTagChar = 0;
+      int idxInlineCloseTagChar = 0;
+      // A flag to indicate that we are currently inside the tag to be matched
+      // Initially set to true as skipToTag has been called earlier
+      boolean insideMatchTag = true;
       boolean startTagMatched = false;
       /*
        * Read till an end tag is found.It need not check for any condition 
since it 
@@ -247,10 +253,18 @@ class XMLLoaderBufferedPositionedInputStream extends 
BufferedPositionedInputStre
           
           if (b == startTag[idxStartTagChar]){
              ++idxStartTagChar;
-             if(idxStartTagChar == startTag.length)
-                startTagMatched = true ; // Set the flag as true if start tag 
matches
-          }else
-             idxStartTagChar = 0;
+             if(idxStartTagChar == startTag.length) {
+               startTagMatched = true ; // Set the flag as true if start tag 
matches
+               // We are currently inside the tag to be matched
+               insideMatchTag = true;               
+             }
+          } else {
+            if (idxStartTagChar > 1) {
+              // Check before resetting: part of a start tag was matched
+              insideMatchTag = false;
+            }
+            idxStartTagChar = 0;
+          }
             
           
           
@@ -268,6 +282,23 @@ class XMLLoaderBufferedPositionedInputStream extends 
BufferedPositionedInputStre
           } else 
             idxTagChar = 0; 
           
+          if (b == inlineCloseTag[idxInlineCloseTagChar]) {
+            idxInlineCloseTagChar++;
+            if (idxInlineCloseTagChar == inlineCloseTag.length) {
+              idxInlineCloseTagChar = 0;
+              if (insideMatchTag) {
+                if(nestedTags==0) // Break the loop if there were no nested 
tags
+                  break;
+               else{
+                  --nestedTags; // Else decrement the count
+                  idxInlineCloseTagChar = 0; // Reset the index
+               }
+              }
+            }
+          } else {
+            idxInlineCloseTagChar = 0;
+          }
+          
         }
         catch (IOException e) {
           this.setReadable(false);
@@ -339,7 +370,7 @@ class XMLLoaderBufferedPositionedInputStream extends 
BufferedPositionedInputStre
               break;
             case S_MATCH_PREFIX:
               // tag match iff next character is whitespaces or close tag mark
-              if (b == ' ' || b == '\t' || b == '>') {
+              if (Character.isWhitespace(b) || b == '/' || b == '>') {
                 matchBuf.write((byte)(b));
                 state = S_MATCH_TAG;
               } else {
@@ -355,7 +386,7 @@ class XMLLoaderBufferedPositionedInputStream extends 
BufferedPositionedInputStre
             default:
               throw new IllegalArgumentException("Invalid state: " + state);
           }
-          if (state == S_MATCH_TAG && b == '>') {
+          if (state == S_MATCH_TAG && (b == '>' || Character.isWhitespace(b))) 
{
             break;
           }
           if (state != S_MATCH_TAG && this.getPosition() > limit) {
@@ -406,6 +437,12 @@ class XMLLoaderBufferedPositionedInputStream extends 
BufferedPositionedInputStre
     byte[] collectTag(String tagName, long limit) throws IOException {
        ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
        byte[] beginTag = skipToTag(tagName, limit);
+       
+       // Check if the tag is closed inline
+       if (beginTag.length > 2 && beginTag[beginTag.length - 2] == '/' &&
+           beginTag[beginTag.length-1] == '>') {
+         return beginTag;
+       }
 
        // No need to search for the end tag if the start tag is not found
        if(beginTag.length > 0 ){ 
diff --git 
java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java 
java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
index 4adc9cd..f83f0d9 100644
--- java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
+++ java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
@@ -75,6 +75,15 @@ public class TestXMLLoader extends TestCase {
      nestedTags.add(new String[] { "</events>"});
   }
   
+  public static ArrayList<String[]> inlineClosedTags = new 
ArrayList<String[]>();
+  static {
+    inlineClosedTags.add(new String[] { "<events>"});
+    inlineClosedTags.add(new String[] { "<event id='3423'/>"});
+    inlineClosedTags.add(new String[] { "<event/>"});
+    inlineClosedTags.add(new String[] { "<event><event/></event>"});
+    inlineClosedTags.add(new String[] { "</events>"});
+  }
+  
   public void testShouldReturn0TupleCountIfSearchTagIsNotFound () throws 
Exception
   {
     String filename = TestHelper.createTempFile(data, "");
@@ -333,5 +342,26 @@ public class TestXMLLoader extends TestCase {
       assertEquals(3, tupleCount);  
    }
    
-   
+
+   public void testXMLLoaderShouldWorkWithInlineClosedTags() throws Exception {
+     String filename = TestHelper.createTempFile(inlineClosedTags, "");
+     PigServer pig = new PigServer(LOCAL);
+     filename = filename.replace("\\", "\\\\");
+     patternString = patternString.replace("\\", "\\\\");
+     String query = "A = LOAD 'file:" + filename + "' USING 
org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
+     pig.registerQuery(query);
+     Iterator<?> it = pig.openIterator("A");
+     int tupleCount = 0;
+     while (it.hasNext()) {
+       Tuple tuple = (Tuple) it.next();
+       if (tuple == null)
+         break;
+       else {
+         if (tuple.size() > 0) {
+             tupleCount++;
+         }
+       }
+     }
+     assertEquals(3, tupleCount);  
+   }
 }
                
> XMLLoader in piggybank does not work with inline closed tags
> ------------------------------------------------------------
>
>                 Key: PIG-3304
>                 URL: https://issues.apache.org/jira/browse/PIG-3304
>             Project: Pig
>          Issue Type: Bug
>          Components: piggybank
>    Affects Versions: 0.11.1
>            Reporter: Ahmed Eldawy
>              Labels: patch
>         Attachments: xmlloader_inline_close_tag.patch
>
>
> The XMLLoader fails to return elements whose tags are closed inline
> (self-closing tags), such as <event id="342"/>.

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators.
For more information on JIRA, see: http://www.atlassian.com/software/jira

Reply via email to