[ 
https://issues.apache.org/jira/browse/PIG-3373?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Ahmed Eldawy updated PIG-3373:
------------------------------

    Status: Patch Available  (was: Open)

diff --git java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
index 7b1a75c..0909795 100644
--- java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
+++ java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
@@ -336,7 +336,7 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre
           if (state == S_MATCH_TAG && (b == '>' || Character.isWhitespace(b))) {
             break;
           }
-          if (state != S_MATCH_TAG && this.getPosition() > limit) {
+          if (matchBuf.size() == 0 && this.getPosition() > limit) {
             // need to break, no record in this block
             break;
           }
diff --git java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
index e9abbc0..d908cde 100644
--- java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
+++ java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
@@ -17,6 +17,7 @@ import static org.apache.pig.ExecType.LOCAL;
 
 import java.io.ByteArrayInputStream;
 import java.io.File;
+import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.Iterator;
 
@@ -25,6 +26,8 @@ import javax.xml.parsers.DocumentBuilderFactory;
 
 import junit.framework.TestCase;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.pig.ExecType;
 import org.apache.pig.PigServer;
 import org.apache.pig.data.Tuple;
@@ -391,4 +394,54 @@ public class TestXMLLoader extends TestCase {
        }
      }
    }
+   
+   
+   public void testXMLLoaderShouldNotReturnLastNonMatchedTag() throws Exception {
+     long blockSize = FileSystem.get(new Configuration()).getDefaultBlockSize();
+     File tempFile = File.createTempFile("bad-file", ".xml");
+     String one_k_line = "<event>xxx</event>";
+     int content_length = 1024 - one_k_line.length() + 3;
+     String content = String.format("%"+content_length+"s", "content");
+     one_k_line = one_k_line.replace("xxx", content);
+     
+     PrintStream ps = new PrintStream(tempFile);
+     long total_size = 0;
+     int number_of_correct_tags = 0;
+     while (total_size + one_k_line.length() < blockSize) {
+       ps.print(one_k_line);
+       total_size += one_k_line.length();
+       number_of_correct_tags++;
+     }
+     String last_tag = one_k_line.replace("content", "cont");
+     ps.print(last_tag);
+     number_of_correct_tags++;
+     total_size += last_tag.length();
+     String bad_content = "<event-content-should-not-return-this/>";
+     ps.print(bad_content);
+     total_size += bad_content.length();
+     while (total_size % blockSize < (blockSize / 2)) {
+       ps.print(bad_content);
+       total_size += bad_content.length();
+     }
+     ps.close();
+     
+     PigServer pig = new PigServer(LOCAL);
+     String tempFileName = tempFile.getAbsolutePath().replace("\\", "\\\\");
+     patternString = patternString.replace("\\", "\\\\");
+     String query = "A = LOAD '" + tempFileName + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
+     pig.registerQuery(query);
+     Iterator<?> it = pig.openIterator("A");
+     int tupleCount = 0;
+     while (it.hasNext()) {
+       Tuple tuple = (Tuple) it.next();
+       if (tuple == null)
+         break;
+       else {
+         if (tuple.size() > 0) {
+             tupleCount++;
+         }
+       }
+     }
+     assertEquals(number_of_correct_tags, tupleCount);  
+   }
 }
                
> XMLLoader returns non-matching nodes when a tag name spans through the block 
> boundary
> -------------------------------------------------------------------------------------
>
>                 Key: PIG-3373
>                 URL: https://issues.apache.org/jira/browse/PIG-3373
>             Project: Pig
>          Issue Type: Bug
>          Components: piggybank
>            Reporter: Ahmed Eldawy
>            Assignee: Ahmed Eldawy
>              Labels: patch
>         Attachments: PIG3373.patch
>
>
> When a node's start tag spans two blocks, the tag is returned even if it is 
> not of the requested type.
> Example: For the following input file
> <event id="3423">
> <ev
> -------- BLOCK BOUNDARY
> entually id="dfasd">
> XMLLoader with tag type 'event' should return only the first one, but it 
> actually returns both of them.

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira

Reply via email to