[ https://issues.apache.org/jira/browse/PIG-3373?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ahmed Eldawy updated PIG-3373: ------------------------------ Status: Patch Available (was: Open) diff --git java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java index 7b1a75c..0909795 100644 --- java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java +++ java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java @@ -336,7 +336,7 @@ class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStre if (state == S_MATCH_TAG && (b == '>' || Character.isWhitespace(b))) { break; } - if (state != S_MATCH_TAG && this.getPosition() > limit) { + if (matchBuf.size() == 0 && this.getPosition() > limit) { // need to break, no record in this block break; } diff --git java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java index e9abbc0..d908cde 100644 --- java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java +++ java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java @@ -17,6 +17,7 @@ import static org.apache.pig.ExecType.LOCAL; import java.io.ByteArrayInputStream; import java.io.File; +import java.io.PrintStream; import java.util.ArrayList; import java.util.Iterator; @@ -25,6 +26,8 @@ import javax.xml.parsers.DocumentBuilderFactory; import junit.framework.TestCase; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.apache.pig.ExecType; import org.apache.pig.PigServer; import org.apache.pig.data.Tuple; @@ -391,4 +394,54 @@ public class TestXMLLoader extends TestCase { } } } + + + public void testXMLLoaderShouldNotReturnLastNonMatchedTag() throws Exception { + long blockSize = FileSystem.get(new Configuration()).getDefaultBlockSize(); + File tempFile = File.createTempFile("bad-file", ".xml"); + String one_k_line = "<event>xxx</event>"; + int content_length = 1024 - one_k_line.length() 
+ 3; + String content = String.format("%"+content_length+"s", "content"); + one_k_line = one_k_line.replace("xxx", content); + + PrintStream ps = new PrintStream(tempFile); + long total_size = 0; + int number_of_correct_tags = 0; + while (total_size + one_k_line.length() < blockSize) { + ps.print(one_k_line); + total_size += one_k_line.length(); + number_of_correct_tags++; + } + String last_tag = one_k_line.replace("content", "cont"); + ps.print(last_tag); + number_of_correct_tags++; + total_size += last_tag.length(); + String bad_content = "<event-content-should-not-return-this/>"; + ps.print(bad_content); + total_size += bad_content.length(); + while (total_size % blockSize < (blockSize / 2)) { + ps.print(bad_content); + total_size += bad_content.length(); + } + ps.close(); + + PigServer pig = new PigServer(LOCAL); + String tempFileName = tempFile.getAbsolutePath().replace("\\", "\\\\"); + patternString = patternString.replace("\\", "\\\\"); + String query = "A = LOAD '" + tempFileName + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);"; + pig.registerQuery(query); + Iterator<?> it = pig.openIterator("A"); + int tupleCount = 0; + while (it.hasNext()) { + Tuple tuple = (Tuple) it.next(); + if (tuple == null) + break; + else { + if (tuple.size() > 0) { + tupleCount++; + } + } + } + assertEquals(number_of_correct_tags, tupleCount); + } } > XMLLoader returns non-matching nodes when a tag name spans through the block > boundary > ------------------------------------------------------------------------------------- > > Key: PIG-3373 > URL: https://issues.apache.org/jira/browse/PIG-3373 > Project: Pig > Issue Type: Bug > Components: piggybank > Reporter: Ahmed Eldawy > Assignee: Ahmed Eldawy > Labels: patch > Attachments: PIG3373.patch > > > When node start tag spans two blocks this tag is returned even if it is not > of the type. 
> Example: For the following input file: > <event id="3423"> > <ev > -------- BLOCK BOUNDARY > entually id="dfasd"> > XMLLoader with tag type 'event' should return only the first one, but it > actually returns both of them. -- This message is automatically generated by JIRA. If you think it was sent incorrectly, please contact your JIRA administrators. For more information on JIRA, see: http://www.atlassian.com/software/jira