[
https://issues.apache.org/jira/browse/PIG-3373?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Ahmed Eldawy updated PIG-3373:
------------------------------
Status: Patch Available (was: Open)
diff --git java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
index 7b1a75c..0909795 100644
--- java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
+++ java/src/main/java/org/apache/pig/piggybank/storage/XMLLoader.java
@@ -336,7 +336,7 @@ class XMLLoaderBufferedPositionedInputStream extends
BufferedPositionedInputStre
if (state == S_MATCH_TAG && (b == '>' || Character.isWhitespace(b)))
{
break;
}
- if (state != S_MATCH_TAG && this.getPosition() > limit) {
+ if (matchBuf.size() == 0 && this.getPosition() > limit) {
// need to break, no record in this block
break;
}
diff --git
java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
index e9abbc0..d908cde 100644
--- java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
+++ java/src/test/java/org/apache/pig/piggybank/test/storage/TestXMLLoader.java
@@ -17,6 +17,7 @@ import static org.apache.pig.ExecType.LOCAL;
import java.io.ByteArrayInputStream;
import java.io.File;
+import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
@@ -25,6 +26,8 @@ import javax.xml.parsers.DocumentBuilderFactory;
import junit.framework.TestCase;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.data.Tuple;
@@ -391,4 +394,54 @@ public class TestXMLLoader extends TestCase {
}
}
}
+
+
+ public void testXMLLoaderShouldNotReturnLastNonMatchedTag() throws
Exception {
+ long blockSize = FileSystem.get(new
Configuration()).getDefaultBlockSize();
+ File tempFile = File.createTempFile("bad-file", ".xml");
+ String one_k_line = "<event>xxx</event>";
+ int content_length = 1024 - one_k_line.length() + 3;
+ String content = String.format("%"+content_length+"s", "content");
+ one_k_line = one_k_line.replace("xxx", content);
+
+ PrintStream ps = new PrintStream(tempFile);
+ long total_size = 0;
+ int number_of_correct_tags = 0;
+ while (total_size + one_k_line.length() < blockSize) {
+ ps.print(one_k_line);
+ total_size += one_k_line.length();
+ number_of_correct_tags++;
+ }
+ String last_tag = one_k_line.replace("content", "cont");
+ ps.print(last_tag);
+ number_of_correct_tags++;
+ total_size += last_tag.length();
+ String bad_content = "<event-content-should-not-return-this/>";
+ ps.print(bad_content);
+ total_size += bad_content.length();
+ while (total_size % blockSize < (blockSize / 2)) {
+ ps.print(bad_content);
+ total_size += bad_content.length();
+ }
+ ps.close();
+
+ PigServer pig = new PigServer(LOCAL);
+ String tempFileName = tempFile.getAbsolutePath().replace("\\", "\\\\");
+ patternString = patternString.replace("\\", "\\\\");
+ String query = "A = LOAD '" + tempFileName + "' USING
org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
+ pig.registerQuery(query);
+ Iterator<?> it = pig.openIterator("A");
+ int tupleCount = 0;
+ while (it.hasNext()) {
+ Tuple tuple = (Tuple) it.next();
+ if (tuple == null)
+ break;
+ else {
+ if (tuple.size() > 0) {
+ tupleCount++;
+ }
+ }
+ }
+ assertEquals(number_of_correct_tags, tupleCount);
+ }
}
> XMLLoader returns non-matching nodes when a tag name spans through the block
> boundary
> -------------------------------------------------------------------------------------
>
> Key: PIG-3373
> URL: https://issues.apache.org/jira/browse/PIG-3373
> Project: Pig
> Issue Type: Bug
> Components: piggybank
> Reporter: Ahmed Eldawy
> Assignee: Ahmed Eldawy
> Labels: patch
> Attachments: PIG3373.patch
>
>
> When a node's start tag spans two blocks, the tag is returned even if it is not
> of the requested type.
> Example: For the following input file
> <event id="3423">
> <ev
> -------- BLOCK BOUNDARY
> entually id="dfasd">
> XMLLoader with tag type 'event' should return only the first one, but it
> actually returns both of them.
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira