Author: daijy
Date: Thu Jan 15 19:18:34 2015
New Revision: 1652238
URL: http://svn.apache.org/r1652238
Log:
PIG-4355: Piggybank: XPath cant handle namespace in xpath, nor can it return
more than one match
Added:
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPathAll.java
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathAllTest.java
Modified:
pig/trunk/CHANGES.txt
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPath.java
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathTest.java
Modified: pig/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1652238&r1=1652237&r2=1652238&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Thu Jan 15 19:18:34 2015
@@ -42,6 +42,9 @@ PIG-4333: Split BigData tests into multi
BUG FIXES
+PIG-4355: Piggybank: XPath cant handle namespace in xpath, nor can it return
more than one match
+ (cavanaug via daijy)
+
PIG-4371: Duplicate snappy.version in libraries.properties (daijy)
PIG-4368: Port local mode tests to Tez - TestLoadStoreFuncLifeCycle (daijy)
Modified:
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPath.java
URL:
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPath.java?rev=1652238&r1=1652237&r2=1652238&view=diff
==============================================================================
---
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPath.java
(original)
+++
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPath.java
Thu Jan 15 19:18:34 2015
@@ -38,113 +38,203 @@ import org.xml.sax.InputSource;
*/
public class XPath extends EvalFunc<String> {
- /** Hold onto last xpath & xml in case the next call to xpath() is feeding
the same xml document
- * The reason for this is because creating an xpath object is costly. */
+ /**
+ * Hold onto the last xpath & xml in case the next call to xpath() is fed
+ * the same xml document. The reason for this is that creating an xpath
+ * object is costly.
+ */
private javax.xml.xpath.XPath xpath = null;
private String xml = null;
private Document document;
private static boolean cache = true;
+ private static boolean ignoreNamespace = true;
+ public static final String EMPTY_STRING = "";
/**
- * input should contain: 1) xml 2) xpath 3) optional cache xml doc flag
+ * input should contain: 1) xml 2) xpath
+ * 3) optional cache xml doc flag
+ * 4) optional ignore namespace flag
*
* Usage:
* 1) XPath(xml, xpath)
* 2) XPath(xml, xpath, false)
+ * 3) XPath(xml, xpath, false, false)
*
- * @param 1st element should to be the xml
+ * @param input
+ * 1st element should be the xml
* 2nd element should be the xpath
* 3rd optional boolean cache flag (default true)
+ * 4th optional boolean ignore namespace flag (default true)
+ *
+ *
+ * This UDF will cache the last xml document. This is helpful when
+ * multiple consecutive xpath calls are made for the same xml
+ * document. Caching can be turned off to ensure that the UDF
+ * recreates the internal javax.xml.xpath.XPath for every call.
*
- * This UDF will cache the last xml document. This is helpful when
multiple consecutive xpath calls are made for the same xml document.
- * Caching can be turned off to ensure that the UDF's recreates the
internal javax.xml.xpath.XPath for every call
+ * This UDF will also support ignoring the namespace in the xml
tags.
+ * This will help to search xpath items by ignoring its namespace.
+ * Ignoring of the namespace can be turned off for special cases
using
+ * a fourth argument in the UDF.
*
* @return chararrary result or null if no match
*/
@Override
public String exec(final Tuple input) throws IOException {
- if (input == null || input.size() <= 1) {
- warn("Error processing input, not enough parameters or null input"
+ input,
- PigWarning.UDF_WARNING_1);
- return null;
- }
-
-
- if (input.size() > 3) {
- warn("Error processing input, too many parameters" + input,
- PigWarning.UDF_WARNING_1);
+ if (!isArgsValid(input)) { // Validate arguments
return null;
}
try {
final String xml = (String) input.get(0);
+
if (xml == null) {
return null;
}
if(input.size() > 2)
cache = (Boolean) input.get(2);
-
- if(!cache || xpath == null || !xml.equals(this.xml))
- {
+
+ if (!cache || xpath == null || !xml.equals(this.xml)) {
final InputSource source = new InputSource(new
StringReader(xml));
-
- this.xml = xml; //track the xml for subsequent calls to this
udf
+
+ this.xml = xml; // track the xml for subsequent calls to this
udf
final DocumentBuilderFactory dbf =
DocumentBuilderFactory.newInstance();
final DocumentBuilder db = dbf.newDocumentBuilder();
-
+
this.document = db.parse(source);
final XPathFactory xpathFactory = XPathFactory.newInstance();
this.xpath = xpathFactory.newXPath();
-
+
+ }
+
+ String xpathString = (String) input.get(1);
+
+ if (ignoreNamespace) {
+ xpathString = createNameSpaceIgnoreXpathString(xpathString);
}
-
- final String xpathString = (String) input.get(1);
final String value = xpath.evaluate(xpathString, document);
return value;
} catch (Exception e) {
- warn("Error processing input " + input.getType(0),
- PigWarning.UDF_WARNING_1);
-
+ warn("Error processing input " + input.getType(0),
PigWarning.UDF_WARNING_1);
+
return null;
}
}
+
+ /**
+ * Validates values of the input parameters.
+ *
+ * @param Tuple
+ * @return boolean
+ */
+ private boolean isArgsValid(final Tuple input) {
+ if (input == null || input.size() <= 1) {
+ warn("Error processing input, not enough parameters or null input"
+ input, PigWarning.UDF_WARNING_1);
+ return false;
+ }
+
+ if (input.size() > 4) {
+ warn("Error processing input, too many parameters" + input,
PigWarning.UDF_WARNING_1);
+ return false;
+ }
+
+ try {
+ // 3rd Parameter - CACHE
+ if (input.size() > 2 && !(input.get(2) instanceof Boolean)) {
+ warn("Error processing input, invalid value in 3rd parameter"
+ input, PigWarning.UDF_WARNING_1);
+ return false;
+ }
+
+ // 4th Parameter - IGNORE_NAMESPACE
+ if (input.size() > 3 && !(input.get(3) instanceof Boolean)) {
+ warn("Error processing input, invalid value in 4th parameter"
+ input, PigWarning.UDF_WARNING_1);
+ return false;
+ }
+ } catch (Exception ex) {
+ return false;
+ }
+ return true;
+ }
+
+
+ /**
+ * Returns a new xPathString by adding additional parameters
+ * to the existing xPathString for ignoring the namespace during compilation.
+ *
+ * @param String xpathString
+ * @return String modified xpathString
+ */
+ private String createNameSpaceIgnoreXpathString(final String xpathString) {
+ final String QUERY_PREFIX = "//*";
+ final String LOCAL_PREFIX = "[local-name()='";
+ final String LOCAL_POSTFIX = "']";
+ final String SPLITTER = "/";
+
+ try {
+ String xpathStringWithLocalName = EMPTY_STRING;
+ String[] individualNodes = xpathString.split(SPLITTER);
+
+ for (String node : individualNodes) {
+ xpathStringWithLocalName =
xpathStringWithLocalName.concat(QUERY_PREFIX + LOCAL_PREFIX + node
+ + LOCAL_POSTFIX);
+ }
+ return xpathStringWithLocalName;
+ } catch (Exception ex) {
+ return xpathString;
+ }
+ }
+
+ /**
+ * Returns argument schemas of the UDF.
+ *
+ * @return List
+ */
+ @Override
+ public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
- @Override
- public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+ final List<FuncSpec> funcList = new ArrayList<FuncSpec>();
- final List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+ /* either two chararray arguments */
+ List<FieldSchema> fields = new ArrayList<FieldSchema>();
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
- /*either two chararray arguments*/
- List<FieldSchema> fields = new ArrayList<FieldSchema>();
- fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
- fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ Schema twoArgInSchema = new Schema(fields);
- Schema twoArgInSchema = new Schema(fields);
+ funcList.add(new FuncSpec(this.getClass().getName(), twoArgInSchema));
- funcList.add(new FuncSpec(this.getClass().getName(),
twoArgInSchema));
+ /* or two chararray and a boolean argument */
+ fields = new ArrayList<FieldSchema>();
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
- /*or two chararray and a boolean argument*/
- fields = new ArrayList<FieldSchema>();
- fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
- fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
- fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
+ Schema threeArgInSchema = new Schema(fields);
- Schema threeArgInSchema = new Schema(fields);
+ funcList.add(new FuncSpec(this.getClass().getName(),
threeArgInSchema));
- funcList.add(new FuncSpec(this.getClass().getName(),
threeArgInSchema));
+ /* or two chararray and two boolean arguments */
+ fields = new ArrayList<FieldSchema>();
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
+ fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
- return funcList;
- }
+ Schema fourArgInSchema = new Schema(fields);
-}
+ funcList.add(new FuncSpec(this.getClass().getName(), fourArgInSchema));
+ return funcList;
+ }
+
+}
Added:
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPathAll.java
URL:
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPathAll.java?rev=1652238&view=auto
==============================================================================
---
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPathAll.java
(added)
+++
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPathAll.java
Thu Jan 15 19:18:34 2015
@@ -0,0 +1,319 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License
at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.
+ * See the License for the specific language governing permissions and
limitations under the License.
+ */
+
+package org.apache.pig.piggybank.evaluation.xml;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathFactory;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.PigWarning;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+/**
+ * XPathAll is a function that allows for Tuple extraction from xml
+ */
+public class XPathAll extends EvalFunc<Tuple> {
+
+ private javax.xml.xpath.XPath xmlPath = null;
+ private String xml = null;
+ private Document document;
+
+ /**
+ * Caching of the xpath & xml in case the next call to xpath() is fed
+ * the same xml document. The reason for this is that creating an xpath
+ * object is costly.
+ */
+ private static boolean cache = true;
+ private static boolean ignoreNamespace = true;
+
+ private static TupleFactory tupleFactory = TupleFactory.getInstance();
+
+ public static enum ARGUMENTS {
+ XML_FILE(0), XPATH(1), CACHE(2), IGNORE_NAMESPACE(3);
+
+ private int argument;
+
+ ARGUMENTS(int argument) {
+ this.argument = argument;
+ }
+
+ int getPosition() {
+ return this.argument;
+ }
+ }
+
+ public static final String EMPTY_STRING = "";
+
+ /**
+ * input should contain: 1) xml 2) xpath 3) optional cache xml doc flag 4)
+ * optional ignore namespace flag
+ *
+ * The optional fourth parameter (IGNORE_NAMESPACE), if set true will
remove
+ * the namespace from xPath For example xpath /html:body/html:div will be
+ * considered as /body/div
+ *
+ * Usage: 1) XPathAll(xml, xpath)
+ * 2) XPathAll(xml, xpath, false)
+ * 3) XPathAll(xml, xpath, false, false)
+ *
+ * @param input
+ * 1st element should be the xml, 2nd element should be the xpath
+ * 3rd optional boolean cache flag (default true)
+ * 4th optional boolean ignore namespace flag(default true)
+ *
+ * This UDF will cache the last xml document. This is helpful when
+ * multiple consecutive xpathAll calls are made for the same xml
+ * document. Caching can be turned off to ensure that the UDF
+ * recreates the internal javax.xml.xpath.XPath for every call.
+ *
+ * This UDF will also support ignoring the namespace in the xml
tags.
+ * This will help to search xpath items by ignoring its namespace.
+ * Ignoring of the namespace can be turned off for special cases
using
+ * a fourth argument in the UDF.
+ *
+ *
+ * @return Tuple result or null if no match
+ */
+ @Override
+ public Tuple exec(final Tuple input) throws IOException {
+
+ if (!isArgsValid(input)) { // Validate arguments
+ return null;
+ }
+
+ try {
+
+ final String xml = (String)
input.get(ARGUMENTS.XML_FILE.getPosition());
+ if (xml == null) {
+ warn("Error processing input, invalid parameter" + input,
PigWarning.UDF_WARNING_1);
+ return null;
+ }
+
+ if (input.size() > 2) {
+ cache = (Boolean) input.get(ARGUMENTS.CACHE.getPosition());
+ }
+
+ if (input.size() > 3) {
+ ignoreNamespace = (Boolean)
input.get(ARGUMENTS.IGNORE_NAMESPACE.getPosition());
+ }
+
+ // Process XML
+ if (!cache || xmlPath == null || !xml.equals(this.xml)) { // Cache
verification
+ final InputSource source = new InputSource(new
StringReader(xml));
+
+ this.xml = xml; // track the xml for subsequent calls to this
udf
+
+ final DocumentBuilderFactory dbf =
DocumentBuilderFactory.newInstance();
+ final DocumentBuilder db = dbf.newDocumentBuilder();
+
+ this.document = db.parse(source);
+
+ final XPathFactory xpathFactory = XPathFactory.newInstance();
+ this.xmlPath = xpathFactory.newXPath();
+ }
+
+ String xpathString = (String)
input.get(ARGUMENTS.XPATH.getPosition());
+
+ if (ignoreNamespace) {
+ xpathString = createNameSpaceIgnoreXpathString(xpathString);
+ }
+
+ final NodeList nodeEntries = (NodeList)
xmlPath.compile(xpathString).evaluate(document,
+ XPathConstants.NODESET);
+
+ if (nodeEntries == null) {
+ return null;
+ }
+
+ Tuple resultTuple = tupleFactory.newTuple(nodeEntries.getLength());
+
+ for (int nodeEntryIndex = 0; nodeEntryIndex <
nodeEntries.getLength(); nodeEntryIndex++) {
+ final String ELEMENT_NODE_SEPARATOR = ", ";
+
+ Node node = nodeEntries.item(nodeEntryIndex);
+
+ // Parse the Node
+ final NodeList childNodes = node.getChildNodes();
+ if (childNodes == null) {
+ continue;
+ }
+
+ String nodeData = "";
+ boolean dataFlag = false;
+
+ for (int i = 0; i < childNodes.getLength(); i++) {
+ try {
+ Node subNode = childNodes.item(i);
+ if (subNode.getNodeType() == Node.ELEMENT_NODE) {
+ if (subNode.getFirstChild().getNodeValue() ==
null) {
+ // If There is no direct element, return blank
+ nodeData =
nodeData.concat(ELEMENT_NODE_SEPARATOR);
+ nodeData = nodeData.concat(EMPTY_STRING);
+ dataFlag = true;
+ continue;
+ }
+ nodeData = nodeData.concat(ELEMENT_NODE_SEPARATOR);
+ nodeData =
nodeData.concat(subNode.getFirstChild().getNodeValue());
+ dataFlag = true;
+ } else if (subNode.getNodeType() == Node.TEXT_NODE
+ || subNode.getNodeType() ==
Node.ATTRIBUTE_NODE) {
+ nodeData = nodeData.concat(ELEMENT_NODE_SEPARATOR);
+ nodeData = nodeData.concat(subNode.getNodeValue());
+ dataFlag = true;
+ }
+ } catch (Exception ex) {
+ continue;
+ }
+ }
+
+ if (dataFlag) {
+ nodeData = nodeData.replaceFirst(ELEMENT_NODE_SEPARATOR,
EMPTY_STRING);
+ resultTuple.set(nodeEntryIndex, nodeData);
+ }
+ }
+ return resultTuple;
+
+ } catch (Exception e) {
+ warn("Error processing input " + input.getType(0),
PigWarning.UDF_WARNING_1);
+ return null;
+ }
+ }
+
+
+ /**
+ * Validates values of the input parameters.
+ *
+ * @param Tuple
+ * @return boolean
+ */
+ private boolean isArgsValid(final Tuple input) {
+ if (input == null || input.size() <= 1) {
+ warn("Error processing input, not enough parameters or null input"
+ input, PigWarning.UDF_WARNING_1);
+ return false;
+ }
+
+ if (input.size() > 4) {
+ warn("Error processing input, too many parameters" + input,
PigWarning.UDF_WARNING_1);
+ return false;
+ }
+
+ try {
+ // 3rd Parameter - CACHE
+ if (input.size() > 2 && !(input.get(ARGUMENTS.CACHE.getPosition())
instanceof Boolean)) {
+ warn("Error processing input, invalid value in 3rd parameter"
+ input, PigWarning.UDF_WARNING_1);
+ return false;
+ }
+
+ // 4th Parameter - IGNORE_NAMESPACE
+ if (input.size() > 3 &&
!(input.get(ARGUMENTS.IGNORE_NAMESPACE.getPosition()) instanceof Boolean)) {
+ warn("Error processing input, invalid value in 4th parameter"
+ input, PigWarning.UDF_WARNING_1);
+ return false;
+ }
+ } catch (Exception ex) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Returns a new xPathString by adding additional parameters
+ * to the existing xPathString for ignoring the namespace during compilation.
+ *
+ * @param String xpathString
+ * @return String modified xpathString
+ */
+ private String createNameSpaceIgnoreXpathString(final String xpathString) {
+ final String QUERY_PREFIX = "//*";
+ final String LOCAL_PREFIX = "[local-name()='";
+ final String LOCAL_POSTFIX = "']";
+ final String SPLITTER = "/";
+
+ try {
+ String xpathStringWithLocalName = EMPTY_STRING;
+ String[] individualNodes = xpathString.split(SPLITTER);
+
+ for (String node : individualNodes) {
+ xpathStringWithLocalName =
xpathStringWithLocalName.concat(QUERY_PREFIX + LOCAL_PREFIX + node
+ + LOCAL_POSTFIX);
+ }
+
+ return xpathStringWithLocalName;
+ } catch (Exception ex) {
+ return xpathString;
+ }
+ }
+
+ /**
+ * Returns argument schemas of the UDF.
+ *
+ * @return List
+ */
+
+ @Override
+ public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+
+ final List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+
+ /* either two chararray arguments */
+ List<FieldSchema> fields = new ArrayList<FieldSchema>();
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+
+ Schema twoArgInSchema = new Schema(fields);
+
+ funcList.add(new FuncSpec(this.getClass().getName(), twoArgInSchema));
+
+ /* or two chararray and a boolean argument */
+ fields = new ArrayList<FieldSchema>();
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
+
+ Schema threeArgInSchema = new Schema(fields);
+
+ funcList.add(new FuncSpec(this.getClass().getName(),
threeArgInSchema));
+
+ /* or two chararray and two boolean arguments */
+ fields = new ArrayList<FieldSchema>();
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+ fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
+ fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
+
+ Schema fourArgInSchema = new Schema(fields);
+
+ funcList.add(new FuncSpec(this.getClass().getName(), fourArgInSchema));
+
+ return funcList;
+ }
+
+}
Added:
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathAllTest.java
URL:
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathAllTest.java?rev=1652238&view=auto
==============================================================================
---
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathAllTest.java
(added)
+++
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathAllTest.java
Thu Jan 15 19:18:34 2015
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License
at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.
+ * See the License for the specific language governing permissions and
limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.evaluation.xml;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import org.apache.commons.lang.math.RandomUtils;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.piggybank.evaluation.xml.XPathAll;
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class XPathAllTest {
+
+ @Test
+ public void testExecTuple() throws Exception {
+
+ final XPathAll xpath = new XPathAll();
+
+ final Tuple tuple = mock(Tuple.class);
+
+ when(tuple.get(0)).thenReturn(
+ "<book id=\"bk101\">"
+ + "<author>Gambardella,
Matthew</author>"
+ + "<title>XML Developer's
Guide</title>"
+ + "<genre>Computer</genre>"
+ + "<price>44.95</price>"
+ +
"<publish_date>2000-10-01</publish_date>"
+ + "<description>An in-depth
look at creating applications with XML.</description>"
+ + "</book>");
+
+ when(tuple.size()).thenReturn(2);
+ when(tuple.get(1)).thenReturn("book");
+
+ assertEquals(
+ "Gambardella, Matthew, XML Developer's Guide, Computer, 44.95,
2000-10-01, An in-depth look at creating applications with XML.",
+ xpath.exec(tuple).get(0));
+ }
+
+ @Test
+ public void testExecTupleWithInnerNodes() throws Exception {
+
+ final XPathAll xpath = new XPathAll();
+
+ final Tuple tuple = mock(Tuple.class);
+
+ when(tuple.get(0)).thenReturn(
+ "<book id=\"bk101\">"
+ + "<authors>"
+ + "<author_1>Gambardella</author_1>"
+ + "<author_2>Matthew</author_2>"
+ + "<author_2>Mike</author_2>"
+ + "</authors>"
+ + "<title>XML Developer's Guide</title>"
+ + "<genre>Computer</genre>"
+ + "<price>44.95</price>"
+ + "<publish_date>2000-10-01</publish_date>"
+ + "<description>An in-depth look at creating
applications with XML.</description>"
+ + "</book>");
+
+ when(tuple.size()).thenReturn(2);
+ when(tuple.get(1)).thenReturn("book/authors");
+
+ assertEquals("Gambardella, Matthew, Mike", xpath.exec(tuple).get(0));
+ }
+
+ @Test
+ public void testExecTupleWithMultipleParentNodes() throws Exception {
+
+ final XPathAll xpath = new XPathAll();
+
+ final Tuple tuple = mock(Tuple.class);
+
+ when(tuple.get(0)).thenReturn(
+ "<bookstore>"
+ + "<book id=\"bk101\">"
+ + "<authors>"
+ + "<author_1>Gambardella</author_1>"
+ + "<author_2>Matthew</author_2>"
+ + "<author_3>Mike</author_3>"
+ + "</authors>"
+ + "<title>XML Developer's Guide</title>"
+ + "<genre>Computer</genre>"
+ + "<price>44.95</price>"
+ + "<publish_date>2000-10-01</publish_date>"
+ + "<description>An in-depth look at creating
applications with XML.</description>"
+ + "</book>"
+ + "<book id=\"bk102\">"
+ + "<authors>"
+ + "<author_1>Kent</author_1>"
+ + "<author_2>Beck</author_2>"
+ + "</authors>"
+ + "<title>HTML Developer's</title>"
+ + "<genre>Computer</genre>"
+ + "<price>60.00</price>"
+ + "<publish_date>2000-10-01</publish_date>"
+ + "<description>An in-depth look at creating applications
with HTML.</description>"
+ + "</book>"
+ + "</bookstore>");
+
+ when(tuple.size()).thenReturn(4);
+ when(tuple.get(2)).thenReturn(true);
+ when(tuple.get(3)).thenReturn(true);
+
+ when(tuple.get(1)).thenReturn("bookstore/book");
+ assertEquals(2, xpath.exec(tuple).getAll().size());
+ assertEquals(
+ ", XML Developer's Guide, Computer, 44.95, 2000-10-01, An
in-depth look at creating applications with XML.",
+ xpath.exec(tuple).get(0));
+ assertEquals(
+ ", HTML Developer's, Computer, 60.00, 2000-10-01, An in-depth
look at creating applications with HTML.",
+ xpath.exec(tuple).get(1));
+
+ when(tuple.get(1)).thenReturn("bookstore/book/authors");
+ assertEquals(2, xpath.exec(tuple).getAll().size());
+ assertEquals("Gambardella, Matthew, Mike", xpath.exec(tuple).get(0));
+ assertEquals("Kent, Beck", xpath.exec(tuple).get(1));
+
+ }
+
+ @Test
+ public void testRepeatingCallWithSameXml() throws Exception {
+
+ final XPathAll xpath = new XPathAll();
+
+ final Tuple tuple = mock(Tuple.class);
+
+ when(tuple.get(0)).thenReturn(
+ "<book id=\"bk101\">"
+ + "<author>Gambardella, Matthew</author>"
+ + "<title>XML Developer's Guide</title>"
+ + "<genre>Computer</genre>"
+ + "<price>44.95</price>"
+ + "<publish_date>2000-10-01</publish_date>"
+ + "<description>An in-depth look at creating
applications with XML.</description>"
+ + "</book>");
+
+ when(tuple.size()).thenReturn(2);
+
+ when(tuple.get(1)).thenReturn("book/author");
+ assertEquals("Gambardella, Matthew", xpath.exec(tuple).get(0));
+ assertNotEquals("Someone else", xpath.exec(tuple).get(0));
+
+ when(tuple.get(1)).thenReturn("book/price");
+ assertEquals("44.95", xpath.exec(tuple).get(0));
+ assertNotEquals("00.00", xpath.exec(tuple).get(0));
+
+ when(tuple.get(1)).thenReturn("book/genre");
+ assertEquals("Computer", xpath.exec(tuple).get(0));
+ assertNotEquals("Sometihng else", xpath.exec(tuple).get(0));
+ }
+
+ @Test
+ public void testCacheFlag() throws Exception {
+
+ final XPathAll xpath = new XPathAll();
+
+ final Tuple tuple = mock(Tuple.class);
+
+ when(tuple.get(0)).thenReturn(
+ "<book id=\"bk101\">" + "<author>Gambardella, Matthew</author>"
+ + "<title>XML Developer's Guide</title>" +
"<genre>Computer</genre>" + "<price>44.95</price>"
+ + "<publish_date>2000-10-01</publish_date>"
+ + "<description>An in-depth look at creating
applications with XML.</description>" + "</book>");
+
+ when(tuple.size()).thenReturn(3);
+
+ // cache on
+ when(tuple.get(2)).thenReturn(true);
+
+ when(tuple.get(1)).thenReturn("book/author");
+ assertEquals("Gambardella, Matthew", xpath.exec(tuple).get(0));
+ assertNotEquals("Someone else", xpath.exec(tuple).get(0));
+
+ when(tuple.get(1)).thenReturn("book/price");
+ assertEquals("44.95", xpath.exec(tuple).get(0));
+ assertNotEquals("00.00", xpath.exec(tuple).get(0));
+
+ // cache off
+ when(tuple.get(2)).thenReturn(false);
+
+ when(tuple.get(1)).thenReturn("book/author");
+ assertEquals("Gambardella, Matthew", xpath.exec(tuple).get(0));
+ assertNotEquals("Someone else", xpath.exec(tuple).get(0));
+
+ when(tuple.get(1)).thenReturn("book/price");
+ assertEquals("44.95", xpath.exec(tuple).get(0));
+ assertNotEquals("00.00", xpath.exec(tuple).get(0));
+
+ }
+
+ @Ignore
+ @Test
+ public void testCacheBenefit() throws Exception {
+
+ final XPathAll xpath = new XPathAll();
+
+ // should be a live instance this time
+ final Tuple tuple = TupleFactory.getInstance().newTuple(3);
+
+ // cache on
+ tuple.set(2, true);
+ final long withCache = timeTheUDF(tuple, xpath);
+
+ // cache off
+ tuple.set(2, false);
+ final long withOutCache = timeTheUDF(tuple, xpath);
+
+ System.out.println(withCache + "\t" + withOutCache);
+
+ assertTrue(withCache < withOutCache);
+
+ }
+
+ @Test
+ public void testExecTupleWithSimpleNamespace() throws Exception {
+
+ final XPathAll xpath = new XPathAll();
+
+ final Tuple tuple = mock(Tuple.class);
+
+ when(tuple.get(0)).thenReturn(
+ "<ann:book id=\"bk101\">"
+ + "<author>Gambardella, Matthew</author>"
+ + "<title>XML Developer's Guide</title>"
+ + "<genre>Computer</genre>"
+ + "<price>44.95</price>"
+ + "<publish_date>2000-10-01</publish_date>"
+ + "<description>An in-depth look at creating
applications with XML.</description>"
+ + "</ann:book>");
+
+ when(tuple.size()).thenReturn(4);
+ when(tuple.get(2)).thenReturn(true);
+ when(tuple.get(3)).thenReturn(true);
+
+ when(tuple.get(1)).thenReturn("book");
+ assertEquals(1, xpath.exec(tuple).getAll().size());
+ assertEquals(
+ "Gambardella, Matthew, XML Developer's Guide, Computer, 44.95,
2000-10-01, An in-depth look at creating applications with XML.",
+ xpath.exec(tuple).get(0));
+
+ }
+
+ @Test
+ public void testExecTupleWithElementNodeWithComplexNameSpace() throws
Exception {
+
+ final XPathAll xpath = new XPathAll();
+
+ final Tuple tuple = mock(Tuple.class);
+
+ when(tuple.get(0)).thenReturn(
+
+ "<cbs:bookstore>"
+ +"<cbs:book>"
+ + "<bsbi:authors>"
+ + "<bsbi:author_1>Gambardella</bsbi:author_1>"
+ + "<bsbi:author_2>Matthew</bsbi:author_2>"
+ + "<bsbi:author_3>Mike</bsbi:author_3>"
+ + "</bsbi:authors>"
+ + "<bsbi:title>23</bsbi:title>"
+ + "<bsbi:genre>semiAutomatic</bsbi:genre>"
+ + "<bsbi:price>enabled</bsbi:price>"
+ +
"<bsbi:publish_date>leftToRight</bsbi:publish_date>"
+ + "<bsbi:description>282</bsbi:description>"
+ + "<bsbi:reviews>"
+ + "<review_1>4 stars</review_1>"
+ + "<review_2>3.5 stars</review_2>"
+ + "<review_3>4 stars</review_3>"
+ + "<review_4>4.2 stars</review_4>"
+ + "<review_5>3.5 stars</review_5>"
+ + "</bsbi:reviews>"
+ + "</cbs:book>"
+ + "<cbs:book>"
+ + "<bsbi:authors>"
+ + "<bsbi:author_1>O'Brien</bsbi:author_1>"
+ + "<bsbi:author_2>Tim</bsbi:author_2>"
+ + "</bsbi:authors>"
+ + "<bsbi:title>23</bsbi:title>"
+ + "<bsbi:genre>semiAutomatic</bsbi:genre>"
+ + "<bsbi:price>enabled</bsbi:price>"
+ + "<bsbi:publish_date>leftToRight</bsbi:publish_date>"
+ + "<bsbi:description>282</bsbi:description>"
+ + "<bsbi:reviews>"
+ + "<bsbi:review_1>3.5 stars</bsbi:review_1>"
+ + "<bsbi:review_2>4 stars</bsbi:review_2>"
+ + "<bsbi:review_3>3.5 stars</bsbi:review_3>"
+ + "<bsbi:review_4>4.2 stars</bsbi:review_4>"
+ + "<bsbi:review_5>4 stars</bsbi:review_5>"
+
+ + "</bsbi:reviews>"
+ + "</cbs:book></cbs:bookstore>");
+
+ when(tuple.size()).thenReturn(4);
+ when(tuple.get(2)).thenReturn(true);
+ when(tuple.get(3)).thenReturn(true);
+
+ when(tuple.get(1)).thenReturn("bookstore/book/authors");
+ assertEquals(2, xpath.exec(tuple).getAll().size());
+ assertEquals("Gambardella, Matthew, Mike", xpath.exec(tuple).get(0));
+ assertEquals("O'Brien, Tim", xpath.exec(tuple).get(1));
+
+ when(tuple.get(1)).thenReturn("bookstore/book/reviews");
+ assertEquals(2, xpath.exec(tuple).getAll().size());
+ assertEquals("4 stars, 3.5 stars, 4 stars, 4.2 stars, 3.5 stars",
xpath.exec(tuple).get(0));
+ assertEquals("3.5 stars, 4 stars, 3.5 stars, 4.2 stars, 4 stars",
xpath.exec(tuple).get(1));
+
+ }
+
+ private long timeTheUDF(final Tuple tuple, final XPathAll xpath) throws
Exception {
+
+ final long start = System.currentTimeMillis();
+
+ for (int i = 0; i < 50000; i++) {
+
+ tuple.set(0, "<book id=\"bk101"
+ + i
+ + "\">"
+ + // we need to make sure xml changes
+ "<author>Gambardella, Matthew</author>" + "<title>XML
Developer's Guide</title>"
+ + "<genre>Computer</genre>" + expandXml() +
"<price>44.95</price>"
+ + "<publish_date>2000-10-01</publish_date>"
+ + "<description>An in-depth look at creating applications
with XML.</description>" + "</book>");
+
+ // caching is used here. for 2nd and 3rd calls to xpath.exec, the
+ // cached javax.xml.xpath.XPath should help
+ tuple.set(1, "book/author");
+ assertEquals("Gambardella, Matthew", xpath.exec(tuple).get(0));
+
+ tuple.set(1, "book/price");
+ assertEquals("44.95", xpath.exec(tuple).get(0));
+
+ tuple.set(1, "book/publish_date");
+ assertEquals("2000-10-01", xpath.exec(tuple).get(0));
+ }
+
+ return System.currentTimeMillis() - start;
+ }
+
+ private String expandXml() {
+
+ final StringBuilder sb = new StringBuilder();
+
+ final int max = RandomUtils.nextInt(100);
+
+ for (int i = 0; i < max; i++) {
+ sb.append("<expansion>This is an expansion of the xml to simulate
random sized xml" + i + "</expansion>");
+ }
+
+ return sb.toString();
+ }
+}
Modified:
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathTest.java
URL:
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathTest.java?rev=1652238&r1=1652237&r2=1652238&view=diff
==============================================================================
---
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathTest.java
(original)
+++
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathTest.java
Thu Jan 15 19:18:34 2015
@@ -21,6 +21,7 @@ import org.apache.commons.lang.math.Rand
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.piggybank.evaluation.xml.XPath;
+import org.junit.Ignore;
import org.junit.Test;
public class XPathTest {
@@ -121,8 +122,97 @@ public class XPathTest {
}
+ @Test
+ public void testExecTupleWithNamespace() throws Exception {
+
+ final XPath xpath = new XPath();
+
+ final Tuple tuple = mock(Tuple.class);
+
+ when(tuple.get(0)).thenReturn(
+ "<ann:book id=\"bk101\">" + "<author>Gambardella, Matthew</author>"
+ + "<title>XML Developer's Guide</title>" + "<genre>Computer</genre>" + "<price>44.95</price>"
+ + "<publish_date>2000-10-01</publish_date>"
+ + "<description>An in-depth look at creating applications with XML.</description>"
+ + "</ann:book>");
+
+ when(tuple.size()).thenReturn(4);
+ when(tuple.get(2)).thenReturn(true);
+ when(tuple.get(3)).thenReturn(true);
+
+ when(tuple.get(1)).thenReturn("book/author");
+ assertEquals("Gambardella, Matthew", xpath.exec(tuple));
+ assertNotEquals("Someone else", xpath.exec(tuple));
+
+ when(tuple.get(1)).thenReturn("book/price");
+ assertEquals("44.95", xpath.exec(tuple));
+ assertNotEquals("00.00", xpath.exec(tuple));
+
+ }
+
+ @Test
+ public void testExecTupleWithElementNodeWithComplexNameSpace() throws Exception {
+
+ final XPath xpath = new XPath();
+
+ final Tuple tuple = mock(Tuple.class);
+
+ when(tuple.get(0)).thenReturn(
+
+ "<cbs:bookstore>"
+ +"<cbs:book>"
+ + "<bsbi:authors>"
+ + "<bsbi:author_1>Gambardella</bsbi:author_1>"
+ + "<bsbi:author_2>Matthew</bsbi:author_2>"
+ + "<bsbi:author_3>Mike</bsbi:author_3>"
+ + "</bsbi:authors>"
+ + "<bsbi:title>23</bsbi:title>"
+ + "<bsbi:genre>semiAutomatic</bsbi:genre>"
+ + "<bsbi:price>enabled</bsbi:price>"
+ + "<bsbi:publish_date>leftToRight</bsbi:publish_date>"
+ + "<bsbi:description>282</bsbi:description>"
+ + "<bsbi:reviews>"
+ + "<review_1>4 stars</review_1>"
+ + "<review_2>3.5 stars</review_2>"
+ + "<review_3>4 stars</review_3>"
+ + "<review_4>4.2 stars</review_4>"
+ + "<review_5>3.5 stars</review_5>"
+ + "</bsbi:reviews>"
+ + "</cbs:book>"
+ + "<cbs:book>"
+ + "<bsbi:authors>"
+ + "<bsbi:author_1>O'Brien</bsbi:author_1>"
+ + "<bsbi:author_2>Tim</bsbi:author_2>"
+ + "</bsbi:authors>"
+ + "<bsbi:title>23</bsbi:title>"
+ + "<bsbi:genre>semiAutomatic</bsbi:genre>"
+ + "<bsbi:price>enabled</bsbi:price>"
+ + "<bsbi:publish_date>leftToRight</bsbi:publish_date>"
+ + "<bsbi:description>282</bsbi:description>"
+ + "<bsbi:reviews>"
+ + "<bsbi:review_1>3.5 stars</bsbi:review_1>"
+ + "<bsbi:review_2>4 stars</bsbi:review_2>"
+ + "<bsbi:review_3>3.5 stars</bsbi:review_3>"
+ + "<bsbi:review_4>4.2 stars</bsbi:review_4>"
+ + "<bsbi:review_5>4 stars</bsbi:review_5>"
+
+ + "</bsbi:reviews>"
+ + "</cbs:book></cbs:bookstore>");
+
+ when(tuple.size()).thenReturn(4);
+ when(tuple.get(2)).thenReturn(true);
+ when(tuple.get(3)).thenReturn(true);
+
+ when(tuple.get(1)).thenReturn("bookstore/book/authors");
+ assertEquals("GambardellaMatthewMike", xpath.exec(tuple));
+
+ when(tuple.get(1)).thenReturn("bookstore/book/reviews");
+ assertEquals("4 stars3.5 stars4 stars4.2 stars3.5 stars", xpath.exec(tuple));
+
+ }
- //@Test --optional test
+ @Ignore //--optional test
+ @Test
public void testCacheBenefit() throws Exception{
final XPath xpath = new XPath();