Author: daijy
Date: Thu Jan 15 19:18:34 2015
New Revision: 1652238

URL: http://svn.apache.org/r1652238
Log:
PIG-4355: Piggybank: XPath cant handle namespace in xpath, nor can it return 
more than one match

Added:
    
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPathAll.java
    
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathAllTest.java
Modified:
    pig/trunk/CHANGES.txt
    
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPath.java
    
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathTest.java

Modified: pig/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1652238&r1=1652237&r2=1652238&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Thu Jan 15 19:18:34 2015
@@ -42,6 +42,9 @@ PIG-4333: Split BigData tests into multi
  
 BUG FIXES
 
+PIG-4355: Piggybank: XPath cant handle namespace in xpath, nor can it return 
more than one match
+ (cavanaug via daijy)
+
 PIG-4371: Duplicate snappy.version in libraries.properties (daijy)
 
 PIG-4368: Port local mode tests to Tez - TestLoadStoreFuncLifeCycle (daijy)

Modified: 
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPath.java
URL: 
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPath.java?rev=1652238&r1=1652237&r2=1652238&view=diff
==============================================================================
--- 
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPath.java
 (original)
+++ 
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPath.java
 Thu Jan 15 19:18:34 2015
@@ -38,113 +38,203 @@ import org.xml.sax.InputSource;
  */
 public class XPath extends EvalFunc<String> {
 
-    /** Hold onto last xpath & xml in case the next call to xpath() is feeding 
the same xml document
-     * The reason for this is because creating an xpath object is costly. */
+    /**
+     * Hold onto last xpath & xml in case the next call to xpath() is feeding
+     * the same xml document. The reason for this is that creating an xpath
+     * object is costly.
+     */
     private javax.xml.xpath.XPath xpath = null;
     private String xml = null;
     private Document document;
     
     private static boolean cache = true;
+    private static boolean ignoreNamespace = true;
+    public static final String EMPTY_STRING = "";
     
     /**
-     * input should contain: 1) xml 2) xpath 3) optional cache xml doc flag
+     * input should contain: 1) xml 2) xpath 
+     *                       3) optional cache xml doc flag 
+     *                       4) optional ignore namespace flag
      * 
      * Usage:
      * 1) XPath(xml, xpath)
      * 2) XPath(xml, xpath, false) 
+     * 3) XPath(xml, xpath, false, false)
      * 
-     * @param 1st element should to be the xml
+     * @param input
+     *                   1st element should be the xml
      *        2nd element should be the xpath
      *        3rd optional boolean cache flag (default true)
+     *        4th optional boolean ignore namespace flag (default true)
+     *        
+     * 
+     *        This UDF will cache the last xml document. This is helpful when
+     *        multiple consecutive xpath calls are made for the same xml
+     *        document. Caching can be turned off to ensure that the UDF
+     *        recreates the internal javax.xml.xpath.XPath for every call.
      *        
-     * This UDF will cache the last xml document. This is helpful when 
multiple consecutive xpath calls are made for the same xml document.
-     * Caching can be turned off to ensure that the UDF's recreates the 
internal javax.xml.xpath.XPath for every call
+     *        This UDF will also support ignoring the namespace in the xml 
tags.
+     *        This will help to search xpath items by ignoring its namespace.
+     *        Ignoring of the namespace can be turned off for special cases 
using
+     *        a fourth argument in the UDF. 
      * 
      * @return chararrary result or null if no match
      */
     @Override
     public String exec(final Tuple input) throws IOException {
 
-        if (input == null || input.size() <= 1) {
-            warn("Error processing input, not enough parameters or null input" 
+ input,
-                    PigWarning.UDF_WARNING_1);
-            return null;
-        }
-
-
-        if (input.size() > 3) {
-            warn("Error processing input, too many parameters" + input,
-                    PigWarning.UDF_WARNING_1);
+        if (!isArgsValid(input)) { // Validate arguments
             return null;
         }
 
         try {
 
             final String xml = (String) input.get(0);
+
             if (xml == null) {
                 return null;
             }
             
             if(input.size() > 2)
                 cache = (Boolean) input.get(2);
-            
-            if(!cache || xpath == null || !xml.equals(this.xml))
-            {
+
+            if (!cache || xpath == null || !xml.equals(this.xml)) {
                 final InputSource source = new InputSource(new 
StringReader(xml));
-                
-                this.xml = xml; //track the xml for subsequent calls to this 
udf
+
+                this.xml = xml; // track the xml for subsequent calls to this 
udf
 
                 final DocumentBuilderFactory dbf = 
DocumentBuilderFactory.newInstance();
                 final DocumentBuilder db = dbf.newDocumentBuilder();
-                
+
                 this.document = db.parse(source);
 
                 final XPathFactory xpathFactory = XPathFactory.newInstance();
 
                 this.xpath = xpathFactory.newXPath();
-                
+
+            }
+
+            String xpathString = (String) input.get(1);
+
+            if (ignoreNamespace) {
+                xpathString = createNameSpaceIgnoreXpathString(xpathString);
             }
-            
-            final String xpathString = (String) input.get(1);
 
             final String value = xpath.evaluate(xpathString, document);
 
             return value;
 
         } catch (Exception e) {
-            warn("Error processing input " + input.getType(0), 
-                    PigWarning.UDF_WARNING_1);
-            
+            warn("Error processing input " + input.getType(0), 
PigWarning.UDF_WARNING_1);
+
             return null;
         }
     }
+    
+    /**
+     * Validates values of the input parameters.
+     * 
+     * @param Tuple
+     * @return boolean
+     */
+    private boolean isArgsValid(final Tuple input) {
+        if (input == null || input.size() <= 1) {
+            warn("Error processing input, not enough parameters or null input" 
+ input, PigWarning.UDF_WARNING_1);
+            return false;
+        }
+
+        if (input.size() > 4) {
+            warn("Error processing input, too many parameters" + input, 
PigWarning.UDF_WARNING_1);
+            return false;
+        }
+
+        try {
+            // 3rd Parameter - CACHE
+            if (input.size() > 2 && !(input.get(2) instanceof Boolean)) { 
+                warn("Error processing input, invalid value in 3rd parameter" 
+ input, PigWarning.UDF_WARNING_1);
+                return false;
+            }
+
+            // 4th Parameter - IGNORE_NAMESPACE
+            if (input.size() > 3 && !(input.get(3) instanceof Boolean)) {
+                warn("Error processing input, invalid value in 4th parameter" 
+ input, PigWarning.UDF_WARNING_1);
+                return false;
+            }
+        } catch (Exception ex) {
+            return false;
+        }
+        return true;
+    }
+    
+    
+    /**
+     * Returns a new xPathString that rewrites each "/"-separated step of the
+     * existing xPathString as //*[local-name()='step'] to ignore namespaces.
+     * 
+     * @param String xpathString
+     * @return String modified xpathString
+     */
+    private String createNameSpaceIgnoreXpathString(final String xpathString) {
+        final String QUERY_PREFIX = "//*";
+        final String LOCAL_PREFIX = "[local-name()='";
+        final String LOCAL_POSTFIX = "']";
+        final String SPLITTER = "/";
+
+        try {
+            String xpathStringWithLocalName = EMPTY_STRING;
+            String[] individualNodes = xpathString.split(SPLITTER);
+
+            for (String node : individualNodes) {
+                xpathStringWithLocalName = 
xpathStringWithLocalName.concat(QUERY_PREFIX + LOCAL_PREFIX + node
+                        + LOCAL_POSTFIX);
+            }
+            return xpathStringWithLocalName;
+        } catch (Exception ex) {
+            return xpathString;
+        }
+    }
+
+    /**
+     * Returns argument schemas of the UDF.
+     * 
+     * @return List
+     */
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
 
-       @Override
-       public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+        final List<FuncSpec> funcList = new ArrayList<FuncSpec>();
 
-               final List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+        /* either two chararray arguments */
+        List<FieldSchema> fields = new ArrayList<FieldSchema>();
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
 
-               /*either two chararray arguments*/
-               List<FieldSchema> fields = new ArrayList<FieldSchema>();
-               fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
-               fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        Schema twoArgInSchema = new Schema(fields);
 
-               Schema twoArgInSchema = new Schema(fields);
+        funcList.add(new FuncSpec(this.getClass().getName(), twoArgInSchema));
 
-               funcList.add(new FuncSpec(this.getClass().getName(), 
twoArgInSchema));
+        /* or two chararray and a boolean argument */
+        fields = new ArrayList<FieldSchema>();
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
 
-               /*or two chararray and a boolean argument*/
-               fields = new ArrayList<FieldSchema>();
-               fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
-               fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
-               fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
+        Schema threeArgInSchema = new Schema(fields);
 
-               Schema threeArgInSchema = new Schema(fields);
+        funcList.add(new FuncSpec(this.getClass().getName(), 
threeArgInSchema));
 
-               funcList.add(new FuncSpec(this.getClass().getName(), 
threeArgInSchema));
+        /* or two chararray and two boolean arguments */
+        fields = new ArrayList<FieldSchema>();
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
+        fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
 
-               return funcList;
-       }
+        Schema fourArgInSchema = new Schema(fields);
 
-}
+        funcList.add(new FuncSpec(this.getClass().getName(), fourArgInSchema));
 
+        return funcList;
+    }
+
+}

Added: 
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPathAll.java
URL: 
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPathAll.java?rev=1652238&view=auto
==============================================================================
--- 
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPathAll.java
 (added)
+++ 
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/evaluation/xml/XPathAll.java
 Thu Jan 15 19:18:34 2015
@@ -0,0 +1,319 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more 
contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding 
copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the 
"License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License 
at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software 
distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
limitations under the License.
+ */
+
+package org.apache.pig.piggybank.evaluation.xml;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathFactory;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.FuncSpec;
+import org.apache.pig.PigWarning;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+/**
+ * XPathAll is a function that allows for Tuple extraction from xml
+ */
+public class XPathAll extends EvalFunc<Tuple> {
+
+    private javax.xml.xpath.XPath xmlPath = null;
+    private String xml = null;
+    private Document document;
+
+    /**
+     * Caching of the xpath & xml in case the next call to xpath() is feeding
+     * the same xml document. The reason for this is that creating an xpath
+     * object is costly.
+     */
+    private static boolean cache = true;
+    private static boolean ignoreNamespace = true;
+
+    private static TupleFactory tupleFactory = TupleFactory.getInstance();
+
+    public static enum ARGUMENTS {
+        XML_FILE(0), XPATH(1), CACHE(2), IGNORE_NAMESPACE(3);
+
+        private int argument;
+
+        ARGUMENTS(int argument) {
+            this.argument = argument;
+        }
+
+        int getPosition() {
+            return this.argument;
+        }
+    }
+
+    public static final String EMPTY_STRING = "";
+
+    /**
+     * input should contain: 1) xml 2) xpath 3) optional cache xml doc flag 4)
+     * optional ignore namespace flag
+     * 
+     * The optional fourth parameter (IGNORE_NAMESPACE), if set to true, will
+     * remove the namespace from the xPath. For example, the xpath
+     * /html:body/html:div will be considered as /body/div.
+     * 
+     * Usage:  1) XPathAll(xml, xpath) 
+     *                         2) XPathAll(xml, xpath, false)
+     *                         3) XPathAll(xml, xpath, false, false)
+     * 
+     * @param input
+     *        1st element should be the xml, 2nd element should be the xpath
+     *        3rd optional boolean cache flag (default true) 
+     *        4th optional boolean ignore namespace flag(default true)
+     * 
+     *        This UDF will cache the last xml document. This is helpful when
+     *        multiple consecutive xpathAll calls are made for the same xml
+     *        document. Caching can be turned off to ensure that the UDF
+     *        recreates the internal javax.xml.xpath.XPath for every call.
+     *        
+     *        This UDF will also support ignoring the namespace in the xml 
tags.
+     *        This will help to search xpath items by ignoring its namespace.
+     *        Ignoring of the namespace can be turned off for special cases 
using
+     *        a fourth argument in the UDF. 
+     *        
+     * 
+     * @return Tuple result or null if no match
+     */
+    @Override
+    public Tuple exec(final Tuple input) throws IOException {
+
+        if (!isArgsValid(input)) { // Validate arguments
+            return null;
+        }
+
+        try {
+               
+            final String xml = (String) 
input.get(ARGUMENTS.XML_FILE.getPosition());
+            if (xml == null) {
+                warn("Error processing input, invalid parameter" + input, 
PigWarning.UDF_WARNING_1);
+                return null;
+            }
+
+            if (input.size() > 2) {
+                cache = (Boolean) input.get(ARGUMENTS.CACHE.getPosition());
+            }
+
+            if (input.size() > 3) {
+                ignoreNamespace = (Boolean) 
input.get(ARGUMENTS.IGNORE_NAMESPACE.getPosition());
+            }
+
+            // Process XML
+            if (!cache || xmlPath == null || !xml.equals(this.xml)) { // Cache 
verification
+                final InputSource source = new InputSource(new 
StringReader(xml));
+
+                this.xml = xml; // track the xml for subsequent calls to this 
udf
+
+                final DocumentBuilderFactory dbf = 
DocumentBuilderFactory.newInstance();
+                final DocumentBuilder db = dbf.newDocumentBuilder();
+
+                this.document = db.parse(source);
+
+                final XPathFactory xpathFactory = XPathFactory.newInstance();
+                this.xmlPath = xpathFactory.newXPath();
+            }
+
+            String xpathString = (String) 
input.get(ARGUMENTS.XPATH.getPosition());
+
+            if (ignoreNamespace) {
+                xpathString = createNameSpaceIgnoreXpathString(xpathString);
+            }
+
+            final NodeList nodeEntries = (NodeList) 
xmlPath.compile(xpathString).evaluate(document,
+                    XPathConstants.NODESET);
+
+            if (nodeEntries == null) {
+                return null;
+            }
+
+            Tuple resultTuple = tupleFactory.newTuple(nodeEntries.getLength());
+
+            for (int nodeEntryIndex = 0; nodeEntryIndex < 
nodeEntries.getLength(); nodeEntryIndex++) {
+                final String ELEMENT_NODE_SEPARATOR = ", ";
+
+                Node node = nodeEntries.item(nodeEntryIndex);
+
+                // Parse the Node
+                final NodeList childNodes = node.getChildNodes();
+                if (childNodes == null) {
+                    continue;
+                }
+
+                String nodeData = "";
+                boolean dataFlag = false;
+
+                for (int i = 0; i < childNodes.getLength(); i++) {
+                    try {
+                        Node subNode = childNodes.item(i);
+                        if (subNode.getNodeType() == Node.ELEMENT_NODE) {
+                            if (subNode.getFirstChild().getNodeValue() == 
null) {
+                                // If There is no direct element, return blank
+                                nodeData = 
nodeData.concat(ELEMENT_NODE_SEPARATOR);
+                                nodeData = nodeData.concat(EMPTY_STRING);
+                                dataFlag = true;
+                                continue;
+                            }
+                            nodeData = nodeData.concat(ELEMENT_NODE_SEPARATOR);
+                            nodeData = 
nodeData.concat(subNode.getFirstChild().getNodeValue());
+                            dataFlag = true;
+                        } else if (subNode.getNodeType() == Node.TEXT_NODE
+                                || subNode.getNodeType() == 
Node.ATTRIBUTE_NODE) {
+                            nodeData = nodeData.concat(ELEMENT_NODE_SEPARATOR);
+                            nodeData = nodeData.concat(subNode.getNodeValue());
+                            dataFlag = true;
+                        }
+                    } catch (Exception ex) {
+                        continue;
+                    }
+                }
+
+                if (dataFlag) {
+                    nodeData = nodeData.replaceFirst(ELEMENT_NODE_SEPARATOR, 
EMPTY_STRING);
+                    resultTuple.set(nodeEntryIndex, nodeData);
+                }
+            }
+            return resultTuple;
+
+        } catch (Exception e) {
+            warn("Error processing input " + input.getType(0), 
PigWarning.UDF_WARNING_1);
+            return null;
+        }
+    }
+    
+    
+    /**
+     * Validates values of the input parameters.
+     * 
+     * @param Tuple
+     * @return boolean
+     */
+    private boolean isArgsValid(final Tuple input) {
+        if (input == null || input.size() <= 1) {
+            warn("Error processing input, not enough parameters or null input" 
+ input, PigWarning.UDF_WARNING_1);
+            return false;
+        }
+
+        if (input.size() > 4) {
+            warn("Error processing input, too many parameters" + input, 
PigWarning.UDF_WARNING_1);
+            return false;
+        }
+
+        try {
+            // 3rd Parameter - CACHE
+            if (input.size() > 2 && !(input.get(ARGUMENTS.CACHE.getPosition()) 
instanceof Boolean)) {
+                warn("Error processing input, invalid value in 3rd parameter" 
+ input, PigWarning.UDF_WARNING_1);
+                return false;
+            }
+
+            // 4th Parameter - IGNORE_NAMESPACE
+            if (input.size() > 3 && 
!(input.get(ARGUMENTS.IGNORE_NAMESPACE.getPosition()) instanceof Boolean)) {
+                warn("Error processing input, invalid value in 4th parameter" 
+ input, PigWarning.UDF_WARNING_1);
+                return false;
+            }
+        } catch (Exception ex) {
+            return false;
+        }
+
+        return true;
+    }
+
+    /**
+     * Returns a new xPathString that rewrites each "/"-separated step of the
+     * existing xPathString as //*[local-name()='step'] to ignore namespaces.
+     * 
+     * @param String xpathString
+     * @return String modified xpathString
+     */
+    private String createNameSpaceIgnoreXpathString(final String xpathString) {
+        final String QUERY_PREFIX = "//*";
+        final String LOCAL_PREFIX = "[local-name()='";
+        final String LOCAL_POSTFIX = "']";
+        final String SPLITTER = "/";
+
+        try {
+            String xpathStringWithLocalName = EMPTY_STRING;
+            String[] individualNodes = xpathString.split(SPLITTER);
+
+            for (String node : individualNodes) {
+                xpathStringWithLocalName = 
xpathStringWithLocalName.concat(QUERY_PREFIX + LOCAL_PREFIX + node
+                        + LOCAL_POSTFIX);
+            }
+
+            return xpathStringWithLocalName;
+        } catch (Exception ex) {
+            return xpathString;
+        }
+    }
+
+    /**
+     * Returns argument schemas of the UDF.
+     * 
+     * @return List
+     */
+    
+    @Override
+    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
+
+        final List<FuncSpec> funcList = new ArrayList<FuncSpec>();
+
+        /* either two chararray arguments */
+        List<FieldSchema> fields = new ArrayList<FieldSchema>();
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+
+        Schema twoArgInSchema = new Schema(fields);
+
+        funcList.add(new FuncSpec(this.getClass().getName(), twoArgInSchema));
+
+        /* or two chararray and a boolean argument */
+        fields = new ArrayList<FieldSchema>();
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
+
+        Schema threeArgInSchema = new Schema(fields);
+
+        funcList.add(new FuncSpec(this.getClass().getName(), 
threeArgInSchema));
+
+        /* or two chararray and two boolean arguments */
+        fields = new ArrayList<FieldSchema>();
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        fields.add(new Schema.FieldSchema(null, DataType.CHARARRAY));
+        fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
+        fields.add(new Schema.FieldSchema(null, DataType.BOOLEAN));
+
+        Schema fourArgInSchema = new Schema(fields);
+
+        funcList.add(new FuncSpec(this.getClass().getName(), fourArgInSchema));
+
+        return funcList;
+    }
+
+}

Added: 
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathAllTest.java
URL: 
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathAllTest.java?rev=1652238&view=auto
==============================================================================
--- 
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathAllTest.java
 (added)
+++ 
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathAllTest.java
 Thu Jan 15 19:18:34 2015
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
+ * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.pig.piggybank.test.evaluation.xml;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import org.apache.commons.lang.math.RandomUtils;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.piggybank.evaluation.xml.XPathAll;
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class XPathAllTest {
+    
+    @Test
+    public void testExecTuple() throws Exception {
+
+        final XPathAll xpath = new XPathAll();
+
+        final Tuple tuple = mock(Tuple.class);
+
+        when(tuple.get(0)).thenReturn(
+                               "<book id=\"bk101\">" 
+                                               + "<author>Gambardella, 
Matthew</author>"
+                                               + "<title>XML Developer's 
Guide</title>" 
+                                               + "<genre>Computer</genre>" 
+                                               + "<price>44.95</price>"
+                                               + 
"<publish_date>2000-10-01</publish_date>"
+                                               + "<description>An in-depth 
look at creating applications with XML.</description>" 
+                        + "</book>");
+
+        when(tuple.size()).thenReturn(2);
+        when(tuple.get(1)).thenReturn("book");
+        
+        assertEquals(
+                "Gambardella, Matthew, XML Developer's Guide, Computer, 44.95, 
2000-10-01, An in-depth look at creating applications with XML.",
+                xpath.exec(tuple).get(0));
+    }
+
+    @Test
+    public void testExecTupleWithInnerNodes() throws Exception {
+
+        final XPathAll xpath = new XPathAll();
+
+        final Tuple tuple = mock(Tuple.class);
+
+        when(tuple.get(0)).thenReturn(
+                "<book id=\"bk101\">" 
+                               + "<authors>" 
+                                       + "<author_1>Gambardella</author_1>"
+                               + "<author_2>Matthew</author_2>" 
+                                       + "<author_2>Mike</author_2>" 
+                        + "</authors>"
+                        + "<title>XML Developer's Guide</title>" 
+                        + "<genre>Computer</genre>" 
+                        + "<price>44.95</price>"
+                        + "<publish_date>2000-10-01</publish_date>"
+                        + "<description>An in-depth look at creating 
applications with XML.</description>" 
+                + "</book>");
+
+        when(tuple.size()).thenReturn(2);
+        when(tuple.get(1)).thenReturn("book/authors");
+        
+        assertEquals("Gambardella, Matthew, Mike", xpath.exec(tuple).get(0));
+    }
+
+    @Test
+    public void testExecTupleWithMultipleParentNodes() throws Exception {
+
+        final XPathAll xpath = new XPathAll();
+
+        final Tuple tuple = mock(Tuple.class);
+
+        when(tuple.get(0)).thenReturn(
+                "<bookstore>"
+                + "<book id=\"bk101\">" 
+                        + "<authors>" 
+                            + "<author_1>Gambardella</author_1>"
+                            + "<author_2>Matthew</author_2>" 
+                            + "<author_3>Mike</author_3>"
+                        + "</authors>"
+                        + "<title>XML Developer's Guide</title>" 
+                        + "<genre>Computer</genre>" 
+                        + "<price>44.95</price>"
+                        + "<publish_date>2000-10-01</publish_date>"
+                        + "<description>An in-depth look at creating 
applications with XML.</description>"
+                + "</book>"
+                + "<book id=\"bk102\">"
+                    + "<authors>"
+                        + "<author_1>Kent</author_1>"
+                        + "<author_2>Beck</author_2>" 
+                    + "</authors>"
+                    + "<title>HTML Developer's</title>"
+                    + "<genre>Computer</genre>" 
+                    + "<price>60.00</price>"
+                    + "<publish_date>2000-10-01</publish_date>"
+                    + "<description>An in-depth look at creating applications 
with HTML.</description>"
+                + "</book>"
+          + "</bookstore>");
+
+        when(tuple.size()).thenReturn(4);
+        when(tuple.get(2)).thenReturn(true);
+        when(tuple.get(3)).thenReturn(true);
+        
+        when(tuple.get(1)).thenReturn("bookstore/book");
+        assertEquals(2, xpath.exec(tuple).getAll().size());
+        assertEquals(
+             ", XML Developer's Guide, Computer, 44.95, 2000-10-01, An 
in-depth look at creating applications with XML.",
+             xpath.exec(tuple).get(0));
+        assertEquals(
+                ", HTML Developer's, Computer, 60.00, 2000-10-01, An in-depth 
look at creating applications with HTML.",
+                xpath.exec(tuple).get(1));
+        
+        when(tuple.get(1)).thenReturn("bookstore/book/authors");
+        assertEquals(2, xpath.exec(tuple).getAll().size());
+        assertEquals("Gambardella, Matthew, Mike", xpath.exec(tuple).get(0));
+        assertEquals("Kent, Beck", xpath.exec(tuple).get(1));
+     
+    }
+
+    @Test
+    public void testRepeatingCallWithSameXml() throws Exception {
+
+        final XPathAll xpath = new XPathAll();
+
+        final Tuple tuple = mock(Tuple.class);
+
+        when(tuple.get(0)).thenReturn(
+                "<book id=\"bk101\">" 
+                               + "<author>Gambardella, Matthew</author>"
+                        + "<title>XML Developer's Guide</title>" 
+                               + "<genre>Computer</genre>" 
+                        + "<price>44.95</price>"
+                        + "<publish_date>2000-10-01</publish_date>"
+                        + "<description>An in-depth look at creating 
applications with XML.</description>" 
+                + "</book>");
+
+        when(tuple.size()).thenReturn(2);
+
+        when(tuple.get(1)).thenReturn("book/author");
+        assertEquals("Gambardella, Matthew", xpath.exec(tuple).get(0));
+        assertNotEquals("Someone else", xpath.exec(tuple).get(0));
+
+        when(tuple.get(1)).thenReturn("book/price");
+        assertEquals("44.95", xpath.exec(tuple).get(0));
+        assertNotEquals("00.00", xpath.exec(tuple).get(0));
+
+        when(tuple.get(1)).thenReturn("book/genre");
+        assertEquals("Computer", xpath.exec(tuple).get(0));
+        assertNotEquals("Sometihng else", xpath.exec(tuple).get(0));
+    }
+
+    @Test
+    public void testCacheFlag() throws Exception {
+
+        final XPathAll xpath = new XPathAll();
+
+        final Tuple tuple = mock(Tuple.class);
+
+        when(tuple.get(0)).thenReturn(
+                "<book id=\"bk101\">" + "<author>Gambardella, Matthew</author>"
+                        + "<title>XML Developer's Guide</title>" + 
"<genre>Computer</genre>" + "<price>44.95</price>"
+                        + "<publish_date>2000-10-01</publish_date>"
+                        + "<description>An in-depth look at creating 
applications with XML.</description>" + "</book>");
+
+        when(tuple.size()).thenReturn(3);
+
+        // cache on
+        when(tuple.get(2)).thenReturn(true);
+
+        when(tuple.get(1)).thenReturn("book/author");
+        assertEquals("Gambardella, Matthew", xpath.exec(tuple).get(0));
+        assertNotEquals("Someone else", xpath.exec(tuple).get(0));
+
+        when(tuple.get(1)).thenReturn("book/price");
+        assertEquals("44.95", xpath.exec(tuple).get(0));
+        assertNotEquals("00.00", xpath.exec(tuple).get(0));
+
+        // cache off
+        when(tuple.get(2)).thenReturn(false);
+
+        when(tuple.get(1)).thenReturn("book/author");
+        assertEquals("Gambardella, Matthew", xpath.exec(tuple).get(0));
+        assertNotEquals("Someone else", xpath.exec(tuple).get(0));
+
+        when(tuple.get(1)).thenReturn("book/price");
+        assertEquals("44.95", xpath.exec(tuple).get(0));
+        assertNotEquals("00.00", xpath.exec(tuple).get(0));
+
+    }
+
+    @Ignore
+    @Test    
+    public void testCacheBenefit() throws Exception {
+
+        final XPathAll xpath = new XPathAll();
+
+        // should be a live instance this time
+        final Tuple tuple = TupleFactory.getInstance().newTuple(3);
+
+        // cache on
+        tuple.set(2, true);
+        final long withCache = timeTheUDF(tuple, xpath);
+
+        // cache off
+        tuple.set(2, false);
+        final long withOutCache = timeTheUDF(tuple, xpath);
+
+        System.out.println(withCache + "\t" + withOutCache);
+
+        assertTrue(withCache < withOutCache);
+
+    }
+
+    @Test
+    public void testExecTupleWithSimpleNamespace() throws Exception {
+
+        final XPathAll xpath = new XPathAll();
+
+        final Tuple tuple = mock(Tuple.class);
+
+        when(tuple.get(0)).thenReturn(
+                "<ann:book id=\"bk101\">" 
+                               + "<author>Gambardella, Matthew</author>"
+                        + "<title>XML Developer's Guide</title>" 
+                               + "<genre>Computer</genre>" 
+                        + "<price>44.95</price>"
+                        + "<publish_date>2000-10-01</publish_date>"
+                        + "<description>An in-depth look at creating 
applications with XML.</description>"
+                + "</ann:book>");
+
+        when(tuple.size()).thenReturn(4);
+        when(tuple.get(2)).thenReturn(true);
+        when(tuple.get(3)).thenReturn(true);
+        
+        when(tuple.get(1)).thenReturn("book");
+        assertEquals(1, xpath.exec(tuple).getAll().size());
+        assertEquals(
+                "Gambardella, Matthew, XML Developer's Guide, Computer, 44.95, 
2000-10-01, An in-depth look at creating applications with XML.",
+                xpath.exec(tuple).get(0));
+
+    }
+
+    @Test
+    public void testExecTupleWithElementNodeWithComplexNameSpace() throws 
Exception {
+
+        final XPathAll xpath = new XPathAll();
+
+        final Tuple tuple = mock(Tuple.class);
+
+        when(tuple.get(0)).thenReturn(
+
+                "<cbs:bookstore>"
+                        +"<cbs:book>"
+                            + "<bsbi:authors>"
+                                + "<bsbi:author_1>Gambardella</bsbi:author_1>"
+                                + "<bsbi:author_2>Matthew</bsbi:author_2>"
+                                + "<bsbi:author_3>Mike</bsbi:author_3>"
+                            + "</bsbi:authors>"
+                            + "<bsbi:title>23</bsbi:title>"
+                            + "<bsbi:genre>semiAutomatic</bsbi:genre>"
+                            + "<bsbi:price>enabled</bsbi:price>"
+                            + 
"<bsbi:publish_date>leftToRight</bsbi:publish_date>"
+                            + "<bsbi:description>282</bsbi:description>"
+                            + "<bsbi:reviews>"
+                                + "<review_1>4 stars</review_1>"
+                                + "<review_2>3.5 stars</review_2>"
+                                + "<review_3>4 stars</review_3>"
+                                + "<review_4>4.2 stars</review_4>"
+                                + "<review_5>3.5 stars</review_5>"
+                            + "</bsbi:reviews>"
+                        + "</cbs:book>"
+                        + "<cbs:book>"
+                        + "<bsbi:authors>"
+                            + "<bsbi:author_1>O'Brien</bsbi:author_1>"
+                            + "<bsbi:author_2>Tim</bsbi:author_2>"
+                        + "</bsbi:authors>"
+                        + "<bsbi:title>23</bsbi:title>"
+                        + "<bsbi:genre>semiAutomatic</bsbi:genre>"
+                        + "<bsbi:price>enabled</bsbi:price>"
+                        + "<bsbi:publish_date>leftToRight</bsbi:publish_date>"
+                        + "<bsbi:description>282</bsbi:description>"
+                        + "<bsbi:reviews>"
+                            + "<bsbi:review_1>3.5 stars</bsbi:review_1>"
+                            + "<bsbi:review_2>4 stars</bsbi:review_2>"
+                            + "<bsbi:review_3>3.5 stars</bsbi:review_3>"
+                            + "<bsbi:review_4>4.2 stars</bsbi:review_4>"
+                            + "<bsbi:review_5>4 stars</bsbi:review_5>"
+                            
+                        + "</bsbi:reviews>"
+                        + "</cbs:book></cbs:bookstore>");
+
+        when(tuple.size()).thenReturn(4);
+        when(tuple.get(2)).thenReturn(true);
+        when(tuple.get(3)).thenReturn(true);
+
+        when(tuple.get(1)).thenReturn("bookstore/book/authors");
+        assertEquals(2, xpath.exec(tuple).getAll().size());
+        assertEquals("Gambardella, Matthew, Mike", xpath.exec(tuple).get(0));
+        assertEquals("O'Brien, Tim", xpath.exec(tuple).get(1));
+
+        when(tuple.get(1)).thenReturn("bookstore/book/reviews");
+        assertEquals(2, xpath.exec(tuple).getAll().size());
+        assertEquals("4 stars, 3.5 stars, 4 stars, 4.2 stars, 3.5 stars", 
xpath.exec(tuple).get(0));
+        assertEquals("3.5 stars, 4 stars, 3.5 stars, 4.2 stars, 4 stars", 
xpath.exec(tuple).get(1));
+
+    }
+
+    private long timeTheUDF(final Tuple tuple, final XPathAll xpath) throws 
Exception {
+
+        final long start = System.currentTimeMillis();
+
+        for (int i = 0; i < 50000; i++) {
+
+            tuple.set(0, "<book id=\"bk101"
+                    + i
+                    + "\">"
+                    + // we need to make sure xml changes
+                    "<author>Gambardella, Matthew</author>" + "<title>XML 
Developer's Guide</title>"
+                    + "<genre>Computer</genre>" + expandXml() + 
"<price>44.95</price>"
+                    + "<publish_date>2000-10-01</publish_date>"
+                    + "<description>An in-depth look at creating applications 
with XML.</description>" + "</book>");
+
+            // caching is used here. for 2nd and 3rd calls to xpath.exec, the
+            // cached javax.xml.xpath.XPath should help
+            tuple.set(1, "book/author");
+            assertEquals("Gambardella, Matthew", xpath.exec(tuple).get(0));
+
+            tuple.set(1, "book/price");
+            assertEquals("44.95", xpath.exec(tuple).get(0));
+
+            tuple.set(1, "book/publish_date");
+            assertEquals("2000-10-01", xpath.exec(tuple).get(0));
+        }
+
+        return System.currentTimeMillis() - start;
+    }
+
+    private String expandXml() {
+
+        final StringBuilder sb = new StringBuilder();
+
+        final int max = RandomUtils.nextInt(100);
+
+        for (int i = 0; i < max; i++) {
+            sb.append("<expansion>This is an expansion of the xml to simulate 
random sized xml" + i + "</expansion>");
+        }
+
+        return sb.toString();
+    }
+}

Modified: pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathTest.java
URL: http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathTest.java?rev=1652238&r1=1652237&r2=1652238&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathTest.java (original)
+++ pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/evaluation/xml/XPathTest.java Thu Jan 15 19:18:34 2015
@@ -21,6 +21,7 @@ import org.apache.commons.lang.math.Rand
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.TupleFactory;
 import org.apache.pig.piggybank.evaluation.xml.XPath;
+import org.junit.Ignore;
 import org.junit.Test;
 
 public class XPathTest {
@@ -121,8 +122,97 @@ public class XPathTest {
                 
     }
     
+    @Test
+    public void testExecTupleWithNamespace() throws Exception {
+
+        final XPath xpath = new XPath();
+
+        final Tuple tuple = mock(Tuple.class);
+
+        when(tuple.get(0)).thenReturn(
+                "<ann:book id=\"bk101\">" + "<author>Gambardella, 
Matthew</author>"
+                        + "<title>XML Developer's Guide</title>" + 
"<genre>Computer</genre>" + "<price>44.95</price>"
+                        + "<publish_date>2000-10-01</publish_date>"
+                        + "<description>An in-depth look at creating 
applications with XML.</description>"
+                        + "</ann:book>");
+
+        when(tuple.size()).thenReturn(4);
+        when(tuple.get(2)).thenReturn(true);
+        when(tuple.get(3)).thenReturn(true);
+        
+        when(tuple.get(1)).thenReturn("book/author");
+        assertEquals("Gambardella, Matthew", xpath.exec(tuple));
+        assertNotEquals("Someone else", xpath.exec(tuple));
+        
+        when(tuple.get(1)).thenReturn("book/price");
+        assertEquals("44.95", xpath.exec(tuple));
+        assertNotEquals("00.00", xpath.exec(tuple));
+
+    }
+
+    @Test
+    public void testExecTupleWithElementNodeWithComplexNameSpace() throws 
Exception {
+
+        final XPath xpath = new XPath();
+
+        final Tuple tuple = mock(Tuple.class);
+
+        when(tuple.get(0)).thenReturn(
+
+                "<cbs:bookstore>"
+                        +"<cbs:book>"
+                            + "<bsbi:authors>"
+                                + "<bsbi:author_1>Gambardella</bsbi:author_1>"
+                                + "<bsbi:author_2>Matthew</bsbi:author_2>"
+                                + "<bsbi:author_3>Mike</bsbi:author_3>"
+                            + "</bsbi:authors>"
+                            + "<bsbi:title>23</bsbi:title>"
+                            + "<bsbi:genre>semiAutomatic</bsbi:genre>"
+                            + "<bsbi:price>enabled</bsbi:price>"
+                            + 
"<bsbi:publish_date>leftToRight</bsbi:publish_date>"
+                            + "<bsbi:description>282</bsbi:description>"
+                            + "<bsbi:reviews>"
+                                + "<review_1>4 stars</review_1>"
+                                + "<review_2>3.5 stars</review_2>"
+                                + "<review_3>4 stars</review_3>"
+                                + "<review_4>4.2 stars</review_4>"
+                                + "<review_5>3.5 stars</review_5>"
+                            + "</bsbi:reviews>"
+                        + "</cbs:book>"
+                        + "<cbs:book>"
+                        + "<bsbi:authors>"
+                            + "<bsbi:author_1>O'Brien</bsbi:author_1>"
+                            + "<bsbi:author_2>Tim</bsbi:author_2>"
+                        + "</bsbi:authors>"
+                        + "<bsbi:title>23</bsbi:title>"
+                        + "<bsbi:genre>semiAutomatic</bsbi:genre>"
+                        + "<bsbi:price>enabled</bsbi:price>"
+                        + "<bsbi:publish_date>leftToRight</bsbi:publish_date>"
+                        + "<bsbi:description>282</bsbi:description>"
+                        + "<bsbi:reviews>"
+                            + "<bsbi:review_1>3.5 stars</bsbi:review_1>"
+                            + "<bsbi:review_2>4 stars</bsbi:review_2>"
+                            + "<bsbi:review_3>3.5 stars</bsbi:review_3>"
+                            + "<bsbi:review_4>4.2 stars</bsbi:review_4>"
+                            + "<bsbi:review_5>4 stars</bsbi:review_5>"
+                            
+                        + "</bsbi:reviews>"
+                        + "</cbs:book></cbs:bookstore>");
+
+        when(tuple.size()).thenReturn(4);
+        when(tuple.get(2)).thenReturn(true);
+        when(tuple.get(3)).thenReturn(true);
+
+        when(tuple.get(1)).thenReturn("bookstore/book/authors");
+        assertEquals("GambardellaMatthewMike", xpath.exec(tuple));
+
+        when(tuple.get(1)).thenReturn("bookstore/book/reviews");
+        assertEquals("4 stars3.5 stars4 stars4.2 stars3.5 stars", 
xpath.exec(tuple));
+
+    }
     
-    //@Test --optional test
+    @Ignore //--optional test
+    @Test 
     public void testCacheBenefit() throws Exception{
 
         final XPath xpath = new XPath();


Reply via email to