sampler ParseHtmlParser.java ParseJTidy.java

sebb Wed, 19 Nov 2003 19:38:03 -0800

sebb        2003/11/19 18:34:29

  Added:       src/protocol/http/org/apache/jmeter/protocol/http/sampler
                        ParseHtmlParser.java ParseJTidy.java
  Log:
  Parser code split off from httpsamplerfull and newhttpsamplerfull
  
  Revision  Changes    Path
  1.1                  
jakarta-jmeter/src/protocol/http/org/apache/jmeter/protocol/http/sampler/ParseHtmlParser.java
  
  Index: ParseHtmlParser.java
  ===================================================================
  /*
   * ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001-2003 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   * notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   * notice, this list of conditions and the following disclaimer in
   * the documentation and/or other materials provided with the
   * distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   * if any, must include the following acknowledgment:
   * "This product includes software developed by the
   * Apache Software Foundation (http://www.apache.org/)."
   * Alternately, this acknowledgment may appear in the software itself,
   * if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   * "Apache JMeter" must not be used to endorse or promote products
   * derived from this software without prior written permission. For
   * written permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache",
   * "Apache JMeter", nor may "Apache" appear in their name, without
   * prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  package org.apache.jmeter.protocol.http.sampler;
  
  import java.io.StringReader;
  import java.net.MalformedURLException;
  import java.net.URL;
  import java.util.HashSet;
  import java.util.Set;
  
  import org.apache.jmeter.samplers.SampleResult;
  import org.apache.jorphan.logging.LoggingManager;
  import org.apache.log.Logger;
  
  import org.htmlparser.*;
  import org.htmlparser.util.*;
  import org.htmlparser.scanners.*;
  import org.htmlparser.tags.*;
  
  /**
   * Parser class using HtmlParser to scan HTML documents fof images etc
   * 
   * @author TBA
   * @version $revision$ Last updated: $date$
   */
  public class ParseHtmlParser
  {
        /** Used to store the Logger (used for debug and error messages). */
     transient private static Logger log = LoggingManager.getLoggerForClass();
  
      /**
       * This is a singleton class
       */
      private ParseHtmlParser()
      {
          super();
      }
  
        /**
         * This method is called by HTTPSampler to get the images
         * from the HTML and retrieve it from the server. It is
         * the entry point for parsing the HTML.
         * @param res
         * @param sampler
         * @return
         */
        protected static SampleResult parseForImages(SampleResult res,HTTPSampler 
sampler)
        {
                Parser HtmlParser = null;
                URL baseUrl = null;
                String displayName = res.getSampleLabel();
                try {
  
                        String contents = new String(res.getResponseData());
                        StringReader reader = new StringReader(contents);
                        NodeReader nreader = new NodeReader(reader,contents.length());
                        HtmlParser = new Parser(nreader,new DefaultParserFeedback ());
                        addTagListeners(HtmlParser);
                } catch (Exception e){
                        log.error("IOException: had problems reading the InputStream");
                }
  
                try
                {
                        baseUrl = sampler.getUrl();
                        if(log.isDebugEnabled())
                        {
                                log.error("baseUrl - " + baseUrl.toString());
                        }
                }
                catch(MalformedURLException mfue)
                {
                        log.error("Error creating URL '" + displayName + "'");
                        log.error("MalformedURLException - " + mfue);
                        res.setResponseData(mfue.toString().getBytes());
                        res.setResponseCode(HTTPSampler.NON_HTTP_RESPONSE_CODE);
                        res.setResponseMessage(HTTPSampler.NON_HTTP_RESPONSE_MESSAGE);
                        res.setSuccessful(false);
                        mfue.printStackTrace();
                        return res;
                }
                // System.out.println("got the baseURL");
                // Now parse the DOM tree
          
                // This is used to ignore duplicated binary files.
                Set uniqueURLs = new HashSet();
          
                // look for applets
          
                // This will only work with an Applet .class file.
                // Ideally, this should be upgraded to work with Objects (IE)
                //      and archives (.jar and .zip) files as well.
          
                parseNodes(HtmlParser, uniqueURLs, res,sampler, baseUrl);
          
                // Okay, we're all done now
                if(log.isDebugEnabled())
                {
                        log.debug("Total time - " + res.getTime());
                }
                log.debug("End   : NewHTTPSamplerFull sample");
                return res;
        }
  
      /**
       * User HTMLParser to get the relevant nodes. This simplifies
       * the process and should provide significant performance
       * improvements.
       *
       * @param parser  HTMLParser
       * @param uniques used to ensure that binary files are only downloaded once
       * @param res             <code>SampleResult</code> to store sampling results
       */
      private static void parseNodes(Parser parser, Set uniques,
        SampleResult res,HTTPSampler sampler, URL baseUrl)
      {
                // Okay, we're all done now
                if(log.isDebugEnabled())
                {
                        log.debug("Start        : NewHTTPSamplerFull");
                }
          boolean uniqueBinary;
          SampleResult binRes = null;
  
                try {
                        // we start to iterate through the elements
                        for(NodeIterator e = parser.elements(); e.hasMoreNodes();)
                        {
                                uniqueBinary = true;
                                Node node = e.nextNode();
                                String binUrlStr = null;
  
                                // first we check to see if body tag has a
                                // background set and we set the NodeIterator
                                // to the child elements inside the body
                                if (node instanceof BodyTag){
                                        BodyTag body = (BodyTag)node;
                                        binUrlStr = body.getAttribute("background");
                                        // if the body tag exists, we get the elements
                                        // within the body tag. if we don't we won't
                                        // see the body of the page. The only catch
                                        // with this is if there are images after the
                                        // closing body tag, it won't get parsed. If
                                        // someone puts it outside the body tag, it
                                        // is probably a mistake. Plus it's bad to
                                        // have important content after the closing
                                        // body tag. Peter Lin 10-9-03
                                        e = body.elements();
                                } else if (node instanceof ImageTag){
                                        ImageTag image = (ImageTag)node;
                                        binUrlStr = image.getImageURL();
                                } else if (node instanceof AppletTag){
                                        AppletTag applet = (AppletTag)node;
                                        binUrlStr = applet.getAppletClass();
                                } else if (node instanceof InputTag){
                                        InputTag input = (InputTag)node;
                                        // we check the input tag type for image
                                        String strType = input.getAttribute("type");
                                        if(strType != null && 
strType.equalsIgnoreCase("image"))
                                        {
                                                // then we need to download the binary
                                                binUrlStr = input.getAttribute("src");
                                        }
                                }
  
                                binRes = new SampleResult();
                                if(binUrlStr == null)
                                {
                                        continue;
                                }
  
                                // set the baseUrl and binUrl so that if error occurs
                                // due to MalformedException then at least the values 
will be
                                // visible to the user to aid correction
                                binRes.setSampleLabel(baseUrl + "," + binUrlStr);
                                // download the binary
                                URL binUrl = null;
                                try
                                {
                                        binUrl = new URL(baseUrl, binUrlStr);
                                }
                                catch(MalformedURLException mfue)
                                {
                                        log.error("Error creating URL '" + baseUrl +
                                                          " , " + binUrlStr + "'");
                                        log.error("MalformedURLException - " + mfue);
                                        
binRes.setResponseData(mfue.toString().getBytes());
                                        
binRes.setResponseCode(HTTPSampler.NON_HTTP_RESPONSE_CODE);
                                        
binRes.setResponseMessage(HTTPSampler.NON_HTTP_RESPONSE_MESSAGE);
                                        binRes.setSuccessful(false);
                                        res.addSubResult(binRes);
                                        break;
                                }
                                if(log.isDebugEnabled())
                                {
                                        log.debug("Binary url - " + binUrlStr);
                                        log.debug("Full Binary url - " + binUrl);
                                }
                                binRes.setSampleLabel(binUrl.toString());
                                uniqueBinary = uniques.add(binUrl.toString());
                                if (uniqueBinary)
                                {
                                        // a browser should be smart enough to *not* 
download
                                        //   a binary file that it already has in its 
cache.
                                        try
                                        {
                                                HTTPSamplerFull.loadBinary(binUrl, 
binRes,sampler);
                                        }
                                        catch(Exception ioe)
                                        {
                                                log.error("Error reading from URL - " 
+ ioe);
                                                
binRes.setResponseData(ioe.toString().getBytes());
                                                
binRes.setResponseCode(HTTPSampler.NON_HTTP_RESPONSE_CODE);
                                                
binRes.setResponseMessage(HTTPSampler.NON_HTTP_RESPONSE_MESSAGE);
                                                binRes.setSuccessful(false);
                                        }
                                        log.debug("Adding result");
                                        res.addSubResult(binRes);
                                        res.setTime(res.getTime() + binRes.getTime());
                                }
                                else
                                {
                                        if(log.isDebugEnabled())
                                        {
                                                log.debug("Skipping duplicate - " + 
binUrl);
                                        }
                                }
                        }
                        log.debug("End   : NewHTTPSamplerFull parseNodes");
                }
                catch (ParserException e){
                }
      }
  
      /**
       * Returns a node representing a whole xml given an xml document.
       *
       * @param text    an xml document
       * @return        a node representing a whole xml
       *
       * @throws SAXException indicates an error parsing the xml document
       */
      private static void addTagListeners(Parser parser) 
      {
          log.debug("Start : addTagListeners");
                // add body tag scanner
                parser.addScanner(new BodyScanner());
          // add ImageTag scanner
                LinkScanner linkScanner = new LinkScanner(LinkTag.LINK_TAG_FILTER);
                // parser.addScanner(linkScanner);
                
parser.addScanner(linkScanner.createImageScanner(ImageTag.IMAGE_TAG_FILTER));
                // add input tag scanner
                parser.addScanner(new InputTagScanner());
                // add applet tag scanner
                parser.addScanner(new AppletScanner()); 
      }
  
  }
  
  
  
  
  1.1                  
jakarta-jmeter/src/protocol/http/org/apache/jmeter/protocol/http/sampler/ParseJTidy.java
  
  Index: ParseJTidy.java
  ===================================================================
  /*
   * ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001-2003 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   * notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   * notice, this list of conditions and the following disclaimer in
   * the documentation and/or other materials provided with the
   * distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   * if any, must include the following acknowledgment:
   * "This product includes software developed by the
   * Apache Software Foundation (http://www.apache.org/)."
   * Alternately, this acknowledgment may appear in the software itself,
   * if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   * "Apache JMeter" must not be used to endorse or promote products
   * derived from this software without prior written permission. For
   * written permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache",
   * "Apache JMeter", nor may "Apache" appear in their name, without
   * prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  package org.apache.jmeter.protocol.http.sampler;
  
  import java.io.ByteArrayInputStream;
  import java.net.MalformedURLException;
  import java.net.URL;
  import java.util.HashSet;
  import java.util.Set;
  
  import org.apache.jmeter.samplers.SampleResult;
  import org.apache.jorphan.logging.LoggingManager;
  import org.apache.log.Logger;
  import org.w3c.dom.Document;
  import org.w3c.dom.NamedNodeMap;
  import org.w3c.dom.Node;
  import org.w3c.dom.NodeList;
  import org.w3c.tidy.Tidy;
  import org.xml.sax.SAXException;
  
  /**
   * Parser class using JTidy to scan HTML documents fof images etc
   * 
   * @author TBA
   * @version $revision$ Last updated: $date$
   */
  public class ParseJTidy
  {
      /** Used to store the Logger (used for debug and error messages). */
      transient private static Logger log = LoggingManager.getLoggerForClass();
  
      /**
       * This is a singleton class
       */
      private ParseJTidy()
      {
          super();
      }
  
        protected static SampleResult parseForImages(SampleResult res,HTTPSampler 
sampler)
        {
                String displayName = res.getSampleLabel();
                Document html = null;
                URL baseUrl = null;
                try
                {
                        baseUrl = sampler.getUrl();
                        if(log.isDebugEnabled())
                        {
                                log.debug("baseUrl - " + baseUrl.toString());
                        }
                        html = (Document)getDOM(res.getResponseData());
                }
                catch(SAXException se)
                {
                        log.error("Error parsing document - " + se);
                        res.setResponseData(se.toString().getBytes());
                        res.setResponseCode(HTTPSampler.NON_HTTP_RESPONSE_CODE);
                        res.setResponseMessage(HTTPSampler.NON_HTTP_RESPONSE_MESSAGE);
                        res.setSuccessful(false);
                        return res;
                }
                catch(MalformedURLException mfue)
                {
                        log.error("Error creating URL '" + displayName + "'");
                        log.error("MalformedURLException - " + mfue);
                        res.setResponseData(mfue.toString().getBytes());
                        res.setResponseCode(HTTPSampler.NON_HTTP_RESPONSE_CODE);
                        res.setResponseMessage(HTTPSampler.NON_HTTP_RESPONSE_MESSAGE);
                        res.setSuccessful(false);
                        return res;
                }
          
                // Now parse the DOM tree
                
                // TODO - check for <base> tag ??
          
                // This is used to ignore duplicated binary files.
                Set uniqueURLs = new HashSet();
          
                // look for images
                parseNodes(html, "img", false, "src", uniqueURLs, res,sampler,baseUrl);
                // look for applets
          
                // This will only work with an Applet .class file.
                // Ideally, this should be upgraded to work with Objects (IE)
                // and archives (.jar and .zip) files as well.
          
                parseNodes(html, "applet", false, "code", uniqueURLs, 
res,sampler,baseUrl);
                // look for input tags with image types
                parseNodes(html, "input", true, "src", uniqueURLs, 
res,sampler,baseUrl);
                // look for background images
                parseNodes(html, "body", false, "background", uniqueURLs, 
res,sampler,baseUrl);
          
                // look for table background images
                parseNodes(html, "table", false, "background", uniqueURLs, 
res,sampler,baseUrl);
  
                //TODO look for TD, TR etc images
  
                // Okay, we're all done now
                if(log.isDebugEnabled())
                {
                        log.debug("Total time - " + res.getTime());
                }
                log.debug("End   : HTTPSamplerFull sample");
                return res;
        }
  
  
      /**
       * Parse the DOM tree looking for the specified HTML source tags,
       * and download the appropriate binary files matching these tags.
       *
       * @param html      the HTML document to parse
       * @param htmlTag   the HTML tag to parse for
       * @param type      indicates that we require 'type=image'
       * @param srcTag    the HTML tag that indicates the source URL
       * @param uniques   used to ensure that binary files are only downloaded
       *                  once
       * @param baseUrl   base URL
       * 
       * @param res       <code>SampleResult</code> to store sampling results
       */
      private static void parseNodes(Document html, String htmlTag, boolean type,
              String srcTag, Set uniques, SampleResult res,HTTPSampler sampler,
              URL baseUrl)
      {
          log.debug("Start : HTTPSamplerFull parseNodes");
          NodeList nodeList = html.getElementsByTagName(htmlTag);
          boolean uniqueBinary;
          SampleResult binRes = null;
          for(int i = 0; i < nodeList.getLength(); i++)
          {
              uniqueBinary = true;
              Node tempNode = nodeList.item(i);
              if(log.isDebugEnabled())
              {
                  log.debug("'" + htmlTag + "' tag: " + tempNode);
              }
  
              // get the url of the Binary
              NamedNodeMap nnm = tempNode.getAttributes();
              Node namedItem = null;
  
              if(type)
              {
                  // if type is set, we need 'type=image'
                  namedItem = nnm.getNamedItem("type");
                  if(namedItem == null)
                  {
                      log.debug("namedItem 'null' - ignoring");
                      break;
                  }
                  String inputType = namedItem.getNodeValue();
                  if(log.isDebugEnabled())
                  {
                      log.debug("Input type - " + inputType);
                  }
                  if(inputType != null && inputType.equalsIgnoreCase("image"))
                  {
                      // then we need to download the binary
                  }
                  else
                  {
                      log.debug("type != 'image' - ignoring");
                      break;
                  }
              }
  
              binRes = new SampleResult();
              namedItem = nnm.getNamedItem(srcTag);
              if(namedItem == null)
              {
                  continue;
              }
              String binUrlStr = namedItem.getNodeValue();
              // set the baseUrl and binUrl so that if error occurs
              // due to MalformedException then at least the values will be
              // visible to the user to aid correction
              binRes.setSampleLabel(baseUrl + "," + binUrlStr);
              // download the binary
              URL binUrl = null;
              try
              {
                  binUrl = new URL(baseUrl, binUrlStr);
              }
              catch(MalformedURLException mfue)
              {
                  log.error("Error creating URL '" + baseUrl +
                            " , " + binUrlStr + "'");
                  log.error("MalformedURLException - " + mfue);
                  binRes.setResponseData(mfue.toString().getBytes());
                  binRes.setResponseCode(HTTPSampler.NON_HTTP_RESPONSE_CODE);
                  binRes.setResponseMessage(
                      HTTPSampler.NON_HTTP_RESPONSE_MESSAGE);
                  binRes.setSuccessful(false);
                  res.addSubResult(binRes);
                  break;
              }
              if(log.isDebugEnabled())
              {
                  log.debug("Binary url - " + binUrlStr);
                  log.debug("Full Binary url - " + binUrl);
              }
              binRes.setSampleLabel(binUrl.toString());
              uniqueBinary = uniques.add(binUrl.toString());
              if (uniqueBinary)
              {
                  // a browser should be smart enough to *not* download
                  //   a binary file that it already has in its cache.
                  try
                  {
                      HTTPSamplerFull.loadBinary(binUrl, binRes,sampler);
                  }
                  catch(Exception ioe)
                  {
                      log.error("Error reading from URL - " + ioe);
                      binRes.setResponseData(ioe.toString().getBytes());
                      binRes.setResponseCode(HTTPSampler.NON_HTTP_RESPONSE_CODE);
                      binRes.setResponseMessage(
                          HTTPSampler.NON_HTTP_RESPONSE_MESSAGE);
                      binRes.setSuccessful(false);
                  }
                  log.debug("Adding result");
                  res.addSubResult(binRes);
                  res.setTime(res.getTime() + binRes.getTime());
              }
              else
              {
                  if(log.isDebugEnabled())
                  {
                      log.debug("Skipping duplicate - " + binUrl);
                  }
              }
          }
          log.debug("End   : HTTPSamplerFull parseNodes");
      }
  
  
      /**
       * Returns <code>tidy</code> as HTML parser.
       *
       * @return  a <code>tidy</code> HTML parser
       */
      private static Tidy getParser()
      {
          log.debug("Start : getParser");
          Tidy tidy = new Tidy();
          tidy.setCharEncoding(org.w3c.tidy.Configuration.UTF8);
          tidy.setQuiet(true);
          tidy.setShowWarnings(false);
          if(log.isDebugEnabled())
          {
              log.debug("getParser : tidy parser created - " + tidy);
          }
          log.debug("End   : getParser");
          return tidy;
      }
  
      /**
       * Returns a node representing a whole xml given an xml document.
       *
       * @param text  an xml document (as a byte array)
       * @return      a node representing a whole xml
       *
       * @throws SAXException indicates an error parsing the xml document
       */
      private static Node getDOM(byte [] text) throws SAXException
      {
          log.debug("Start : getDOM");
          Node node = getParser().parseDOM(new
            ByteArrayInputStream(text), null);
          if(log.isDebugEnabled())
          {
              log.debug("node : " + node);
          }
          log.debug("End   : getDOM");
          return node;
      }
  
  }


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-jmeter/src/protocol/http/org/apache/jmeter/protocol/http/sampler ParseHtmlParser.java ParseJTidy.java

Reply via email to