Hi all,
I was reading https://developers.google.com/webmasters/ajax-crawling/
on how to make ajax apps (consequently gwt apps) crawlable.
I took the code from Google (summarized in point 3 of "How to create
an HTML snapshot?") to create a filter (that returns HTML from AJAX
using HtmlUnit) and changed the web.xml accordingly. I created a new
GWT project with example code and applied the filter and the web.xml
there. It worked directly.
However, I did exactly the same on the gwt app I want to make
searchable and it doesn't work. For some reason, the only requests the
filter receives are the ones for the RPC calls.
I think I must be missing a terribly simple detail, but I'm a bit lost
on where to go from here.


Below you can see the code for the filter (CrawlServlet) and the
web.xml:

package crawltest.server;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.logging.Logger;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 * Servlet that makes this application crawlable
 */
public final class CrawlServlet implements Filter {

        private static final Logger logger =
Logger.getLogger(CrawlServlet.class
                        .getName());
  private static String rewriteQueryString(String queryString) throws
UnsupportedEncodingException {
    StringBuilder queryStringSb = new StringBuilder(queryString);
    int i = queryStringSb.indexOf("&_escaped_fragment_");
    if (i != -1) {
      StringBuilder tmpSb = new
StringBuilder(queryStringSb.substring(0, i));
      tmpSb.append("#!");
      tmpSb.append(URLDecoder.decode(queryStringSb.substring(i + 20,
queryStringSb.length()),"UTF-8"));
      queryStringSb = tmpSb;
    }

    i = queryStringSb.indexOf("_escaped_fragment_");
    if (i != -1) {
      StringBuilder tmpSb = new
StringBuilder(queryStringSb.substring(0, i));
      tmpSb.append("#!");
      tmpSb.append(URLDecoder.decode(queryStringSb.substring(i + 19,
queryStringSb.length()), "UTF-8"));
      queryStringSb = tmpSb;
    }
    if (queryStringSb.indexOf("#!") != 0) {
      queryStringSb.insert(0, '?');
    }
    queryString = queryStringSb.toString();



    return queryString;
  }

  private FilterConfig filterConfig = null;

  /**
   * Destroys the filter configuration
   */
  public void destroy() {
    this.filterConfig = null;
  }

  /**
   * Filters all requests and invokes headless browser if necessary
   */
  public void doFilter(ServletRequest request, ServletResponse
response,
      FilterChain chain) throws IOException {
          System.out.println("crawl");
    if (filterConfig == null) {
      return;
    }
    System.out.println("crawl");
    HttpServletRequest req = (HttpServletRequest) request;
    HttpServletResponse res = (HttpServletResponse) response;
    String queryString = req.getQueryString();
    System.out.println("query:"+queryString);
    System.out.println("param:"+req.getParameterMap().toString());
    System.out.println("req:"+req);
    if ((queryString != null) &&
(queryString.contains("_escaped_fragment_"))) {
        System.out.println("in!!");
      StringBuilder pageNameSb = new StringBuilder("http://";);
      pageNameSb.append(req.getServerName());
      if (req.getServerPort() != 0) {
        pageNameSb.append(":");
        pageNameSb.append(req.getServerPort());
      }
      pageNameSb.append(req.getRequestURI());
      queryString = rewriteQueryString(queryString);
      pageNameSb.append(queryString);

      final WebClient webClient = new
WebClient(BrowserVersion.FIREFOX_3);
      webClient.setJavaScriptEnabled(true);
      String pageName = pageNameSb.toString();
      HtmlPage page = webClient.getPage(pageName);
      webClient.waitForBackgroundJavaScriptStartingBefore(2000);

      res.setContentType("text/html;charset=UTF-8");
      PrintWriter out = res.getWriter();
      out.println("<hr>");
      out.println("<center><h3>You are viewing a non-interactive page
that is intended for the crawler.  You probably want to see this page:
<a href=\""
          + pageName + "\">" + pageName + "</a></h3></center>");
      out.println("<hr>");

      out.println(page.asXml());
      webClient.closeAllWindows();
      out.close();

    } else {
      try {
        chain.doFilter(request, response);
      } catch (ServletException e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Initializes the filter configuration
   */
  public void init(FilterConfig filterConfig) {
    this.filterConfig = filterConfig;
  }

}


web.xml:

<?xml version="1.0" encoding="UTF-8"?>
<web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://java.sun.com/xml/ns/javaee
              http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd"
        version="2.5" xmlns="http://java.sun.com/xml/ns/javaee">

        <filter>
                <filter-name>CrawlServlet</filter-name>
                <filter-class>crawltest.server.CrawlServlet</filter-class>
        </filter>

        <filter-mapping>
                <filter-name>CrawlServlet</filter-name>
                <url-pattern>/*</url-pattern>
        </filter-mapping>

        <!-- Servlets -->

        <!-- Default page to serve -->
        <welcome-file-list>
                <welcome-file>CrawlTest.html</welcome-file>
        </welcome-file-list>

</web-app>



-- 
You received this message because you are subscribed to the Google Groups 
"Google Web Toolkit" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to 
[email protected].
For more options, visit this group at 
http://groups.google.com/group/google-web-toolkit?hl=en.

Reply via email to