Do your existing app's URLs conform to the spec at https://developers.google.com/webmasters/ajax-crawling/docs/specification ? More specifically, do all your history tokens start with an exclamation mark?
On Fri, Mar 30, 2012 at 9:31 AM, erebrus <[email protected]> wrote: > Hi all, > I was reading https://developers.google.com/webmasters/ajax-crawling/ > on how to make ajax apps (consequently gwt apps) crawlable. > I took the code from google (summarized in point 3 of "How to create > an HTML snapshot?" to create a filter (that returns html from ajax > using HtmlUnit) and changed the web.xml accordingly. I created a new > GWT project with example code and applied the filter and the web.xml > there. It worked directly. > However, I did exactly the same on the gwt app I want to make > searchable and it doesn't work. For some reason, the only requests the > filter gets are the ones to the ones for the rpc. > I think I must be missing a terribly simple detail, but I'm a bit lost > on where to go from here. > > > Following you can see the code for the filter (CrawlServlet) and the > web.xml > > package crawltest.server; > > import com.gargoylesoftware.htmlunit.BrowserVersion; > import com.gargoylesoftware.htmlunit.WebClient; > import com.gargoylesoftware.htmlunit.html.HtmlPage; > > import java.io.IOException; > import java.io.PrintWriter; > import java.io.UnsupportedEncodingException; > import java.net.URLDecoder; > import java.util.logging.Logger; > > import javax.servlet.Filter; > import javax.servlet.FilterChain; > import javax.servlet.FilterConfig; > import javax.servlet.ServletException; > import javax.servlet.ServletRequest; > import javax.servlet.ServletResponse; > import javax.servlet.http.HttpServletRequest; > import javax.servlet.http.HttpServletResponse; > > /** > * Servlet that makes this application crawlable > */ > public final class CrawlServlet implements Filter { > > private static final Logger logger = > Logger.getLogger(CrawlServlet.class > .getName()); > private static String rewriteQueryString(String queryString) throws > UnsupportedEncodingException { > StringBuilder queryStringSb = new StringBuilder(queryString); > int i = 
queryStringSb.indexOf("&_escaped_fragment_"); > if (i != -1) { > StringBuilder tmpSb = new > StringBuilder(queryStringSb.substring(0, i)); > tmpSb.append("#!"); > tmpSb.append(URLDecoder.decode(queryStringSb.substring(i + 20, > queryStringSb.length()),"UTF-8")); > queryStringSb = tmpSb; > } > > i = queryStringSb.indexOf("_escaped_fragment_"); > if (i != -1) { > StringBuilder tmpSb = new > StringBuilder(queryStringSb.substring(0, i)); > tmpSb.append("#!"); > tmpSb.append(URLDecoder.decode(queryStringSb.substring(i + 19, > queryStringSb.length()), "UTF-8")); > queryStringSb = tmpSb; > } > if (queryStringSb.indexOf("#!") != 0) { > queryStringSb.insert(0, '?'); > } > queryString = queryStringSb.toString(); > > > > return queryString; > } > > private FilterConfig filterConfig = null; > > /** > * Destroys the filter configuration > */ > public void destroy() { > this.filterConfig = null; > } > > /** > * Filters all requests and invokes headless browser if necessary > */ > public void doFilter(ServletRequest request, ServletResponse > response, > FilterChain chain) throws IOException { > System.out.println("crawl"); > if (filterConfig == null) { > return; > } > System.out.println("crawl"); > HttpServletRequest req = (HttpServletRequest) request; > HttpServletResponse res = (HttpServletResponse) response; > String queryString = req.getQueryString(); > System.out.println("query:"+queryString); > System.out.println("param:"+req.getParameterMap().toString()); > System.out.println("req:"+req); > if ((queryString != null) && > (queryString.contains("_escaped_fragment_"))) { > System.out.println("in!!"); > StringBuilder pageNameSb = new StringBuilder("http://"); > pageNameSb.append(req.getServerName()); > if (req.getServerPort() != 0) { > pageNameSb.append(":"); > pageNameSb.append(req.getServerPort()); > } > pageNameSb.append(req.getRequestURI()); > queryString = rewriteQueryString(queryString); > pageNameSb.append(queryString); > > final WebClient webClient = new > 
WebClient(BrowserVersion.FIREFOX_3); > webClient.setJavaScriptEnabled(true); > String pageName = pageNameSb.toString(); > HtmlPage page = webClient.getPage(pageName); > webClient.waitForBackgroundJavaScriptStartingBefore(2000); > > res.setContentType("text/html;charset=UTF-8"); > PrintWriter out = res.getWriter(); > out.println("<hr>"); > out.println("<center><h3>You are viewing a non-interactive page > that is intended for the crawler. You probably want to see this page: > <a href=\"" > + pageName + "\">" + pageName + "</a></h3></center>"); > out.println("<hr>"); > > out.println(page.asXml()); > webClient.closeAllWindows(); > out.close(); > > } else { > try { > chain.doFilter(request, response); > } catch (ServletException e) { > e.printStackTrace(); > } > } > } > > /** > * Initializes the filter configuration > */ > public void init(FilterConfig filterConfig) { > this.filterConfig = filterConfig; > } > > } > > > web-xml: > > <?xml version="1.0" encoding="UTF-8"?> > <web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" > xsi:schemaLocation="http://java.sun.com/xml/ns/javaee > http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd" > version="2.5" xmlns="http://java.sun.com/xml/ns/javaee"> > > <filter> > <filter-name>CrawlServlet</filter-name> > <filter-class>crawltest.server.CrawlServlet</filter-class> > </filter> > > <filter-mapping> > <filter-name>CrawlServlet</filter-name> > <url-pattern>/*</url-pattern> > </filter-mapping> > > <!-- Servlets --> > > <!-- Default page to serve --> > <welcome-file-list> > <welcome-file>CrawlTest.html</welcome-file> > </welcome-file-list> > > </web-app> > > > > -- > You received this message because you are subscribed to the Google Groups > "Google Web Toolkit" group. > To post to this group, send email to [email protected]. > To unsubscribe from this group, send email to > [email protected]. > For more options, visit this group at > http://groups.google.com/group/google-web-toolkit?hl=en. 
> -- You received this message because you are subscribed to the Google Groups "Google Web Toolkit" group. To post to this group, send email to [email protected]. To unsubscribe from this group, send email to [email protected]. For more options, visit this group at http://groups.google.com/group/google-web-toolkit?hl=en.
