Author: violetagg
Date: Mon Jun 12 21:04:53 2017
New Revision: 1798533
URL: http://svn.apache.org/viewvc?rev=1798533&view=rev
Log:
A new configuration property 'crawlerIps' is added to the
'o.a.catalina.valves.CrawlerSessionManagerValve'. Using this property one can
specify a regular expression that will be used to identify crawlers based on
their IP address. Based on a patch provided by Tetradeus via GitHub.
Added:
tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
(with props)
Modified:
tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
tomcat/trunk/webapps/docs/changelog.xml
tomcat/trunk/webapps/docs/config/valve.xml
Modified:
tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
URL:
http://svn.apache.org/viewvc/tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java?rev=1798533&r1=1798532&r2=1798533&view=diff
==============================================================================
---
tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
(original)
+++
tomcat/trunk/java/org/apache/catalina/valves/CrawlerSessionManagerValve.java
Mon Jun 12 21:04:53 2017
@@ -49,6 +49,10 @@ public class CrawlerSessionManagerValve
private String crawlerUserAgents = ".*[bB]ot.*|.*Yahoo!
Slurp.*|.*Feedfetcher-Google.*";
private Pattern uaPattern = null;
+
+ private String crawlerIps = null;
+ private Pattern ipPattern = null;
+
private int sessionInactiveInterval = 60;
@@ -86,6 +90,31 @@ public class CrawlerSessionManagerValve
/**
+ * Specify the regular expression (using {@link Pattern}) that will be used
+ * to identify crawlers based on their IP address. The default is no crawler
+ * IPs.
+ *
+ * @param crawlerIps The regular expression using {@link Pattern}
+ */
+ public void setCrawlerIps(String crawlerIps) {
+ this.crawlerIps = crawlerIps;
+ if (crawlerIps == null || crawlerIps.length() == 0) {
+ ipPattern = null;
+ } else {
+ ipPattern = Pattern.compile(crawlerIps);
+ }
+ }
+
+ /**
+ * @see #setCrawlerIps(String)
+ * @return The current regular expression being used to match IP addresses.
+ */
+ public String getCrawlerIps() {
+ return crawlerIps;
+ }
+
+
+ /**
* Specify the session timeout (in seconds) for a crawler's session. This is
* typically lower than that for a user session. The default is 60 seconds.
*
@@ -122,11 +151,11 @@ public class CrawlerSessionManagerValve
boolean isBot = false;
String sessionId = null;
- String clientIp = null;
+ String clientIp = request.getRemoteAddr();
if (log.isDebugEnabled()) {
- log.debug(request.hashCode() + ": ClientIp=" +
request.getRemoteAddr()
- + ", RequestedSessionId=" +
request.getRequestedSessionId());
+ log.debug(request.hashCode() + ": ClientIp=" + clientIp + ",
RequestedSessionId="
+ + request.getRequestedSessionId());
}
// If the incoming request has a valid session ID, no action is required
@@ -155,9 +184,16 @@ public class CrawlerSessionManagerValve
}
}
+ if (ipPattern != null && ipPattern.matcher(clientIp).matches()) {
+ isBot = true;
+
+ if (log.isDebugEnabled()) {
+ log.debug(request.hashCode() + ": Bot found. IP=" +
clientIp);
+ }
+ }
+
// If this is a bot, is the session ID known?
if (isBot) {
- clientIp = request.getRemoteAddr();
sessionId = clientIpSessionId.get(clientIp);
if (sessionId != null) {
request.setRequestedSessionId(sessionId);
Added:
tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
URL:
http://svn.apache.org/viewvc/tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java?rev=1798533&view=auto
==============================================================================
---
tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
(added)
+++
tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
Mon Jun 12 21:04:53 2017
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.catalina.valves;
+
+import java.util.Collections;
+
+import javax.servlet.http.HttpSession;
+
+import org.junit.Test;
+
+import org.apache.catalina.Valve;
+import org.apache.catalina.connector.Request;
+import org.apache.catalina.connector.Response;
+import org.easymock.EasyMock;
+import org.easymock.IExpectationSetters;
+
+public class TestCrawlerSessionManagerValve {
+
+ @Test
+ public void testCrawlerIpsPositive() throws Exception {
+ CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve();
+ valve.setCrawlerIps("216\\.58\\.206\\.174");
+ valve.setNext(EasyMock.createMock(Valve.class));
+ HttpSession session = createSessionExpectations(valve, true);
+ Request request = createRequestExpectations("216.58.206.174", session,
true);
+
+ EasyMock.replay(request, session);
+
+ valve.invoke(request, EasyMock.createMock(Response.class));
+
+ EasyMock.verify(request, session);
+ }
+
+ @Test
+ public void testCrawlerIpsNegative() throws Exception {
+ CrawlerSessionManagerValve valve = new CrawlerSessionManagerValve();
+ valve.setCrawlerIps("216\\.58\\.206\\.174");
+ valve.setNext(EasyMock.createMock(Valve.class));
+ HttpSession session = createSessionExpectations(valve, false);
+ Request request = createRequestExpectations("127.0.0.1", session,
false);
+
+ EasyMock.replay(request, session);
+
+ valve.invoke(request, EasyMock.createMock(Response.class));
+
+ EasyMock.verify(request, session);
+ }
+
+ private HttpSession createSessionExpectations(CrawlerSessionManagerValve
valve, boolean isBot) {
+ HttpSession session = EasyMock.createMock(HttpSession.class);
+ if (isBot) {
+ EasyMock.expect(session.getId()).andReturn("id").times(2);
+ session.setAttribute(valve.getClass().getName(), valve);
+ EasyMock.expectLastCall();
+ session.setMaxInactiveInterval(60);
+ EasyMock.expectLastCall();
+ }
+ return session;
+ }
+
+ private Request createRequestExpectations(String ip, HttpSession session,
boolean isBot) {
+ Request request = EasyMock.createMock(Request.class);
+ EasyMock.expect(request.getRemoteAddr()).andReturn(ip);
+ IExpectationSetters<HttpSession> setter =
EasyMock.expect(request.getSession(false))
+ .andReturn(null);
+ if (isBot) {
+ setter.andReturn(session);
+ }
+
EasyMock.expect(request.getHeaders("user-agent")).andReturn(Collections.emptyEnumeration());
+ return request;
+ }
+}
Propchange:
tomcat/trunk/test/org/apache/catalina/valves/TestCrawlerSessionManagerValve.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tomcat/trunk/webapps/docs/changelog.xml
URL:
http://svn.apache.org/viewvc/tomcat/trunk/webapps/docs/changelog.xml?rev=1798533&r1=1798532&r2=1798533&view=diff
==============================================================================
--- tomcat/trunk/webapps/docs/changelog.xml (original)
+++ tomcat/trunk/webapps/docs/changelog.xml Mon Jun 12 21:04:53 2017
@@ -100,6 +100,13 @@
<code>o.a.catalina.startup.Tomcat</code>. Patch provided by
peterhansson_se. (violetagg)
</fix>
+ <add>
+ A new configuration property <code>crawlerIps</code> is added to the
+ <code>o.a.catalina.valves.CrawlerSessionManagerValve</code>. Using this
+ property one can specify a regular expression that will be used to
+ identify crawlers based on their IP address. Based on a patch provided
+ by Tetradeus. (violetagg)
+ </add>
</changelog>
</subsection>
<subsection name="Coyote">
Modified: tomcat/trunk/webapps/docs/config/valve.xml
URL:
http://svn.apache.org/viewvc/tomcat/trunk/webapps/docs/config/valve.xml?rev=1798533&r1=1798532&r2=1798533&view=diff
==============================================================================
--- tomcat/trunk/webapps/docs/config/valve.xml (original)
+++ tomcat/trunk/webapps/docs/config/valve.xml Mon Jun 12 21:04:53 2017
@@ -1651,6 +1651,12 @@
</p>
</attribute>
+ <attribute name="crawlerIps" required="false">
+ <p>Regular expression (using <code>java.util.regex</code>) that the client
+ IP is matched against to determine if a request is from a web crawler.
+ By default, no such regular expression is set.</p>
+ </attribute>
+
<attribute name="crawlerUserAgents" required="false">
<p>Regular expression (using <code>java.util.regex</code>) that the user
agent HTTP request header is matched against to determine if a request
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]