OliverKeyes has uploaded a new change for review. https://gerrit.wikimedia.org/r/181939
Change subject: [WIP] start of a generalised class of UDFs for handling the webrequests table ...................................................................... [WIP] start of a generalised class of UDFs for handling the webrequests table This commit marks the start of the "Webrequest" class of UDFs, intended to be particularly helpful when dealing with wmf_raw.webrequests and derived tables. It contains an identifier for Wikimedia-specific web crawlers, and a function to extract values from x_analytics fields. It's very much a WIP and is (1) dependent on Otto's POM file modifications as part of the pageview UDFs and (2) not currently sourced by any UDFs, on account of everyone is on holiday and so nobody is around to explain to me how in the seven hells one goes about including multiple UDFs in the same class without maven throwing all of its toys out of the pram. Change-Id: Ia79d893a50b0807c6f11340a5758786147b460e1 --- A refinery-core/src/main/java/org/wikimedia/analytics/refinery/Webrequests.java A refinery-core/src/test/java/org/wikimedia/mediawiki/TestWebrequests.java A refinery-core/src/test/resources/isCrawler_test_data.csv A refinery-core/src/test/resources/x_analytics_test_data.csv 4 files changed, 128 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source refs/changes/39/181939/1 diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/Webrequests.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/Webrequests.java new file mode 100644 index 0000000..f9d4bc2 --- /dev/null +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/Webrequests.java @@ -0,0 +1,62 @@ +// Copyright 2014 Wikimedia Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package org.wikimedia.analytics.refinery.core; + +import java.util.regex.Pattern; + +// Static functions to work with Wikimedia webrequest data. +public class Webrequests { + + //Wikimedia-specific crawlers + private static final Pattern crawlerPattern = Pattern.compile( + "(wikiwix-bot|goo wikipedia|MediaWikiCrawler-Google).*" + ); + + //Identify Wikimedia-specific crawlers. Given + //a userAgent, return TRUE if the UA matches one of the known WM-specific + //crawlers, and FALSE if not. + public static boolean isCrawler(String userAgent) { + + return(crawlerPattern.matcher(userAgent).matches()); + } + + //Extract values from the x_analytics header. + public static String xExtract(String xAnalytics, String parameter) { + + String out = ""; + + //Match the parameter name. If nothing can be found, return an empty string. + int hasField = xAnalytics.indexOf(parameter); + if(hasField == -1){ + return out; + } + + //If something can be found, find the next semicolon, which x-analytics uses + //to delimit fields. + int hasDelimiter = xAnalytics.indexOf(";", hasField); + + //If there is a delimiter, we want the output to be substringed on both sides. + //If there isn't, one-sided substring. 
+ if(hasDelimiter == -1){ + out = xAnalytics.substring(hasField + parameter.length() + 1); + } else { + out = xAnalytics.substring(hasField + parameter.length() + 1, hasDelimiter); + } + + //Done + return out; + + } +} \ No newline at end of file diff --git a/refinery-core/src/test/java/org/wikimedia/mediawiki/TestWebrequests.java b/refinery-core/src/test/java/org/wikimedia/mediawiki/TestWebrequests.java new file mode 100644 index 0000000..4c951ae --- /dev/null +++ b/refinery-core/src/test/java/org/wikimedia/mediawiki/TestWebrequests.java @@ -0,0 +1,54 @@ +package org.wikimedia.analytics.refinery.core; + +import org.junit.Test; +import org.junit.runner.RunWith; +import static org.junit.Assert.assertEquals; +import junitparams.FileParameters; +import junitparams.JUnitParamsRunner; +import junitparams.mappers.CsvWithHeaderMapper; + +@RunWith(JUnitParamsRunner.class) +public class TestWebrequests { + + @Test + @FileParameters( + value = "src/test/resources/isCrawler_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + + public void testisCrawler( + String test_description, + boolean is_crawler, + String user_agent + ) { + assertEquals( + test_description, + is_crawler, + Webrequests.isCrawler( + user_agent + ) + ); + } + + @Test + @FileParameters( + value = "src/test/resources/x_analytics_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + public void testxAnalytics( + String test_description, + String expected_output, + String x_analytics, + String param + ) { + assertEquals( + test_description, + expected_output, + Webrequests.xExtract( + x_analytics, + param + ) + ); + } + +} \ No newline at end of file diff --git a/refinery-core/src/test/resources/isCrawler_test_data.csv b/refinery-core/src/test/resources/isCrawler_test_data.csv new file mode 100644 index 0000000..d0cb88a --- /dev/null +++ b/refinery-core/src/test/resources/isCrawler_test_data.csv @@ -0,0 +1,7 @@ +test_description, is_crawler,user_agent +Is crawler - Google, 
true,MediaWikiCrawler-Google/2.0 ([email protected]) +Is crawler – goo.ne.jp, true,goo wikipedia (http://help.goo.ne.jp/contact/) +Is crawler – wikiwix, true,wikiwix-bot-3.0 +Is Not Pageview - http_status != 200, false,Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko +Is Not Pageview - content_type does not match, false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53 +Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as a MIME type on certain class, false,Opera/9.80 (Android; Opera Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10 diff --git a/refinery-core/src/test/resources/x_analytics_test_data.csv b/refinery-core/src/test/resources/x_analytics_test_data.csv new file mode 100644 index 0000000..8e1af7a --- /dev/null +++ b/refinery-core/src/test/resources/x_analytics_test_data.csv @@ -0,0 +1,5 @@ +test_description,expected_output,x_analytics,param +Grab app Install ID,foobar,zero=621-30;appInstallID=foobar;proxy=Nokiaprod;php=zend,appInstallID +Grab MCC code,621-30,zero=621-30;proxy=Nokiaprod;php=zend,zero +Grab app install ID...when it's the last entry,foobar,zero=621-30;proxy=Nokiaprod;php=zend;appInstallID=foobar,appInstallID +Grab app install ID when it doesn't exist,,zero=621-30;proxy=Nokiaprod;php=zend,appInstallID -- To view, visit https://gerrit.wikimedia.org/r/181939 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ia79d893a50b0807c6f11340a5758786147b460e1 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: OliverKeyes <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
