OliverKeyes has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/181939

Change subject: [WIP] start of a generalised class of UDFs for handling the 
webrequests table
......................................................................

[WIP] start of a generalised class of UDFs for handling the webrequests table

This commit marks the start of the "Webrequest" class of UDFs, intended to
be particularly helpful when dealing with wmf_raw.webrequests and derived
tables. It contains an identifier for Wikimedia-specific web crawlers,
and a function to extract values from x_analytics fields.

It's very much a WIP and is (1) dependent on Otto's POM file modifications
as part of the pageview UDFs and (2) not currently sourced by any
UDFs, because everyone is on holiday and so nobody is
around to explain to me how in the seven hells one goes about
including multiple UDFs in the same class without Maven throwing
all of its toys out of the pram.

Change-Id: Ia79d893a50b0807c6f11340a5758786147b460e1
---
A refinery-core/src/main/java/org/wikimedia/analytics/refinery/Webrequests.java
A refinery-core/src/test/java/org/wikimedia/mediawiki/TestWebrequests.java
A refinery-core/src/test/resources/isCrawler_test_data.csv
A refinery-core/src/test/resources/x_analytics_test_data.csv
4 files changed, 128 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source 
refs/changes/39/181939/1

diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/Webrequests.java 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/Webrequests.java
new file mode 100644
index 0000000..f9d4bc2
--- /dev/null
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/Webrequests.java
@@ -0,0 +1,62 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.core;
+
+import java.util.regex.Pattern;
+
+/**
+ * Static functions to work with Wikimedia webrequest data.
+ */
+public class Webrequests {
+
+    /**
+     * User agents of Wikimedia-specific crawlers. It is applied with
+     * {@code matches()}, so a user agent has to begin with one of the
+     * listed names; the trailing {@code .*} accepts the rest of the line.
+     */
+    private static final Pattern crawlerPattern = Pattern.compile(
+               "(wikiwix-bot|goo wikipedia|MediaWikiCrawler-Google).*"
+    );
+
+    /**
+     * Identify Wikimedia-specific crawlers.
+     *
+     * @param userAgent the user agent of a request; may be null
+     * @return true if the user agent matches one of the known
+     *         Wikimedia-specific crawlers, false if it does not or is null
+     */
+    public static boolean isCrawler(String userAgent) {
+
+        // A missing user agent cannot be a known crawler; without this
+        // guard the matcher would throw a NullPointerException.
+        if (userAgent == null) {
+            return false;
+        }
+
+        return crawlerPattern.matcher(userAgent).matches();
+    }
+
+    /**
+     * Extract the value of one key from an x_analytics header, which is
+     * a list of key=value fields delimited by semicolons.
+     *
+     * The key has to match a whole field name: asking for "zero" does not
+     * match a "subzero=..." field, nor a field whose value is "zero".
+     * If the key occurs more than once, the first value wins.
+     *
+     * @param xAnalytics the x_analytics header; may be null
+     * @param parameter  the name of the key to look up; may be null
+     * @return the value stored under the key, or an empty string if
+     *         either argument is null or the key has no key=value field
+     */
+    public static String xExtract(String xAnalytics, String parameter) {
+
+        if (xAnalytics == null || parameter == null) {
+            return "";
+        }
+
+        // x-analytics uses semicolons to delimit fields.
+        for (String field : xAnalytics.split(";")) {
+
+            // Fields without '=' are valueless flags: nothing to extract.
+            int separator = field.indexOf('=');
+            if (separator == -1) {
+                continue;
+            }
+
+            // Compare the complete key (ignoring padding around it) rather
+            // than searching the header for the name as a substring.
+            String key = field.substring(0, separator).trim();
+            if (key.equals(parameter)) {
+                return field.substring(separator + 1);
+            }
+        }
+
+        // The key is not present.
+        return "";
+    }
+}
\ No newline at end of file
diff --git 
a/refinery-core/src/test/java/org/wikimedia/mediawiki/TestWebrequests.java 
b/refinery-core/src/test/java/org/wikimedia/mediawiki/TestWebrequests.java
new file mode 100644
index 0000000..4c951ae
--- /dev/null
+++ b/refinery-core/src/test/java/org/wikimedia/mediawiki/TestWebrequests.java
@@ -0,0 +1,54 @@
+package org.wikimedia.analytics.refinery.core;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import static org.junit.Assert.assertEquals;
+import junitparams.FileParameters;
+import junitparams.JUnitParamsRunner;
+import junitparams.mappers.CsvWithHeaderMapper;
+
+@RunWith(JUnitParamsRunner.class)
+public class TestWebrequests {
+
+    // Parameters are read from isCrawler_test_data.csv, whose columns are
+    // the test description, the expected verdict and the user agent.
+    @Test
+    @FileParameters(
+        value = "src/test/resources/isCrawler_test_data.csv",
+        mapper = CsvWithHeaderMapper.class
+    )
+    public void testisCrawler(
+        String test_description,
+        boolean is_crawler,
+        String user_agent
+    ) {
+        boolean verdict = Webrequests.isCrawler(user_agent);
+        assertEquals(test_description, is_crawler, verdict);
+    }
+
+    // Parameters are read from x_analytics_test_data.csv, whose columns
+    // are the test description, the expected value, the x_analytics
+    // header and the name of the key to extract.
+    @Test
+    @FileParameters(
+        value = "src/test/resources/x_analytics_test_data.csv",
+        mapper = CsvWithHeaderMapper.class
+    )
+    public void testxAnalytics(
+        String test_description,
+        String expected_output,
+        String x_analytics,
+        String param
+    ) {
+        String extracted = Webrequests.xExtract(x_analytics, param);
+        assertEquals(test_description, expected_output, extracted);
+    }
+
+}
\ No newline at end of file
diff --git a/refinery-core/src/test/resources/isCrawler_test_data.csv 
b/refinery-core/src/test/resources/isCrawler_test_data.csv
new file mode 100644
index 0000000..d0cb88a
--- /dev/null
+++ b/refinery-core/src/test/resources/isCrawler_test_data.csv
@@ -0,0 +1,7 @@
+test_description, is_crawler,user_agent
+Is crawler - Google, true,MediaWikiCrawler-Google/2.0 
([email protected])
+Is crawler – goo.ne.jp, true,goo wikipedia (http://help.goo.ne.jp/contact/)
+Is crawler – wikiwix, true,wikiwix-bot-3.0
+Is not crawler - Internet Explorer 11, false,Mozilla/5.0 (Windows NT 6.1; 
Trident/7.0; rv:11.0) like Gecko
+Is not crawler - Mobile Safari on iPhone, false,Mozilla/5.0 (iPhone; CPU 
iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) 
Version/7.0 Mobile/11D257 Safari/9537.53
+Is not crawler - Opera Mini on Android, false,Opera/9.80 (Android; Opera 
Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10
diff --git a/refinery-core/src/test/resources/x_analytics_test_data.csv 
b/refinery-core/src/test/resources/x_analytics_test_data.csv
new file mode 100644
index 0000000..8e1af7a
--- /dev/null
+++ b/refinery-core/src/test/resources/x_analytics_test_data.csv
@@ -0,0 +1,5 @@
+test_description,expected_output,x_analytics,param
+Grab app Install 
ID,foobar,zero=621-30;appInstallID=foobar;proxy=Nokiaprod;php=zend,appInstallID
+Grab MCC code,621-30,zero=621-30;proxy=Nokiaprod;php=zend,zero
+Grab app install ID...when it's the last 
entry,foobar,zero=621-30;proxy=Nokiaprod;php=zend;appInstallID=foobar,appInstallID
+Grab app install ID when it doesn't 
exist,,zero=621-30;proxy=Nokiaprod;php=zend,appInstallID

-- 
To view, visit https://gerrit.wikimedia.org/r/181939
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia79d893a50b0807c6f11340a5758786147b460e1
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to