OliverKeyes has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/182971

Change subject: [WIP] Legacy pageviews definition UDF
......................................................................

[WIP] Legacy pageviews definition UDF

Adds a UDF for seeing if requests match the legacy
pageviews definition. This is very much a WIP; don't
even bother CRing it at this stage, I just need to
commit SOMEthing so that Nuria can try and debug
a weird error.

Change-Id: Ifd8c9d73da4724d2dccd4ec6d49cc0c2ebd2b63b
---
A 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java
A 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
M refinery-core/src/test/resources/pageview_test_data.csv
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
5 files changed, 177 insertions(+), 19 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source 
refs/changes/71/182971/1

diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java
new file mode 100644
index 0000000..31d0640
--- /dev/null
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java
@@ -0,0 +1,93 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.core;
+
+import java.util.regex.Pattern;
+import java.util.HashSet;
+import java.util.Arrays;
+
+/**
+ * Static functions to work with Wikimedia webrequest data.
+ * Created based on the HQL implementation at
+ * https://github.com/wikimedia/analytics-refinery/blob/master/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
+ */
+public class LegacyPageview {
+
+    private static final Pattern acceptedUriHostsPattern = Pattern.compile(
+        "\\.(mediawiki|wik(ibooks|idata|imediafoundation|inews|ipedia|iquote|isource|tionary|iversity|ivoyage))\\.org$"
+    );
+
+    private static final Pattern acceptedMetaUriHostsPattern = Pattern.compile(
+        "(commons|incubator|meta|outreach|quality|species|strategy|usability)(\\.m)?\\.wikimedia\\.org$"
+    );
+
+    private static final String acceptedUriPaths = "/wiki/";
+
+    private static final String rejectedUriPaths = "/wiki/Special:CentralAutoLogin/";
+
+    private static final HashSet<String> rejectedUriPathPages = new HashSet<String>(Arrays.asList(
+        "/wiki/undefined",
+        "/wiki/Undefined"
+    ));
+
+    private static final HashSet<String> rejectedStatusCodes = new HashSet<String>(Arrays.asList(
+        "301",
+        "302",
+        "303"
+    ));
+
+    private static final Pattern rejectedIPPattern = Pattern.compile(
+        "^(10\\.20\\.0|10\\.64\\.0|10\\.128\\.0|10\\.64\\.32|208\\.80\\.15[2-5]|91\\.198\\.174)\\..+"
+    );
+
+    /**
+     * Given a webrequest ip, x_forwarded_for, uri_host, uri_path, and http_status, returns
+     * true if we consider this a legacy 'pageview', false otherwise.
+     * Null-safe: a null uriHost or uriPath never matches, a null ipAddress is not rejected, a null xForwarded counts as empty.
+     *
+     * See: https://meta.wikimedia.org/wiki/Research:Page_view/Generalised_filters
+     *      for information on how to classify a pageview.
+     * @param   ipAddress   Requesting IP address
+     * @param   xForwarded  the x_forwarded_for field ("-" when empty)
+     * @param   uriHost     Hostname portion of the URI
+     * @param   uriPath     Path portion of the URI
+     * @param   httpStatus  HTTP request status code
+     * @return  true if the request matches the legacy pageview definition
+     */
+    public static boolean isLegacyPageview(String ipAddress, String xForwarded, String uriHost, String uriPath, String httpStatus) {
+        return (
+            //Requests without a host or a path can never match
+            uriHost != null && uriPath != null
+            //The status code is not 301, 302 or 303
+            && !rejectedStatusCodes.contains(httpStatus)
+            //The host is a "recognised" project
+            &&  (
+                    acceptedUriHostsPattern.matcher(uriHost).find()
+                    || acceptedMetaUriHostsPattern.matcher(uriHost).find()
+                )
+            //The URI path starts with /wiki/ and isn't undefined, Undefined or under
+            //Special:CentralAutoLogin/ (startsWith copes with paths shorter than the prefix)
+            && uriPath.startsWith(acceptedUriPaths)
+            && !uriPath.startsWith(rejectedUriPaths)
+            && !rejectedUriPathPages.contains(uriPath)
+            //The source IP isn't in a specified range (or is, but the XFF field is not empty)
+            &&  (
+                    ipAddress == null
+                    || !rejectedIPPattern.matcher(ipAddress).find()
+                    || (xForwarded != null && !xForwarded.equals("-"))
+                )
+        );
+    }
+}
\ No newline at end of file
diff --git 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java
 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java
new file mode 100644
index 0000000..4027cf1
--- /dev/null
+++ 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java
@@ -0,0 +1,59 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.core;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import static org.junit.Assert.assertEquals;
+
+import junitparams.FileParameters;
+import junitparams.JUnitParamsRunner;
+import junitparams.mappers.CsvWithHeaderMapper;
+
+@RunWith(JUnitParamsRunner.class)
+public class TestLegacyPageview {
+
+    /**
+     * Checks LegacyPageview.isLegacyPageview against each row of the shared CSV fixture.
+     * Parameters mirror the CSV columns in order; columns the legacy check does not take are unused.
+     */
+    @Test
+    @FileParameters(
+        value = "src/test/resources/pageview_test_data.csv",
+        mapper = CsvWithHeaderMapper.class
+    )
+    public void testIsLegacyPageview(
+        String test_description,
+        boolean is_pageview,
+        boolean is_legacy_pageview,
+        String ip_address,
+        String x_forwarded_for,
+        String uri_host,
+        String uri_path,
+        String uri_query,
+        String http_status,
+        String content_type,
+        String user_agent
+    ) {
+        boolean actual = LegacyPageview.isLegacyPageview(
+            ip_address,
+            x_forwarded_for,
+            uri_host,
+            uri_path,
+            http_status
+        );
+        assertEquals(test_description, is_legacy_pageview, actual);
+    }
+}
\ No newline at end of file
diff --git 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
index f8e2781..8fef768 100644
--- 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
+++ 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
@@ -34,6 +34,9 @@
     public void testIsPageview(
         String test_description,
         boolean is_pageview,
+        boolean is_legacy_pageview,
+        String ip_address,
+        String x_forwarded_for,
         String uri_host,
         String uri_path,
         String uri_query,
diff --git a/refinery-core/src/test/resources/pageview_test_data.csv 
b/refinery-core/src/test/resources/pageview_test_data.csv
index 3637293..ff7ae7f 100644
--- a/refinery-core/src/test/resources/pageview_test_data.csv
+++ b/refinery-core/src/test/resources/pageview_test_data.csv
@@ -1,19 +1,19 @@
-test_description, is_pageview, uri_host, uri_path, uri_query, http_status, 
content_type, user_agent
-Is Pageview - Desktop, true, en.wikipedia.org, 
/wiki/Horseshoe_crab,-,200,text/html, turnip
-Is Pageview - App, true, en.wikipedia.org, /w/api.php, 
?action=mobileview&sections=0,200, application/json, WikipediaApp/1.2.3
-Is Pageview – Mobile Web, 
true,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga
-Is Pageview – Desktop - Serbian sr-ec, 
true,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
 salute
-Is Pageview – Desktop - Serbian sr-el, 
true,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
 salute
-Is Pageview – Desktop - Chinese zh-cn, 
true,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-hans, 
true,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-hant, 
true,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-hk, 
true,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-mo, 
true,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-my, 
true,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-sg, 
true,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-tw, 
true,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Not Pageview - http_status != 200, false, en.wikipedia.org, 
/wiki/Noppperrrrs,-,400,text/html ,turnip
-Is Not Pageview - content_type does not match, false, en.wikipedia.org, 
/wiki/Noppperrrrs,-,200, image/png, turnip
-Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as 
a MIME type on certain classes of error., false, en.wikipedia.org, 
/w/api.php,-,200, text/html, turnip
-Is Not Pageview – App request for non-page content, false, en.wikipedia.org, 
/w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1
    ,200, application/json, WikipediaApp/1.2.3
-Is Not Pageview – Non-App request for page content, false, en.wikipedia.org, 
/w/api.php,?action=mobileview&sections=0,200, application/json, TributeApp/1.2.3
+test_description, is_pageview,is_legacy_pageview,ip_address,x_forwarded_for, 
uri_host, uri_path, uri_query, http_status, content_type, user_agent
+Is Pageview - Desktop, true,true,174.62.175.82,-,en.wikipedia.org, 
/wiki/Horseshoe_crab,-,200,text/html, turnip
+Is Pageview - App, true,false,174.62.175.83,-,en.wikipedia.org, /w/api.php, 
?action=mobileview&sections=0,200, application/json, WikipediaApp/1.2.3
+Is Pageview – Mobile Web, 
true,true,174.62.175.84,-,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga
+Is Pageview – Desktop - Serbian sr-ec, 
true,false,174.62.175.85,-,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
 salute
+Is Pageview – Desktop - Serbian sr-el, 
true,false,174.62.175.86,-,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
 salute
+Is Pageview – Desktop - Chinese zh-cn, 
true,false,174.62.175.87,-,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese zh-hans, 
true,false,174.62.175.88,-,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese zh-hant, 
true,false,174.62.175.89,-,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese zh-hk, 
true,false,174.62.175.90,-,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese zh-mo, 
true,false,174.62.175.91,-,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese zh-my, 
true,false,174.62.175.92,-,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese zh-sg, 
true,false,174.62.175.93,-,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese zh-tw, 
true,false,174.62.175.94,-,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Not Pageview - http_status != 200, false,true,174.62.175.95,-, 
en.wikipedia.org, /wiki/Noppperrrrs,-,400,text/html ,turnip
+Is Not Pageview - content_type does not match, false,true,174.62.175.96,-, 
en.wikipedia.org, /wiki/Noppperrrrs,-,200, image/png, turnip
+Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as 
a MIME type on certain classes of error., false, false,174.62.175.97,-, 
en.wikipedia.org, /w/api.php,-,200, text/html, turnip
+Is Not Pageview – App request for non-page content, false, 
false,174.62.175.98,-, en.wikipedia.org, 
/w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1
    ,200, application/json, WikipediaApp/1.2.3
+Is Not Pageview – Non-App request for page content, false, 
false,174.62.175.99,-, en.wikipedia.org, 
/w/api.php,?action=mobileview&sections=0,200, application/json, TributeApp/1.2.3
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
index e192056..00a42fa 100644
--- 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
@@ -35,6 +35,9 @@
     public void testIsPageview(
         String test_description,
         boolean is_pageview,
+        boolean is_legacy_pageview,
+        String ip_address,
+        String x_forwarded_for,
         String uri_host,
         String uri_path,
         String uri_query,

-- 
To view, visit https://gerrit.wikimedia.org/r/182971
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ifd8c9d73da4724d2dccd4ec6d49cc0c2ebd2b63b
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to