Ottomata has submitted this change and it was merged.

Change subject: Legacy pageviews definition UDF
......................................................................


Legacy pageviews definition UDF

Adds a UDF for seeing if requests match the legacy
pageviews definition.

Change-Id: Ifd8c9d73da4724d2dccd4ec6d49cc0c2ebd2b63b
---
A refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java
M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java
A refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java
M refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
M refinery-core/src/test/resources/pageview_test_data.csv
A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java
A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
8 files changed, 322 insertions(+), 20 deletions(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved



diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java
new file mode 100644
index 0000000..b6edb2d
--- /dev/null
+++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java
@@ -0,0 +1,104 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.core;
+
+import java.util.regex.Pattern;
+import java.util.HashSet;
+import java.util.Arrays;
+
+/**
+ * Static functions to identify what requests constitute "pageviews",
+ * according to the definition at
+ * https://github.com/wikimedia/analytics-refinery/blob/master/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
+ * This is the "legacy" definition, in use by WebStatsCollector and the
+ * pageviews dumps at http://dumps.wikimedia.org/other/pagecounts-ez/
+ * from 2007 to early 2015, and is to be superseded by the "Pageview" class
+ * and isPageview method.
+ */
+public class LegacyPageview {
+
+    private static final Pattern acceptedUriHostsPattern = Pattern.compile(
+        "\\.(mediawiki|wik(ibooks|idata|imediafoundation|inews|ipedia|iquote|isource|tionary|iversity|ivoyage))\\.org$"
+    );
+
+    private static final Pattern acceptedMetaUriHostsPattern = Pattern.compile(
+        "(commons|incubator|meta|outreach|quality|species|strategy|usability)(\\.m)?\\.wikimedia\\.org$"
+    );
+
+    private static final Pattern acceptedUriPattern = Pattern.compile(
+        "^/wiki/"
+    );
+
+    private static final Pattern rejectedUriPattern = Pattern.compile(
+        "^/wiki/Special\\:CentralAutoLogin/"
+    );
+    private static final HashSet<String> rejectedUriPathPages = new HashSet<String>(Arrays.asList(
+        "/wiki/undefined",
+        "/wiki/Undefined"
+    ));
+
+    private static final HashSet<String> rejectedStatusCodes = new HashSet<String>(Arrays.asList(
+        "301",
+        "302",
+        "303"
+    ));
+
+    private static final Pattern rejectedIPPattern = Pattern.compile(
+        "^(10\\.20\\.0|10\\.64\\.0|10\\.128\\.0|10\\.64\\.32|208\\.80\\.15[2-5]|91\\.198\\.174)\\..+"
+    );
+
+    /**
+     * Given a webrequest ip, x_forwarded_for, uri_host, uri_path, and http_status,
+     * returns true if we consider this a 'legacy pageview', false otherwise.
+     * ip, uriHost and uriPath are handed to Pageview.patternIsFound and must not be null.
+     *
+     * @param   ip             Requesting IP address
+     * @param   xForwardedFor  the x_forwarded_for field; null is treated like "-" (no XFF)
+     * @param   uriHost        Hostname portion of the URI
+     * @param   uriPath        Path portion of the URI
+     * @param   httpStatus     HTTP request status code
+     * @return  true if the request matches the legacy pageview definition
+     */
+    public static boolean isLegacyPageview(
+        String ip,
+        String xForwardedFor,
+        String uriHost,
+        String uriPath,
+        String httpStatus
+    ) {
+        return (
+            //The status code is not 301, 302 or 303
+            !rejectedStatusCodes.contains(httpStatus)
+
+            //The host is a "recognised" project
+            &&  (
+                    Pageview.patternIsFound(acceptedUriHostsPattern, uriHost)
+                    || Pageview.patternIsFound(acceptedMetaUriHostsPattern, uriHost)
+                )
+            //The URI path starts with /wiki/, and
+            //isn't to undefined, Undefined or Special:CentralAutoLogin
+            && Pageview.patternIsFound(acceptedUriPattern, uriPath)
+            && !Pageview.patternIsFound(rejectedUriPattern, uriPath)
+            && !rejectedUriPathPages.contains(uriPath)
+
+            //The source IP isn't in a specified range (or, is, but an XFF
+            //value is present, i.e. the field is neither null nor "-")
+            &&  (
+                    !Pageview.patternIsFound(rejectedIPPattern, ip)
+                    || (xForwardedFor != null && !xForwardedFor.equals("-"))
+                )
+        );
+    }
+}
\ No newline at end of file
diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java
index 81e96f5..e52777d 100644
--- a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java
+++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java
@@ -85,13 +85,14 @@
     /**
      * Convenience method for Using Matcher.find() to check if
      * the given regex Pattern matches the target String.
+     * Also called in the LegacyPageview class.
      *
      * @param Pattern pattern
      * @param String  target
      *
      * @return boolean
      */
-    private static boolean patternIsFound(Pattern pattern, String target) {
+    public static boolean patternIsFound(Pattern pattern, String target) {
         return pattern.matcher(target).find();
     }
 
diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java
new file mode 100644
index 0000000..4027cf1
--- /dev/null
+++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java
@@ -0,0 +1,59 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.core;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import static org.junit.Assert.assertEquals;
+
+import junitparams.FileParameters;
+import junitparams.JUnitParamsRunner;
+import junitparams.mappers.CsvWithHeaderMapper;
+
+@RunWith(JUnitParamsRunner.class)
+public class TestLegacyPageview {
+
+    /**
+     * Feeds the ip, XFF, host, path and status columns of a row of
+     * pageview_test_data.csv to LegacyPageview.isLegacyPageview and expects
+     * the value in that row's is_legacy_pageview column.
+     */
+    @Test
+    @FileParameters(
+        mapper = CsvWithHeaderMapper.class,
+        value = "src/test/resources/pageview_test_data.csv"
+    )
+    public void testIsLegacyPageview(
+        String test_description,
+        boolean is_pageview,
+        boolean is_legacy_pageview,
+        String ip_address,
+        String x_forwarded_for,
+        String uri_host,
+        String uri_path,
+        String uri_query,
+        String http_status,
+        String content_type,
+        String user_agent
+    ) {
+        // uri_query, content_type and user_agent are bound but not used here.
+        boolean actual = LegacyPageview.isLegacyPageview(
+            ip_address, x_forwarded_for, uri_host, uri_path, http_status
+        );
+
+        // test_description doubles as the assertion failure message.
+        assertEquals(test_description, is_legacy_pageview, actual);
+    }
+}
\ No newline at end of file
diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
index f8e2781..8fef768 100644
--- a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
+++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
@@ -34,6 +34,9 @@
     public void testIsPageview(
         String test_description,
         boolean is_pageview,
+        boolean is_legacy_pageview,
+        String ip_address,
+        String x_forwarded_for,
         String uri_host,
         String uri_path,
         String uri_query,
diff --git a/refinery-core/src/test/resources/pageview_test_data.csv b/refinery-core/src/test/resources/pageview_test_data.csv
index 3637293..ff7ae7f 100644
--- a/refinery-core/src/test/resources/pageview_test_data.csv
+++ b/refinery-core/src/test/resources/pageview_test_data.csv
@@ -1,19 +1,19 @@
-test_description, is_pageview, uri_host, uri_path, uri_query, http_status, content_type, user_agent
-Is Pageview - Desktop, true, en.wikipedia.org, /wiki/Horseshoe_crab,-,200,text/html, turnip
-Is Pageview - App, true, en.wikipedia.org, /w/api.php, ?action=mobileview&sections=0,200, application/json, WikipediaApp/1.2.3
-Is Pageview – Mobile Web, true,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga
-Is Pageview – Desktop - Serbian sr-ec, true,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute
-Is Pageview – Desktop - Serbian sr-el, true,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute
-Is Pageview – Desktop - Chinese zh-cn, true,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-hans, true,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-hant, true,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-hk, true,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-mo, true,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-my, true,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-sg, true,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Pageview – Desktop - Chinese zh-tw, true,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test plan
-Is Not Pageview - http_status != 200, false, en.wikipedia.org, /wiki/Noppperrrrs,-,400,text/html ,turnip
-Is Not Pageview - content_type does not match, false, en.wikipedia.org, /wiki/Noppperrrrs,-,200, image/png, turnip
-Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as a MIME type on certain classes of error., false, en.wikipedia.org, /w/api.php,-,200, text/html, turnip
-Is Not Pageview – App request for non-page content, false, en.wikipedia.org, /w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1    ,200, application/json, WikipediaApp/1.2.3
-Is Not Pageview – Non-App request for page content, false, en.wikipedia.org, /w/api.php,?action=mobileview&sections=0,200, application/json, TributeApp/1.2.3
+test_description, is_pageview,is_legacy_pageview,ip_address,x_forwarded_for, uri_host, uri_path, uri_query, http_status, content_type, user_agent
+Is Pageview - Desktop, true,true,174.62.175.82,-,en.wikipedia.org, /wiki/Horseshoe_crab,-,200,text/html, turnip
+Is Pageview - App, true,false,174.62.175.83,-,en.wikipedia.org, /w/api.php, ?action=mobileview&sections=0,200, application/json, WikipediaApp/1.2.3
+Is Pageview – Mobile Web, true,true,174.62.175.84,-,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga
+Is Pageview – Desktop - Serbian sr-ec, true,false,174.62.175.85,-,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute
+Is Pageview – Desktop - Serbian sr-el, true,false,174.62.175.86,-,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute
+Is Pageview – Desktop - Chinese zh-cn, true,false,174.62.175.87,-,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test plan
+Is Pageview – Desktop - Chinese zh-hans, true,false,174.62.175.88,-,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test plan
+Is Pageview – Desktop - Chinese zh-hant, true,false,174.62.175.89,-,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test plan
+Is Pageview – Desktop - Chinese zh-hk, true,false,174.62.175.90,-,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test plan
+Is Pageview – Desktop - Chinese zh-mo, true,false,174.62.175.91,-,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test plan
+Is Pageview – Desktop - Chinese zh-my, true,false,174.62.175.92,-,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test plan
+Is Pageview – Desktop - Chinese zh-sg, true,false,174.62.175.93,-,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test plan
+Is Pageview – Desktop - Chinese zh-tw, true,false,174.62.175.94,-,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test plan
+Is Not Pageview - http_status != 200, false,true,174.62.175.95,-, en.wikipedia.org, /wiki/Noppperrrrs,-,400,text/html ,turnip
+Is Not Pageview - content_type does not match, false,true,174.62.175.96,-, en.wikipedia.org, /wiki/Noppperrrrs,-,200, image/png, turnip
+Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as a MIME type on certain classes of error., false, false,174.62.175.97,-, en.wikipedia.org, /w/api.php,-,200, text/html, turnip
+Is Not Pageview – App request for non-page content, false, false,174.62.175.98,-, en.wikipedia.org, /w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1    ,200, application/json, WikipediaApp/1.2.3
+Is Not Pageview – Non-App request for page content, false, false,174.62.175.99,-, en.wikipedia.org, /w/api.php,?action=mobileview&sections=0,200, application/json, TributeApp/1.2.3
diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java
new file mode 100644
index 0000000..733cb20
--- /dev/null
+++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java
@@ -0,0 +1,70 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.wikimedia.analytics.refinery.core.LegacyPageview;
+
+
+/**
+ * Hive UDF wrapper around LegacyPageview.isLegacyPageview, which flags
+ * requests that count as "pageviews" under the definition at
+ * https://github.com/wikimedia/analytics-refinery/blob/master/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
+ * That is the "legacy" definition, in use by WebStatsCollector and the
+ * pageviews dumps at http://dumps.wikimedia.org/other/pagecounts-ez/
+ * from 2007 to early 2015; it is to be superseded by the "Pageview" class
+ * and isPageview method.
+ * <p>
+ * Hive Usage:
+ *   ADD JAR /path/to/refinery-hive.jar;
+ *   CREATE TEMPORARY FUNCTION is_legacy_pageview AS 'org.wikimedia.analytics.refinery.hive.IsLegacyPageviewUDF';
+ *   SELECT
+ *     LOWER(uri_host) as uri_host,
+ *     count(*) as cnt
+ *   FROM
+ *     wmf_raw.webrequest
+ *   WHERE
+ *     webrequest_source = 'mobile'
+ *     AND year=2014
+ *     AND month=12
+ *     AND day=7
+ *     AND hour=12
+ *     AND is_legacy_pageview(ip, x_forwarded_for, uri_host, uri_path, http_status)
+ *   GROUP BY
+ *     LOWER(uri_host)
+ *   ORDER BY cnt desc
+ *   LIMIT 10;
+ */
+public class IsLegacyPageviewUDF extends UDF {
+    /**
+     * Passes the five webrequest fields straight to LegacyPageview.isLegacyPageview.
+     *
+     * @return the boolean that LegacyPageview.isLegacyPageview returns for them
+     */
+    public boolean evaluate(
+        String ip,
+        String xForwardedFor,
+        String uriHost,
+        String uriPath,
+        String httpStatus
+    ) {
+        boolean isLegacy = LegacyPageview.isLegacyPageview(
+            ip, xForwardedFor, uriHost, uriPath, httpStatus
+        );
+        return isLegacy;
+    }
+}
\ No newline at end of file
diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
new file mode 100644
index 0000000..1a11387
--- /dev/null
+++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
@@ -0,0 +1,62 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.wikimedia.analytics.refinery.hive;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import static org.junit.Assert.assertEquals;
+
+import junitparams.FileParameters;
+import junitparams.JUnitParamsRunner;
+import junitparams.mappers.CsvWithHeaderMapper;
+
+@RunWith(JUnitParamsRunner.class)
+public class TestIsLegacyPageviewUDF {
+
+    /**
+     * Runs IsLegacyPageviewUDF.evaluate on the fields of one row of the
+     * refinery-core pageview_test_data.csv fixture and compares the result
+     * with that row's is_legacy_pageview column.
+     */
+    @Test
+    @FileParameters(
+        value = "../refinery-core/src/test/resources/pageview_test_data.csv",
+        mapper = CsvWithHeaderMapper.class
+    )
+    public void testIsLegacyPageview(
+        String test_description,
+        boolean is_pageview,
+        boolean is_legacy_pageview,
+        String ip_address,
+        String x_forwarded_for,
+        String uri_host,
+        String uri_path,
+        String uri_query,
+        String http_status,
+        String content_type,
+        String user_agent
+    ) {
+        // is_pageview, uri_query, content_type and user_agent are columns of
+        // the CSV that this test binds but does not pass to the legacy UDF.
+        IsLegacyPageviewUDF udf = new IsLegacyPageviewUDF();
+
+        assertEquals(
+            test_description,
+            is_legacy_pageview,
+            udf.evaluate(ip_address, x_forwarded_for, uri_host, uri_path, http_status)
+        );
+    }
+}
\ No newline at end of file
diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
index e192056..00a42fa 100644
--- a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
+++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
@@ -35,6 +35,9 @@
     public void testIsPageview(
         String test_description,
         boolean is_pageview,
+        boolean is_legacy_pageview,
+        String ip_address,
+        String x_forwarded_for,
         String uri_host,
         String uri_path,
         String uri_query,

-- 
To view, visit https://gerrit.wikimedia.org/r/182971
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ifd8c9d73da4724d2dccd4ec6d49cc0c2ebd2b63b
Gerrit-PatchSet: 6
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: OliverKeyes <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to