Ottomata has submitted this change and it was merged.

Change subject: Document some filter idiosyncrasies through tests
......................................................................


Document some filter idiosyncrasies through tests

Change-Id: I7242d292a82a7ee9de033bab7caf07844075888e
---
M tests/test.sh
1 file changed, 58 insertions(+), 1 deletion(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved
  jenkins-bot: Verified



diff --git a/tests/test.sh b/tests/test.sh
index 258c95e..9b1e225 100755
--- a/tests/test.sh
+++ b/tests/test.sh
@@ -31,11 +31,14 @@
 set_FILTERED_OUTPUT() {
     local URL="$1"
 
+    local LOCAL_LOG_LINE_IP="${LOG_LINE_IP:-IP}"
+    unset LOG_LINE_IP
+
     local LOG_LINE="CACHE_MACHINE"
     LOG_LINE="$LOG_LINE        SEQUENCE_NUMBER"
     LOG_LINE="$LOG_LINE        TIMESTAMP"
     LOG_LINE="$LOG_LINE        DURATION"
-    LOG_LINE="$LOG_LINE        IP"
+    LOG_LINE="$LOG_LINE        $LOCAL_LOG_LINE_IP"
     LOG_LINE="$LOG_LINE        STATUS_CODE"
     LOG_LINE="$LOG_LINE        SIZE"
     LOG_LINE="$LOG_LINE        REQUEST_METHOD"
@@ -136,6 +139,60 @@
 assert_counted     'http://en.wikipedia.org/wiki/Robinson_Can\xC3\xB3' 'en' 
'Robinson_Can\xC3\xB3'
 assert_counted     'http://en.wikipedia.org/wiki/Robinson_Canó' 'en' 
'Robinson_Canó'
 
+
+
+# Idiosyncrasies ---------------------------------------------------------------
+# Here, we document some idiosyncrasies of webstatscollector.
+# We might wish to change/fix them, but that would require all
+# consumers of those files to adapt their software. And it would make
+# comparison between files harder. So let's at least call them out for
+# now.
+
+# Idiosyncrasy #1 Pageviews to mobile enwiki are only counted for
+# .mw, not for plain enwiki. And this counting is not "per page", but
+# "per language".
+
+assert_counted 
'http://en.m.wikipedia.org/wiki/Idiosyncrasy/Page_on_MobileEnwikiSite_only_counted_titleless_for_en.mw'
 'en.mw' 'en'
+
+# Idiosyncrasy #2 While "en.mw" might suggest "English mobile
+# Wikipedia", it actually means "English mobile sites". So it
+# includes, for example, hits to enwikivoyage.
+assert_counted 
'http://en.m.wikivoyage.org/wiki/Idiosyncrasy/Page_on_MobileEnwikivoyageSite_only_counted_titleless_for_en.mw'
 'en.mw' 'en'
+
+# Idiosyncrasy #3 Languages in domain names are considered
+# case-sensitive.
+assert_counted 
'http://En.wikipedia.org/wiki/Idiosyncrasy/Case_sensitive_languages' 'En' 
'Idiosyncrasy/Case_sensitive_languages'
+
+# Idiosyncrasy #4 Some internal IPv4 IPs are not counted at
+# all. This gets in the way for SSL requests, and makes it
+# necessary that the logs from the SSL terminators get fed into the
+# filter process too.
+# First some internal IP addresses covered by 'filter'.
+LOG_LINE_IP="208.80.152.1" ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.152.x'
+LOG_LINE_IP="208.80.153.2" ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.153.x'
+LOG_LINE_IP="208.80.154.3" ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.154.x'
+LOG_LINE_IP="208.80.155.3" ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/208.80.155.x'
+LOG_LINE_IP="91.198.174.5" ; assert_not_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/91.198.141.x'
+
+# Then some internal IP addresses not covered by 'filter'.
+LOG_LINE_IP="198.35.26.6" ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/198.35.26.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/198.35.26.x_not_covered'
+LOG_LINE_IP="198.35.27.7" ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/198.35.27.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/198.35.27.x_not_covered'
+LOG_LINE_IP="185.15.56.7" ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.56.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.56.x_not_covered'
+LOG_LINE_IP="185.15.57.8" ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.57.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.57.x_not_covered'
+LOG_LINE_IP="185.15.58.9" ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.58.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.58.x_not_covered'
+LOG_LINE_IP="185.15.59.10" ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/185.15.59.x_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/185.15.59.x_not_covered'
+LOG_LINE_IP="2620:0:860::11" ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/2620:0:860::_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/2620:0:860::_not_covered'
+LOG_LINE_IP="2a02:ec80::12" ; assert_counted 
'http://en.wikipedia.org/wiki/Idiosyncrasy/Do_not_count_internal/2a02:ec80::_not_covered'
 'en' 'Idiosyncrasy/Do_not_count_internal/2a02:ec80::_not_covered'
+
+# Idiosyncrasy #5 Pages are only counted through '/wiki' URLs.
+# 'index.php' is not counted.
+assert_not_counted 'http://en.wikipedia.org/w/index.php?title=Main_Page'
+
+# Idiosyncrasy #6 Zero does not get counted at all
+assert_not_counted 'http://en.zero.wikipedia.org/wiki/Main_Page'
+
+
+
 # -- printing statistics -------------------------------------------------------
 
 TESTS_FAILED=$((TESTS-TESTS_GOOD))

-- 
To view, visit https://gerrit.wikimedia.org/r/156050
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I7242d292a82a7ee9de033bab7caf07844075888e
Gerrit-PatchSet: 2
Gerrit-Project: analytics/webstatscollector
Gerrit-Branch: master
Gerrit-Owner: QChris <christ...@quelltextlich.at>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to