Milimetric has submitted this change and it was merged.
Change subject: trying to handle unicode names
......................................................................
trying to handle unicode names
Change-Id: Icf936da3ebcf37c88cb937c1175ca37f7ab0cc65
---
M tests/test_controllers/test_cohorts.py
M wikimetrics/controllers/cohorts.py
2 files changed, 20 insertions(+), 9 deletions(-)
Approvals:
Milimetric: Verified; Looks good to me, approved
diff --git a/tests/test_controllers/test_cohorts.py b/tests/test_controllers/test_cohorts.py
index c3fdc93..4d4d60f 100644
--- a/tests/test_controllers/test_cohorts.py
+++ b/tests/test_controllers/test_cohorts.py
@@ -1,7 +1,9 @@
+# -*- coding:utf-8 -*-
import pprint
import json
-from nose.tools import assert_equal
+from nose.tools import assert_equal, assert_not_equal
from tests.fixtures import WebTest
+from wikimetrics.controllers.cohorts import *
class TestCohortsController(WebTest):
@@ -58,3 +60,14 @@
response.status_code,
404,
)
+
+ def test_validate_username(self):
+ # this username has a few problems that the normalize call should handle
+ # 1. normal ascii space in front
+ # 2. lowercase
+ # 3. nasty trailing unicode space (the reason this file has an encoding definition)
+ problem_username = ' danĀ '
+
+ parsed_user = parse_username(problem_username)
+ valid_user = normalize_user(parsed_user, 'enwiki')
+ assert_not_equal(valid_user, None)
diff --git a/wikimetrics/controllers/cohorts.py b/wikimetrics/controllers/cohorts.py
index c280ed3..8b0966e 100644
--- a/wikimetrics/controllers/cohorts.py
+++ b/wikimetrics/controllers/cohorts.py
@@ -283,26 +283,25 @@
project = default_project
parsed.append({
- 'raw_username': parse_username(username, decode=False),
'username': parse_username(username),
'project': project,
})
return parsed
-def parse_username(raw_username, decode=True):
+def parse_username(username):
"""
parses uncapitalized, whitespace-padded, and weird-charactered mediawiki
user names into ones that have a chance of being found in the database
"""
- username = str(raw_username)
- if decode:
- username = username.decode('utf8')
+ username = str(username)
+ username = username.decode('utf8')
stripped = username.strip()
# Capitalize the username according to the Mediawiki standard
# NOTE: unfortunately .title() or .capitalize() don't work
# because 'miliMetric'.capitalize() == 'Milimetric'
- return stripped[0].upper() + stripped[1:]
+ capitalized = stripped[0].upper() + stripped[1:]
+ return capitalized.encode('utf8')
def normalize_project(project):
@@ -319,7 +318,6 @@
def get_wikiuser_by_name(username, project):
- # NOTE: Not needed right? username = username.encode('utf-8')
db_session = db.get_mw_session(project)
try:
wikiuser = db_session.query(MediawikiUser)\
@@ -393,7 +391,7 @@
record['reason_invalid'] = 'invalid project: %s' %
record['project']
invalid.append(record)
continue
- normalized_user = normalize_user(record['raw_username'],
normalized_project)
+ normalized_user = normalize_user(record['username'],
normalized_project)
# make a link to the potential user page even if user doesn't exist
# this gives a chance to see any misspelling etc.
if normalized_user is None:
--
To view, visit https://gerrit.wikimedia.org/r/75865
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Icf936da3ebcf37c88cb937c1175ca37f7ab0cc65
Gerrit-PatchSet: 1
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Milimetric <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits