Milimetric has uploaded a new change for review.
https://gerrit.wikimedia.org/r/75866
Change subject: still messing with encoding
......................................................................
still messing with encoding
Change-Id: I4b23508c0b2870dbaf75824668d280c4844aae33
---
M tests/test_controllers/test_cohorts.py
M wikimetrics/controllers/cohorts.py
2 files changed, 7 insertions(+), 5 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/wikimetrics refs/changes/66/75866/1
diff --git a/tests/test_controllers/test_cohorts.py b/tests/test_controllers/test_cohorts.py
index 4d4d60f..7700f30 100644
--- a/tests/test_controllers/test_cohorts.py
+++ b/tests/test_controllers/test_cohorts.py
@@ -68,6 +68,6 @@
# 3. nasty trailing unicode space (the reason this file has an encoding definition)
problem_username = ' danĀ '
- parsed_user = parse_username(problem_username)
+ parsed_user = parse_username(problem_username, decode=False)
valid_user = normalize_user(parsed_user, 'enwiki')
assert_not_equal(valid_user, None)
diff --git a/wikimetrics/controllers/cohorts.py b/wikimetrics/controllers/cohorts.py
index 8b0966e..ba3023d 100644
--- a/wikimetrics/controllers/cohorts.py
+++ b/wikimetrics/controllers/cohorts.py
@@ -283,13 +283,14 @@
project = default_project
parsed.append({
+ 'raw_username': parse_username(username, decode=False),
'username': parse_username(username),
'project': project,
})
return parsed
-def parse_username(username):
+def parse_username(username, decode=True):
"""
parses uncapitalized, whitespace-padded, and weird-charactered mediawiki
user names into ones that have a chance of being found in the database
@@ -297,11 +298,12 @@
username = str(username)
username = username.decode('utf8')
stripped = username.strip()
+ if not decode:
+ stripped = stripped.encode('utf8')
# Capitalize the username according to the Mediawiki standard
# NOTE: unfortunately .title() or .capitalize() don't work
# because 'miliMetric'.capitalize() == 'Milimetric'
- capitalized = stripped[0].upper() + stripped[1:]
- return capitalized.encode('utf8')
+ return stripped[0].upper() + stripped[1:]
def normalize_project(project):
@@ -391,7 +393,7 @@
record['reason_invalid'] = 'invalid project: %s' % record['project']
invalid.append(record)
continue
- normalized_user = normalize_user(record['username'], normalized_project)
+ normalized_user = normalize_user(record['raw_username'], normalized_project)
# make a link to the potential user page even if user doesn't exist
# this gives a chance to see any misspelling etc.
if normalized_user is None:
--
To view, visit https://gerrit.wikimedia.org/r/75866
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I4b23508c0b2870dbaf75824668d280c4844aae33
Gerrit-PatchSet: 1
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Milimetric <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits