Milimetric has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/68041


Change subject: more validation and handling of special characters
......................................................................

more validation and handling of special characters

Change-Id: Ibf2c3832a7eb42626fd01162fcf3df85a5c6cbf5
---
M user_metrics/api/templates/csv_upload.html
M user_metrics/api/templates/csv_upload_review.html
M user_metrics/api/views.py
3 files changed, 19 insertions(+), 15 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/user-metrics 
refs/changes/41/68041/1

diff --git a/user_metrics/api/templates/csv_upload.html 
b/user_metrics/api/templates/csv_upload.html
index 10c4ffe..9fc6467 100644
--- a/user_metrics/api/templates/csv_upload.html
+++ b/user_metrics/api/templates/csv_upload.html
@@ -3,14 +3,14 @@
 {% include "csv_upload_form.html" %}
 <div>
     <h6>File Format</h6>
-    <p>User Metrics expects a CSV file with the following format (the 
username, project header row is not required)</p>
+    <p>User Metrics expects a CSV file with the following format (the 
username, project header row is not required).  Do pay special attention to the 
use of underscore in user names.  Most user names do not have underscores but 
may be recorded that way in your file by people copying from a user page url.  
If those users show up as invalid, you may want to try them with spaces instead 
of underscores.</p>
     <pre><code>username,project
-Evan (WMF),en</code></pre>
+Evan (WMF),en
+Most Users Do Not Have Underscores In Their Names,en
+If_This_Fails_try_replacing_underscores_with_spaces,en</code></pre>
     <ul>
         <li><strong>username</strong>: straightforward utf-8 encoded name or a 
user id.</li>
         <li><strong>project</strong>: the langauge or project code on which 
that username exists.</li>
-        <!--li><strong>reference date</strong>: a user-specific date which can 
be used as a reference point to compute certain metrics over relative time 
periods (i.e. the first month after signing up).  Also, <code>userstats</code> 
uses <code>datutil.parser</code> to interpret date strings, so you have a 
little flexibility in the exact date format you use.</li>
-        <li><strong>cohort id</strong>: an arbitrary string identifier which 
will be used to group users with the same identifier into a cohort, on which 
you can then aggregate statistics and compare.</li-->
     </ul>
 </div>
 {% endblock %}
diff --git a/user_metrics/api/templates/csv_upload_review.html 
b/user_metrics/api/templates/csv_upload_review.html
index af87828..57ae770 100644
--- a/user_metrics/api/templates/csv_upload_review.html
+++ b/user_metrics/api/templates/csv_upload_review.html
@@ -22,11 +22,6 @@
 
 <h5>Try uploading again after fixing the issues</h5>
 {% include "csv_upload_form.html" %}
-<!--form action="/uploads/cohort/validate" method="POST" 
class="form-horizontal">
-    <div class="form-actions">
-        <input type="submit" class="btn" value="Re-Check These Users"/>
-    </div>
-</form-->
 {% endif %}
 
 <hr/>
diff --git a/user_metrics/api/views.py b/user_metrics/api/views.py
index 1655da7..2f37847 100644
--- a/user_metrics/api/views.py
+++ b/user_metrics/api/views.py
@@ -202,8 +202,8 @@
             return render_template('csv_upload_review.html',
                 valid=valid,
                 invalid=invalid,
-                valid_json=json.dumps(valid),
-                invalid_json=json.dumps(invalid),
+                valid_json=json.dumps(valid).replace("'", "\\'"),
+                invalid_json=json.dumps(invalid).replace("'", "\\'"),
                 cohort_name=cohort_name,
                 cohort_project=cohort_project,
                 wiki_projects=sorted(conf.PROJECT_DB_MAP.keys())
@@ -219,7 +219,15 @@
     return json.dumps(available)
 
 def parse_records(records, default_project):
-    return [{'username': parse_username(r[0]), 'project': r[1] if len(r) > 1 
else default_project} for r in records if r]
+    # NOTE: the reason for the crazy -1 and comma joins
+    # is that some users can have commas in their name
+    # TODO: This makes it impossible to add fields to the csv in the future,
+    # so maybe require the project to be the first field and the username to 
be the last
+    # or maybe change to a tsv format
+    return [{
+        'username': parse_username(",".join([str(p) for p in r[:-1]])),
+        'project': r[-1].decode('utf8') if len(r) > 1 else default_project
+    } for r in records if r]
 
 def parse_username(raw_name):
     stripped = str(raw_name).decode('utf8').strip()
@@ -240,11 +248,12 @@
             return new_proj
 
 def normalize_user(user_str, project):
-    uid = query_mod.is_valid_username_query(user_str, project)
+    user_str_sql_safe = user_str.encode('utf-8')
+    uid = query_mod.is_valid_username_query(user_str_sql_safe, project)
     if uid is not None:
         return (uid, user_str)
     elif user_str.isdigit():
-        username = query_mod.is_valid_uid_query(user_str, project)
+        username = query_mod.is_valid_uid_query(user_str_sql_safe, project)
         if username is not None:
             return (user_str, username)
         else:
@@ -268,7 +277,7 @@
 
 def link_to_user_page(username, project):
     project = project_name_for_link(project)
-    return 'https://%s.wikipedia.org/wiki/User:%s' % (project.decode('utf8'), 
username.decode('utf8'))
+    return 'https://%s.wikipedia.org/wiki/User:%s' % (project, username)
 
 def validate_records(records):
     valid = []

-- 
To view, visit https://gerrit.wikimedia.org/r/68041
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibf2c3832a7eb42626fd01162fcf3df85a5c6cbf5
Gerrit-PatchSet: 1
Gerrit-Project: analytics/user-metrics
Gerrit-Branch: master
Gerrit-Owner: Milimetric <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to