Demon has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/51680


Change subject: Add replace_space function
......................................................................

Add replace_space function

  Added tests, added test file, fixed replace_space

Change-Id: Ic795c876105fe8a1a980f83a0c07a5ba2011fff5
---
M Makefile
A README.tests
A entries-with-urls-with-spaces-2013-02-10.txt
M filter.c
A test.sh
5 files changed, 80 insertions(+), 35 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/webstatscollector 
refs/changes/80/51680/1

diff --git a/Makefile b/Makefile
index b05321b..25451a7 100644
--- a/Makefile
+++ b/Makefile
@@ -14,12 +14,14 @@
 
 all: collector filter
 
-collector: collector.h collector.c export.c
+collector: collector.h collector.c export.c export.o
+       gcc -o collector collector.c export.o -ldb -lpthread
 
 filter: filter.c
-       cc -o filter filter.c
+       gcc -o filter filter.c
 
-#export: collector.h export.c
+export.o: export.c collector.h collector.c filter.c
+       gcc -c -o export.o export.c
 
 clean:
-       rm -f collector exporter
+       rm -f *.o collector filter
diff --git a/README.tests b/README.tests
new file mode 100644
index 0000000..c14927e
--- /dev/null
+++ b/README.tests
@@ -0,0 +1,7 @@
+
+
+The test lines in entries-with-urls-with-spaces-2013-02-10.txt were produced 
like this:
+
+   zcat /home/user/wikidata/raw_gzips/sampled-1000.tab.log-20130210.gz | perl 
-ne '@f=split(/\t/); print if index($f[8]," ")!=-1;'  > 
entries-with-urls-with-spaces-2013-02-10.txt
+
+After this, head -40 | tail -30 was applied in order for the filter to accept 
all of them (some of them were influenced by some of the discarding rules of the 
filter).
diff --git a/entries-with-urls-with-spaces-2013-02-10.txt 
b/entries-with-urls-with-spaces-2013-02-10.txt
new file mode 100644
index 0000000..abb6648
--- /dev/null
+++ b/entries-with-urls-with-spaces-2013-02-10.txt
@@ -0,0 +1,30 @@
+ssl1002        338866312       2013-02-09T10:00:09.126 1.667   0.0.0.0 
FAKE_CACHE_STATUS/301   667     GET     
https://fr.wikipedia.org/wiki/Discussion:Bourail/Droit d'auteur NONE/wikipedia  
-       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1004        347282202       2013-02-09T10:18:11.367 0.082   0.0.0.0 
FAKE_CACHE_STATUS/301   675     GET     
https://fr.wikipedia.org/wiki/Discussion:Chemin\xC3\xA9e solaire/Traduction     
NONE/wikipedia  -       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+https://www.google.com/bot.html) 
-       -
+ssl1004        349592952       2013-02-09T12:19:04.972 0.079   0.0.0.0 
FAKE_CACHE_STATUS/301   667     GET     
https://fr.wikipedia.org/wiki/\xC3\x89tienne Perrot (psychanalyste)     
NONE/wikipedia  -       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1001        338735382       2013-02-09T12:57:51.173 0.201   0.0.0.0 
FAKE_CACHE_STATUS/301   759     GET     http://ja.wikipedia.org/wiki/CODE NAME. 
1 Brother 
Sun\xE3\x80\x90\xE5\x88\x9D\xE5\x9B\x9E\xE7\x94\x9F\xE7\x94\xA3\xE9\x99\x90\xE5\xAE\x9A\xE3\x80\x91(\xE7\xB4\x99\xE3\x82\xB8\xE3\x83\xA3\xE3\x82\xB1\xE3\x83\x83\xE3\x83\x88\xE4\xBB\x95\xE6\xA7\x98)
 NONE/wikipedia  -       -       -       foobar2000/1.1.14a      -       -
+ssl1001        340768664       2013-02-09T14:29:25.274 0.351   0.0.0.0 
FAKE_CACHE_STATUS/301   885     GET     
http://ja.wikipedia.org/wiki/\xE7\x89\xB9\xE5\x88\xA5:\xE3\x83\x87\xE3\x83\xBC\xE3\x82\xBF\xE6\x9B\xB8\xE3\x81\x8D\xE5\x87\xBA\xE3\x81\x97/\xE3\x82\x82\xE3\x82\x82\xE3\x81\x84\xE3\x82\x8D\xE3\x82\xAF\xE3\x83\xAD\xE3\x83\xBC\xE3\x83\x90\xE3\x83\xBCZ
 1st Live 
\xE3\x81\xAB\xE3\x83\x95\xE3\x83\xA9\xE3\x83\xB3\xE3\x82\xB9\xE4\xBA\xBA\xE5\xA4\xA7\xE8\x88\x88\xE5\xA5\xAE\xEF\xBC\x81\xE3\x83\x91\xE3\x83\xAAde
 Japan Expo NONE/wikipedia  -       -       -       
Mozilla/4.0%20(compatible;%20MSIE%207.0;%20Windows%20NT%206.0)  -       -
+ssl3002        764325222       2013-02-09T14:35:55.799 0.088   0.0.0.0 
FAKE_CACHE_STATUS/301   1247    GET     http://en.wikipedia.org/wiki/Chopin - 
The Piano Works (Brilliant Classics) (CD4 of 13)_(album)  NONE/wikipedia  -     
  -       -       foobar2000/1.2  -       -
+ssl1002        345415572       2013-02-09T15:10:39.804 0.051   0.0.0.0 
FAKE_CACHE_STATUS/301   674     GET     
https://fr.wikipedia.org/wiki/Portail:Indianapolis/Index th\xC3\xA9matique      
NONE/wikipedia  -       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl3002        769405948       2013-02-09T15:36:59.442 0.408   0.0.0.0 
FAKE_CACHE_STATUS/301   1270    GET     https://en.wikipedia.org/wiki/Kevin 
Smith       NONE/wikipedia  -       -       -       
Mozilla/5.0%20(X11;%20Linux%20x86_64;%20rv:18.0)%20Gecko/20100101%20Firefox/18.0
        en-US,en;q=0.5  -
+ssl1004        354242982       2013-02-09T15:40:02.553 0.061   0.0.0.0 
FAKE_CACHE_STATUS/301   677     GET     
https://fr.wikipedia.org/wiki/Personnages de la Saga du d\xC3\xA9sir interdit   
NONE/wikipedia  -       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1001        343866342       2013-02-09T16:29:52.032 0.002   0.0.0.0 
FAKE_CACHE_STATUS/200   29022   GET     
https://en.wikipedia.org/wiki/Template:Attached_KML/Interstate 87       
NONE/wikipedia  -       https://en.wikipedia.org/wiki/Interstate_87     -       
Mozilla/5.0%20(Windows%20NT%206.1;%20WOW64)%20AppleWebKit/537.17%20(KHTML,%20like%20Gecko)%20Chrome/24.0.1312.57%20Safari/537.17
        en-US,en;q=0.8,ja;q=0.6 -
+ssl3002        775361522       2013-02-09T16:46:28.770 0.088   0.0.0.0 
FAKE_CACHE_STATUS/301   1214    GET     http://pl.wikipedia.org/wiki/Show Your 
Bones (Advance)_(album)  NONE/wikipedia  -       -       -       foobar2000/1.1 
 -       -
+ssl1003        345719812       2013-02-09T16:58:27.186 0.108   0.0.0.0 
FAKE_CACHE_STATUS/301   660     GET     
https://fr.wikipedia.org/wiki/Discussion utilisateur:Myaly      NONE/wikipedia  
-       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1003        347346702       2013-02-09T17:57:30.211 0.063   0.0.0.0 
FAKE_CACHE_STATUS/301   683     GET     
https://fr.wikipedia.org/wiki/Discussion:Navire de d\xC3\xA9fense 
c\xC3\xB4ti\xC3\xA8re NONE/wikipedia  -       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl3003        782892222       2013-02-09T18:55:51.553 0.115   0.0.0.0 
FAKE_CACHE_STATUS/301   1201    GET     http://ru.wikipedia.org/wiki/American 
Capitalist        NONE/wikipedia  -       -       -       foobar2000/1.1.17     
  -       -
+ssl3002        787278662       2013-02-09T19:04:23.234 0.116   0.0.0.0 
FAKE_CACHE_STATUS/301   1235    GET     http://ru.wikipedia.org/wiki/Live at 
Wembley Stadium 1986 (25th Anniversary Edition)    NONE/wikipedia  -       -    
   -       foobar2000/1.1.5        -       -
+ssl1001        348171052       2013-02-09T19:07:42.557 0.056   0.0.0.0 
FAKE_CACHE_STATUS/301   647     GET     https://fr.wikipedia.org/wiki/Tangara 
(genre)   NONE/wikipedia  -       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1004        360069242       2013-02-09T19:09:35.203 0.050   0.0.0.0 
FAKE_CACHE_STATUS/301   662     GET     
https://fr.wikipedia.org/wiki/Mod\xC3\xA8le:Cantons de Valence  NONE/wikipedia  
-       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1004        363042322       2013-02-09T20:55:34.082 0.057   0.0.0.0 
FAKE_CACHE_STATUS/301   668     GET     
https://fr.wikipedia.org/wiki/Projet:Impression/Quality images/113      
NONE/wikipedia  -       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1002        358824032       2013-02-09T23:20:20.933 0.050   0.0.0.0 
FAKE_CACHE_STATUS/301   668     GET     
https://meta.wikimedia.org/wiki/Special:CentralAuth/Lala78 z z b 5      
NONE/wikimedia  -       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl3003        806330792       2013-02-09T23:28:30.354 0.310   0.0.0.0 
FAKE_CACHE_STATUS/301   1189    GET     http://de.wikipedia.org/wiki/Hip Hop Is 
Dead    NONE/wikipedia  -       -       -       foobar2000/1.1.10       -       
-
+ssl1003        357371192       2013-02-10T00:05:37.562 0.055   0.0.0.0 
FAKE_CACHE_STATUS/301   665     GET     
https://fr.wikipedia.org/wiki/Discussion utilisateur:Steve92341 NONE/wikipedia  
-       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1002        362176022       2013-02-10T01:17:26.634 0.086   0.0.0.0 
FAKE_CACHE_STATUS/301   680     GET     
https://fr.wikipedia.org/wiki/Discussion:Histoire du Racing Club de Strasbourg  
NONE/wikipedia  -       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl3003        812500292       2013-02-10T01:32:25.217 0.001   0.0.0.0 
FAKE_CACHE_STATUS/301   1226    GET     
http://commons.wikimedia.org/wiki/Template:Motd/2013-02-9 (en)  NONE/wikimedia  
-       -       -       Magnus%20tools  -       -
+ssl1001        359846002       2013-02-10T02:05:36.049 0.433   0.0.0.0 
FAKE_CACHE_STATUS/400   17294   GET     
http://ja.wikipedia.org/wiki/\xE7\x89\xB9\xE5\x88\xA5:\xE3\x83\x87\xE3\x83\xBC\xE3\x82\xBF\xE6\x9B\xB8\xE3\x81\x8D\xE5\x87\xBA\xE3\x81\x97/[enews24.net]
 '\xED\x99\x94\xEC\x84\xB1\xEC\x9D\xB8' \xEC\x84\xB9\xEC\x8B\x9C\xED\x95\x9C 
\xED\x82\xA4\xED\x8B\xB0\xEC\xA4\x91\xEB\x8F\x85\xEB\x85\x80       
NONE/wikipedia  -       -       -       
Mozilla/4.0%20(compatible;%20MSIE%207.0;%20Windows%20NT%206.0)  -       -
+ssl3003        815122202       2013-02-10T03:01:12.862 0.374   0.0.0.0 
FAKE_CACHE_STATUS/400   17734   GET     
http://pl.wikipedia.org/wiki/Mi\xC5\x82</img></table></i></td></tr></td></tr></table></img></div></div><a
 href= NONE/wikipedia  -       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1003        363030612       2013-02-10T03:11:03.299 0.266   0.0.0.0 
FAKE_CACHE_STATUS/301   621     GET     http://en.wikipedia.org/wiki/Tales Of 
VS. Original Soundtrack (Disc 1)  NONE/wikipedia  -       -       -       
foobar2000/1.0.3        -       -
+ssl1003        363126032       2013-02-10T03:13:58.975 0.089   0.0.0.0 
FAKE_CACHE_STATUS/301   654     GET     
https://fr.wikipedia.org/wiki/Box-office France 1986    NONE/wikipedia  -       
-       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1003        364287962       2013-02-10T03:50:17.999 0.060   0.0.0.0 
FAKE_CACHE_STATUS/301   656     GET     
https://fr.wikipedia.org/wiki/Chesterfield (homonymie)  NONE/wikipedia  -       
-       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl1002        370989322       2013-02-10T06:00:27.104 0.046   0.0.0.0 
FAKE_CACHE_STATUS/301   659     GET     
https://fr.wikipedia.org/wiki/Discussion mod\xC3\xA8le:OMIM     NONE/wikipedia  
-       -       -       
Mozilla/5.0%20(compatible;%20Googlebot/2.1;%20+http://www.google.com/bot.html)  
-       -
+ssl3002        823296232       2013-02-10T06:27:48.601 0.151   0.0.0.0 
FAKE_CACHE_STATUS/301   1195    GET     http://ru.wikipedia.org/wiki/The 
Electric Mist  NONE/wikipedia  -       -       -       foobar2000/1.2.2        
-       -
diff --git a/filter.c b/filter.c
index 72474b4..68daf4b 100644
--- a/filter.c
+++ b/filter.c
@@ -16,30 +16,6 @@
 
 */
 
-/*
-
-#!/usr/bin/python
-
-import re
-import sys
-
-dupes = 
re.compile('^(145\.97\.39\.|66\.230\.200\.|211\.115\.107\.|91\.198\.174\.)')
-urlre = re.compile('^http://([^\.]+)\.([^\.]+).org/wiki/([^?]+)')
-
-projects={"wikipedia":"","wiktionary":".d","wikinews":".n","wikimedia":".m","wikibooks":".b","wikisource":".s","mediawiki":".w","wikiversity":".v","wikiquote":".q"
 }
-
-for line in sys.stdin:
-       ip,undef,bytes,undef,url=line.split()[4:9]
-       if dupes.match(ip): continue
-       stuff=urlre.match(url)
-       if stuff == None: continue
-       language,project,title = stuff.groups()
-       if project=="wikimedia" and language not in 
["commons","meta","incubator","species"]: continue
-       try: print language + projects[project] + " 1 " + bytes + " "  + title
-       except: continue
-
-*/
-
 #define LINESIZE 4096
 char *_sep, *_lasttok, *_firsttok;
 #define TOKENIZE(x,y) _lasttok=NULL; _sep=y; _firsttok=strtok_r(x,y,&_lasttok);
@@ -70,11 +46,11 @@
 */
 
 char *dupes[] = {"208.80.152.",
-                               "208.80.153.",
-                               "208.80.154.",
-                               "208.80.155.",
-                               "91.198.174.",
-                               NULL};
+                "208.80.153.",
+                "208.80.154.",
+                "208.80.155.",
+                "91.198.174.",
+                NULL};
 
 bool check_ip(char *ip) {
        char **prefix=dupes;
@@ -114,6 +90,20 @@
        char *title;
        char *suffix;
 } info;
+
+/* Replace every ' ' in url with '_', in place; a NULL url is left alone. */
+void replace_space(char *url) {
+       char *p;
+       /* parse_url() tolerates a NULL url, so this must not crash on one. */
+       if (!url) {
+               return;
+       }
+       for (p = url; *p != '\0'; p++) {
+               if (*p == ' ') {
+                       *p = '_';
+               }
+       }
+}
 
 bool parse_url(char *url, struct info *in) {
        if (!url)
@@ -186,10 +176,11 @@
                info.size=      FIELD; /* object size */
                                FIELD;
                url=        FIELD;
-               if (!parse_url(url,&info))
-                       continue;
+        replace_space(url);
                if (!check_ip(info.ip))
                        continue;
+               if (!parse_url(url,&info))
+                       continue;
                if (!check_project(&info))
                        continue;
                printf("%s%s 1 %s %s\n",info.language, info.suffix, info.size, 
info.title);
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..dc91912
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Test1: no title (4th output field) printed by ./filter may contain a space.
+
+# Count the filter output lines whose title still contains a space.
+TEST_SPACELESS_LINES=`./filter < entries-with-urls-with-spaces-2013-02-10.txt | perl -ne '@f=split(/\s/,$_,4);  print if $f[3] =~ /\ /;' | wc -l`
+
+if [ "$TEST_SPACELESS_LINES" -eq 0 ]; then
+  echo "Test1: Spaceless lines in filter PASSED";
+else
+  echo "Test1: Spaceless lines in filter FAILED";
+  # Exit statuses are limited to 0-255, so use 1 rather than -1.
+  exit 1;
+fi
+
+exit 0;

-- 
To view, visit https://gerrit.wikimedia.org/r/51680
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic795c876105fe8a1a980f83a0c07a5ba2011fff5
Gerrit-PatchSet: 1
Gerrit-Project: analytics/webstatscollector
Gerrit-Branch: time_travel
Gerrit-Owner: Demon <ch...@wikimedia.org>
Gerrit-Reviewer: Diederik <dvanli...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to