Have you tried searching https://godoc.org for a PCRE package that may suit 
your needs better? 
 https://godoc.org/github.com/glenn-brown/golang-pkg-pcre/src/pkg/pcre

On Tuesday, March 9, 2010 at 9:52:16 AM UTC-5, Alex Dong wrote:
>
> I'm writing a simple 'word picker' in both Go and Python. It
> basically reads a Twitter message from a CSV file, tokenizes it, and puts
> the word->id mapping into a map called lexicon.  It turned out that the
> same logic took 22 seconds to finish in Python 2.6 but 57 seconds in
> Go!
>
> I'm wondering whether there is anything I've done wrong?
>
> Here is the python code:
> $ cat loader.py
> lexicon = {}
> pnct_ptn = re.compile(r'([\.,\\/\'"\?!=_\)\(\]\[\{\}:;]+|http://[^ ]
> +)')
> def tokenize(s):
>     s = pnct_ptn.sub(' ', s)
>     return [t for t in s.split() if len(t)>=3]
>
> for line in open("result.csv").readlines():
>     parts = line.split(',', 3)
>     if len(parts) != 4: continue
>     msg = parts[3]
>     s  = msg.decode('utf8','ignore').lower()
>     for word in tokenize(s):
>         if not lexicon.has_key(word):
>             unique_words += 1
>             lexicon[word] = unique_words
>
> Here is the go code:
> $ cat loader.go
> package main
>
> import (
>     "bufio"
>     "os"
>     "regexp"
>     "strings"
> )
>
> var (
>     pr, _ = regexp.Compile(`(http://[^ ]+|['".\\,=()*:;?!/]|-)`)    //
> pattern for removal
> )
>
> func tokenize(s string) []string {
>     ms := pr.ReplaceAllString(strings.ToLower(s), " ")
>     return strings.Split(ms, " ", 0)
> }
>
> func main() {
>     lex := make(map[string] int)                            // lexicon
>     dic := make(map[int] string)                            // lookup
>     tw  := 0                                                // total
> words
>     ps  := false                                            // present
>
>     r, _ := os.Open("result.csv", os.O_RDONLY, 0444)
>     defer r.Close()
>     in := bufio.NewReader(r)
>
>     for i := 0; i >= 0; i++ {
>         line, err := in.ReadString('\n')
>         if err != nil {
>             break
>         }
>
>         parts    := strings.Split(line, ",", 4)
>         if len(parts) != 4 {
>             continue
>         }
>
>         ts := tokenize(parts[3])
>         for d := 0; d < len(ts); d++ {
>             w := ts[d]
>             if len(w) < 3 {
>                 continue
>             }
>             _, ps = lex[w]
>             if ps == false {
>                 lex[w] = tw
>                 dic[tw] = w
>                 tw ++
>             }
>         }
>     }
> }
>
>
> Cheers,
> Alex
>
>

-- 
You received this message because you are subscribed to the Google Groups 
"golang-nuts" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to golang-nuts+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to