Hello community,

here is the log from the commit of package perl-Spooky-Patterns-XS for 
openSUSE:Factory checked in at 2020-01-28 10:54:36
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/perl-Spooky-Patterns-XS (Old)
 and      /work/SRC/openSUSE:Factory/.perl-Spooky-Patterns-XS.new.26092 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "perl-Spooky-Patterns-XS"

Tue Jan 28 10:54:36 2020 rev:5 rq:767807 version:1.55

Changes:
--------
--- 
/work/SRC/openSUSE:Factory/perl-Spooky-Patterns-XS/perl-Spooky-Patterns-XS.changes
  2020-01-16 18:22:27.565016170 +0100
+++ 
/work/SRC/openSUSE:Factory/.perl-Spooky-Patterns-XS.new.26092/perl-Spooky-Patterns-XS.changes
       2020-01-28 10:54:54.789012811 +0100
@@ -1,0 +2,10 @@
+Sat Jan 25 14:49:03 UTC 2020 - Stephan Kulow <[email protected]>
+
+- 1.55: Ignore more tokens on matching
+
+-------------------------------------------------------------------
+Thu Jan 23 16:00:47 UTC 2020 - Stephan Kulow <[email protected]>
+
+- 1.54: Add BagOfPatterns to calculcate closest pattern
+
+-------------------------------------------------------------------

Old:
----
  Spooky-Patterns-XS-1.53.tar.gz

New:
----
  Spooky-Patterns-XS-1.55.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ perl-Spooky-Patterns-XS.spec ++++++
--- /var/tmp/diff_new_pack.DC364B/_old  2020-01-28 10:54:55.757013944 +0100
+++ /var/tmp/diff_new_pack.DC364B/_new  2020-01-28 10:54:55.757013944 +0100
@@ -17,7 +17,7 @@
 
 
 Name:           perl-Spooky-Patterns-XS
-Version:        1.53
+Version:        1.55
 Release:        0
 %define cpan_name Spooky-Patterns-XS
 Summary:        Spooky::Patterns::XS Perl module

++++++ Spooky-Patterns-XS-1.53.tar.gz -> Spooky-Patterns-XS-1.55.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/Changes 
new/Spooky-Patterns-XS-1.55/Changes
--- old/Spooky-Patterns-XS-1.53/Changes 2020-01-14 09:13:11.440514006 +0100
+++ new/Spooky-Patterns-XS-1.55/Changes 2020-01-25 15:44:14.236636977 +0100
@@ -1,5 +1,12 @@
 Revision history for Perl extension Spooky::Patterns::XS
 
+1.55    2020-01-25
+        - Way stronger strategy on ignoring characters that
+          do not add value
+
+1.54    2020-01-23
+        - Add Bag of Patterns to calculate nearest pattern
+
 1.53    2020-01-14
         - Fix read_lines to return the last line in file correctly
           if it doesn't end with a newline
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/MANIFEST 
new/Spooky-Patterns-XS-1.55/MANIFEST
--- old/Spooky-Patterns-XS-1.53/MANIFEST        2020-01-14 09:19:41.701153998 
+0100
+++ new/Spooky-Patterns-XS-1.55/MANIFEST        2020-01-25 15:48:09.797617393 
+0100
@@ -1,7 +1,9 @@
+bag_impl.cc
 Changes
 COPYING
 Makefile.PL
 MANIFEST                       This list of files
+Matcher.h
 patterns_impl.cc
 patterns_impl.h
 SpookyV2.cpp
@@ -10,8 +12,6 @@
 t/02compile.t
 t/03match.t
 t/03match.txt
-t/04license.1.pattern
-t/04license.1.txt
 t/04license.10.pattern
 t/04license.10.txt
 t/04license.11.pattern
@@ -19,14 +19,15 @@
 t/04license.12.pattern
 t/04license.12.txt
 t/04license.13.pattern
+t/04license.13.txt
 t/04license.14.pattern
 t/04license.15.pattern
 t/04license.16.pattern
 t/04license.17.pattern
 t/04license.18.pattern
 t/04license.19.pattern
-t/04license.2.pattern
-t/04license.2.txt
+t/04license.1.pattern
+t/04license.1.txt
 t/04license.20.pattern
 t/04license.21.pattern
 t/04license.22.pattern
@@ -37,6 +38,10 @@
 t/04license.27.pattern
 t/04license.28.pattern
 t/04license.29.pattern
+t/04license.2.pattern
+t/04license.2.txt
+t/04license.30.pattern
+t/04license.31.pattern
 t/04license.3.pattern
 t/04license.3.txt
 t/04license.4.pattern
@@ -59,8 +64,14 @@
 t/07close.p1
 t/07close.p2
 t/07close.t
-t/test.t
+t/08bag.t
+t/09normalize.1.in
+t/09normalize.1.out
+t/09normalize.t
+t/09normalize.2.in
+t/09normalize.2.out
 TokenTree.h
+t/test.t
 typemap
 XS.pm
 XS.xs
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/META.json 
new/Spooky-Patterns-XS-1.55/META.json
--- old/Spooky-Patterns-XS-1.53/META.json       2020-01-14 09:19:41.649150569 
+0100
+++ new/Spooky-Patterns-XS-1.55/META.json       2020-01-25 15:48:09.593603943 
+0100
@@ -6,7 +6,7 @@
    "dynamic_config" : 1,
    "generated_by" : "ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter 
version 2.150010",
    "license" : [
-      "unknown"
+      "gpl_2"
    ],
    "meta-spec" : {
       "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
@@ -45,6 +45,6 @@
          "url" : "https://github.com/coolo/spooky-pattern-xs"
       }
    },
-   "version" : "1.53",
+   "version" : "1.55",
    "x_serialization_backend" : "JSON::PP version 4.02"
 }
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/META.yml 
new/Spooky-Patterns-XS-1.55/META.yml
--- old/Spooky-Patterns-XS-1.53/META.yml        2020-01-14 09:19:41.581146085 
+0100
+++ new/Spooky-Patterns-XS-1.55/META.yml        2020-01-25 15:48:09.353588119 
+0100
@@ -8,7 +8,7 @@
   ExtUtils::MakeMaker: '0'
 dynamic_config: 1
 generated_by: 'ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version 
2.150010'
-license: unknown
+license: gpl
 meta-spec:
   url: http://module-build.sourceforge.net/META-spec-v1.4.html
   version: '1.4'
@@ -22,5 +22,5 @@
 resources:
   license: https://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
   repository: https://github.com/coolo/spooky-pattern-xs
-version: '1.53'
+version: '1.55'
 x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/Makefile.PL 
new/Spooky-Patterns-XS-1.55/Makefile.PL
--- old/Spooky-Patterns-XS-1.53/Makefile.PL     2017-04-21 19:30:14.967869967 
+0200
+++ new/Spooky-Patterns-XS-1.55/Makefile.PL     2020-01-23 17:12:57.258561689 
+0100
@@ -19,7 +19,7 @@
     },
     LD => 'g++',
     XSOPT => '-C++',
-    LICENSE       => 'GPL-2.0+',
+    LICENSE       => 'GPL_2',
     AUTHOR            => 'Stephan Kulow <[email protected]>',
     INC               => join(' ', @INC),
     LIBS              => [ join(' ', @LIBPATH, @LIBS) ],
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/Matcher.h 
new/Spooky-Patterns-XS-1.55/Matcher.h
--- old/Spooky-Patterns-XS-1.53/Matcher.h       1970-01-01 01:00:00.000000000 
+0100
+++ new/Spooky-Patterns-XS-1.55/Matcher.h       2020-01-25 11:57:55.925056935 
+0100
@@ -0,0 +1,48 @@
+#include <cstdint>
+#include <list>
+#include <vector>
+#include <string>
+#include <set>
+
+struct Match {
+    int start;
+    int matched;
+    int pattern;
+    int sline;
+    int eline;
+};
+
+typedef std::list<Match> Matches;
+
+struct Token {
+    int linenumber;
+    uint64_t hash;
+    std::string text;
+};
+
+typedef std::vector<Token> TokenList;
+
+class TokenTree;
+
+struct Matcher {
+    std::set<uint64_t> ignored_tokens;
+    TokenTree *pattern_tree;
+
+    ssize_t longest_pattern;
+
+    static Matcher* _self;
+    static Matcher* self() {
+      if (!_self) {
+        _self = new Matcher();
+      }
+
+      return _self;
+    }
+
+    Matcher();
+    bool to_ignore(uint64_t t) const;
+    bool to_ignore(const char *t, unsigned int len) const;
+    void init();
+    void add_token(TokenList& result, const char* start, size_t len, int line) 
const;
+    void tokenize(TokenList& result, char* str, int linenumber = 0);
+};
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/XS.pm 
new/Spooky-Patterns-XS-1.55/XS.pm
--- old/Spooky-Patterns-XS-1.53/XS.pm   2020-01-14 09:13:11.444514270 +0100
+++ new/Spooky-Patterns-XS-1.55/XS.pm   2020-01-25 15:43:26.253473510 +0100
@@ -23,7 +23,7 @@
 our @ISA       = qw(Exporter);
 our @EXPORT_OK = qw();
 
-our $VERSION = '1.53';
+our $VERSION = '1.55';
 
 require XSLoader;
 XSLoader::load( 'Spooky::Patterns::XS', $VERSION );
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/XS.xs 
new/Spooky-Patterns-XS-1.55/XS.xs
--- old/Spooky-Patterns-XS-1.53/XS.xs   2020-01-14 09:13:11.444514270 +0100
+++ new/Spooky-Patterns-XS-1.55/XS.xs   2020-01-24 08:29:00.768988607 +0100
@@ -8,6 +8,7 @@
 
 typedef Matcher *Spooky__Patterns__XS__Matcher;
 typedef SpookyHash *Spooky__Patterns__XS__Hash;
+typedef BagOfPatterns *Spooky__Patterns__XS__BagOfPatterns;
 
 MODULE = Spooky::Patterns::XS  PACKAGE = Spooky::Patterns::XS
 
@@ -19,7 +20,15 @@
 
   OUTPUT:
     RETVAL
-    
+
+# pass a hash of integer index to string here
+Spooky::Patterns::XS::BagOfPatterns init_bag_of_patterns()
+  CODE:
+    RETVAL = pattern_init_bag_of_patterns();
+
+  OUTPUT:
+    RETVAL
+
 AV *parse_tokens(const char *str)
   CODE:
     RETVAL = pattern_parse(str);
@@ -62,29 +71,29 @@
     pattern_add(self, id, tokens);
 
 AV *find_matches(Spooky::Patterns::XS::Matcher self, const char *filename)
- CODE:
-   RETVAL = pattern_find_matches(self, filename);
+  CODE:
+    RETVAL = pattern_find_matches(self, filename);
 
- OUTPUT:
-   RETVAL
+  OUTPUT:
+    RETVAL
 
 void dump(Spooky::Patterns::XS::Matcher self, const char *filename)
- CODE:
-   pattern_dump(self, filename);
+  CODE:
+    pattern_dump(self, filename);
 
 void load(Spooky::Patterns::XS::Matcher self, const char *filename)
- CODE:
-   pattern_load(self, filename);
+  CODE:
+    pattern_load(self, filename);
 
 void DESTROY(Spooky::Patterns::XS::Matcher self)
- CODE:
-  destroy_matcher(self);
+  CODE:
+   destroy_matcher(self);
 
 MODULE = Spooky::Patterns::XS  PACKAGE = Spooky::Patterns::XS::Hash PREFIX = 
Hash
 
 void DESTROY(Spooky::Patterns::XS::Hash self)
- CODE:
-  destroy_hash(self);
+  CODE:
+    destroy_hash(self);
 
 void add(Spooky::Patterns::XS::Hash self, SV *s)
   CODE:
@@ -96,3 +105,31 @@
 
   OUTPUT:
     RETVAL
+
+MODULE = Spooky::Patterns::XS PACKAGE = Spooky::Patterns::XS::BagOfPatterns 
PREFIX = BagOfPatterns
+
+void DESTROY(Spooky::Patterns::XS::BagOfPatterns self)
+  CODE:
+    destroy_bag_of_patterns(self);
+
+void set_patterns(Spooky::Patterns::XS::BagOfPatterns self, HV *patterns)
+  CODE:
+    pattern_bag_set_patterns(self, patterns);
+
+AV *best_for(Spooky::Patterns::XS::BagOfPatterns self, const char *str, int 
count)
+  CODE:
+    RETVAL = pattern_bag_best_for(self, str, count);
+
+  OUTPUT:
+    RETVAL
+
+void dump(Spooky::Patterns::XS::BagOfPatterns self, const char *filename)
+  CODE:
+    pattern_bag_dump(self, filename);
+
+bool load(Spooky::Patterns::XS::BagOfPatterns self, const char *filename)
+  CODE:
+    RETVAL = pattern_bag_load(self, filename);
+
+  OUTPUT:
+    RETVAL
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/bag_impl.cc 
new/Spooky-Patterns-XS-1.55/bag_impl.cc
--- old/Spooky-Patterns-XS-1.53/bag_impl.cc     1970-01-01 01:00:00.000000000 
+0100
+++ new/Spooky-Patterns-XS-1.55/bag_impl.cc     2020-01-24 08:30:01.568751360 
+0100
@@ -0,0 +1,350 @@
+// Copyright © 2020 SUSE LLC
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "Matcher.h"
+#include "patterns_impl.h"
+#include <EXTERN.h>
+#include <XSUB.h>
+// work around seed
+#undef seed
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <perl.h>
+#include <set>
+
+#define DEBUG 0
+
+using namespace std;
+
+typedef map<uint64_t, uint64_t> wordmap;
+
+// https://en.wikipedia.org/wiki/Tf%E2%80%93idf
+struct TfIdf {
+    uint64_t hash;
+    double value;
+    TfIdf(uint64_t _hash, double _value)
+    {
+        hash = _hash;
+        value = _value;
+    }
+    bool operator<(const TfIdf& str) const
+    {
+        return (hash < str.hash);
+    }
+};
+
+struct Pattern {
+    uint64_t index;
+    double square_sum;
+    vector<TfIdf> tf_idfs;
+};
+
+class BagOfPatterns {
+public:
+    BagOfPatterns() {}
+    void set_patterns(HV* patterns);
+    AV* best_for(const string& snippet, unsigned int count);
+    void dump(const char* filename) const;
+    bool load(const char* filename);
+
+private:
+    void tokenize(const char* str, wordmap& localwords);
+    double compare2(const vector<TfIdf>& tfdif1, const Pattern& pattern) const;
+    double tf_idf(const wordmap& l1, vector<TfIdf>& tfdif1);
+
+    map<uint64_t, double> idfs;
+    vector<Pattern> patterns;
+};
+
+BagOfPatterns* pattern_init_bag_of_patterns()
+{
+    return new BagOfPatterns();
+}
+
+void pattern_bag_set_patterns(BagOfPatterns* b, HV* patterns)
+{
+    b->set_patterns(patterns);
+}
+
+void destroy_bag_of_patterns(BagOfPatterns* b)
+{
+    delete b;
+}
+
+AV* pattern_bag_best_for(BagOfPatterns* b, const char* str, int count)
+{
+    return b->best_for(str, count);
+}
+
+void pattern_bag_dump(BagOfPatterns* b, const char* filename)
+{
+    b->dump(filename);
+}
+
+bool pattern_bag_load(BagOfPatterns* b, const char* filename)
+{
+    return b->load(filename);
+}
+
+void BagOfPatterns::set_patterns(HV* hv_patterns)
+{
+    idfs.clear();
+    patterns.clear();
+
+    wordmap words;
+    vector<wordmap> wordcounts;
+    vector<uint64_t> indexes;
+
+    hv_iterinit(hv_patterns);
+    HE* he;
+    while ((he = hv_iternext(hv_patterns)) != 0) {
+        I32 len;
+        char* key = hv_iterkey(he, &len);
+        unsigned int index = strtoul(key, 0, 10);
+
+        SV* svp = hv_iterval(hv_patterns, he);
+        if (!svp)
+            continue;
+
+        wordmap localwords;
+        tokenize(SvPV_nolen(svp), localwords);
+        indexes.push_back(index);
+        wordcounts.push_back(localwords);
+
+        for (wordmap::const_iterator it = localwords.begin(); it != 
localwords.end(); ++it) {
+            wordmap::iterator word_it = words.find(it->first);
+            if (word_it == words.end())
+                words[it->first] = 1;
+            else
+                word_it->second++;
+        }
+    }
+    for (wordmap::const_iterator it = words.begin(); it != words.end(); ++it) {
+        idfs[it->first] = log(double(indexes.size()) / it->second);
+    }
+
+    vector<uint64_t>::const_iterator index_it = indexes.begin();
+    vector<wordmap>::const_iterator words_it = wordcounts.begin();
+    for (; words_it != wordcounts.end(); ++words_it, ++index_it) {
+        Pattern p;
+        p.index = *index_it;
+        p.square_sum = tf_idf(*words_it, p.tf_idfs);
+        patterns.push_back(p);
+    }
+}
+
+void BagOfPatterns::tokenize(const char* str, wordmap& localwords)
+{
+    char* copy = strdup(str);
+    TokenList t;
+    Matcher::self()->tokenize(t, copy, 1);
+    free(copy);
+
+    // avoid '=======' dominating matches
+    uint64_t last_hash = 0;
+    for (TokenList::const_iterator it = t.begin(); it != t.end(); ++it) {
+        if (it->hash == last_hash)
+            continue;
+        last_hash = it->hash;
+        wordmap::iterator word_it = localwords.find(it->hash);
+        if (word_it == localwords.end()) {
+            localwords[it->hash] = 1;
+        } else {
+            word_it->second++;
+        }
+    }
+}
+
+double BagOfPatterns::tf_idf(const wordmap& words, vector<TfIdf>& tf_idfs)
+{
+    double square_sum = 0;
+    for (wordmap::const_iterator it = words.begin(); it != words.end(); ++it) {
+        double value = it->second * idfs[it->first];
+        square_sum += value * value;
+        tf_idfs.emplace_back(it->first, value);
+    }
+    sort(tf_idfs.begin(), tf_idfs.end());
+    return sqrt(square_sum);
+}
+
+double BagOfPatterns::compare2(const vector<TfIdf>& tf_idfs1, const Pattern& 
pattern) const
+{
+    // both vectors are assumed to be sorted by hash
+    double sum = 0;
+    vector<TfIdf>::const_iterator it1 = pattern.tf_idfs.begin();
+    vector<TfIdf>::const_iterator it2 = tf_idfs1.begin();
+    while (it1 != pattern.tf_idfs.end() && it2 != tf_idfs1.end()) {
+        if (it1->hash == it2->hash) {
+            sum += it1->value * it2->value;
+            ++it1;
+            ++it2;
+        } else if (it1->hash > it2->hash) {
+            ++it2;
+        } else {
+            ++it1;
+        }
+    }
+    return sum / pattern.square_sum;
+}
+
+AV* BagOfPatterns::best_for(const string& snippet, unsigned int count)
+{
+    AV* result = newAV();
+
+    wordmap localwords;
+    tokenize(snippet.c_str(), localwords);
+
+    double highscore = -1;
+
+    vector<TfIdf> tfidf;
+    double square_sum = tf_idf(localwords, tfidf);
+
+    struct BagHit {
+        BagHit() {} // not used
+        BagHit(double _match, uint64_t _index)
+        {
+            match = _match;
+            index = _index;
+        }
+        bool operator<(const BagHit& rhs)
+        {
+            return match < rhs.match;
+        }
+        double match;
+        uint64_t index;
+    };
+    vector<BagHit> hits;
+    vector<Pattern>::const_iterator it = patterns.begin();
+    for (; it != patterns.end(); ++it) {
+        double match = compare2(tfidf, *it);
+        if (match > highscore) {
+            hits.emplace_back(match, it->index);
+            sort(hits.rbegin(), hits.rend());
+            if (hits.size() > count) {
+                hits.resize(count);
+                highscore = hits.back().match;
+            }
+        }
+    }
+    for (const auto& i : hits) {
+        HV* hv = (HV*)sv_2mortal((SV*)newHV());
+        hv_store(hv, "pattern", 7, newSVuv(i.index), 0);
+        hv_store(hv, "match", 5, newSVnv(int(i.match * 10000 / square_sum) / 
10000.), 0);
+        av_push(result, newRV_inc((SV*)hv));
+    }
+
+    return result;
+}
+
+void BagOfPatterns::dump(const char* filename) const
+{
+    FILE* file = fopen(filename, "wb");
+
+    uint64_t count = idfs.size();
+    fwrite(&count, sizeof(count), 1, file);
+
+    map<uint64_t, double>::const_iterator it = idfs.begin();
+    for (; it != idfs.end(); ++it) {
+        uint64_t f1 = it->first;
+        double f2 = it->second;
+        fwrite(&f1, sizeof(f1), 1, file);
+        fwrite(&f2, sizeof(f2), 1, file);
+    }
+
+    count = patterns.size();
+    fwrite(&count, sizeof(count), 1, file);
+
+    for (const auto& i : patterns) {
+        uint64_t f1 = i.index;
+        fwrite(&f1, sizeof(f1), 1, file);
+        double f2 = i.square_sum;
+        fwrite(&f2, sizeof(f2), 1, file);
+
+        count = i.tf_idfs.size();
+        fwrite(&count, sizeof(count), 1, file);
+        for (const auto& t : i.tf_idfs) {
+            uint64_t f1 = t.hash;
+            double f2 = t.value;
+            fwrite(&f1, sizeof(f1), 1, file);
+            fwrite(&f2, sizeof(f2), 1, file);
+        }
+    }
+}
+
+bool BagOfPatterns::load(const char* filename)
+{
+    FILE* file = fopen(filename, "rb");
+    if (!file)
+        return false;
+
+    uint64_t count = 0;
+    if (fread(&count, sizeof(count), 1, file) != 1) {
+        fclose(file);
+        return false;
+    }
+
+    idfs.clear();
+    while (count--) {
+        uint64_t f1 = 0;
+        double f2 = 0;
+        int read = fread(&f1, sizeof(f1), 1, file);
+        read += fread(&f2, sizeof(f2), 1, file);
+        if (read != 2) {
+            fclose(file);
+            return false;
+        }
+        idfs[f1] = f2;
+    }
+
+    patterns.clear();
+    count = 0;
+    if (fread(&count, sizeof(count), 1, file) != 1) {
+        fclose(file);
+        return false;
+    }
+
+    while (count--) {
+        Pattern p;
+        uint64_t f1 = 0;
+        int read = fread(&f1, sizeof(f1), 1, file);
+        p.index = f1;
+        double f2 = 0;
+        read += fread(&f2, sizeof(f2), 1, file);
+        p.square_sum = f2;
+
+        uint64_t f3 = 0;
+        read += fread(&f3, sizeof(f3), 1, file);
+
+        if (read != 3) {
+            fclose(file);
+            return false;
+        }
+        while (f3--) {
+            uint64_t f1;
+            double f2;
+            read = fread(&f1, sizeof(f1), 1, file);
+            read += fread(&f2, sizeof(f2), 1, file);
+            if (read != 2) {
+                fclose(file);
+                return false;
+            }
+            p.tf_idfs.emplace_back(f1, f2);
+        }
+        patterns.push_back(p);
+    }
+
+    return true;
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/patterns_impl.cc 
new/Spooky-Patterns-XS-1.55/patterns_impl.cc
--- old/Spooky-Patterns-XS-1.53/patterns_impl.cc        2020-01-14 
09:13:11.444514270 +0100
+++ new/Spooky-Patterns-XS-1.55/patterns_impl.cc        2020-01-25 
15:42:56.479510549 +0100
@@ -14,6 +14,7 @@
 // with this program; if not, see <http://www.gnu.org/licenses/>.
 
 #include "patterns_impl.h"
+#include "Matcher.h"
 #include "SpookyV2.h"
 #include "TokenTree.h"
 #include <EXTERN.h>
@@ -30,102 +31,80 @@
 
 using namespace std;
 
-struct Token {
-    int linenumber;
-    uint64_t hash;
-    std::string text;
-};
-
-typedef std::vector<Token> TokenList;
-
 std::vector<AANode> TokenTree::nodes;
 
 const int MAX_TOKEN_LENGTH = 100;
 const int MAX_LINE_SIZE = 8000;
 
-struct Match {
-    int start;
-    int matched;
-    int pattern;
-    int sline;
-    int eline;
-};
-
-typedef std::list<Match> Matches;
-
-struct Matcher {
-    TokenTree ignore_tree;
-    TokenTree pattern_tree;
-
-    SSize_t longest_pattern;
+Matcher* Matcher::_self = 0;
 
-    static Matcher* _self;
-    static Matcher* self() { return _self; }
+Matcher* pattern_init_matcher()
+{
+    Matcher::self()->init();
+    return Matcher::self();
+}
 
-    bool to_ignore(uint64_t t)
-    {
-        return ignore_tree.find(t);
-    }
+void destroy_matcher(Matcher* m)
+{
+    // do nothing, we reuse the self
+}
 
-    Matcher()
-    {
-        if (_self) {
-            fprintf(stderr, "Matcher::self already initialized\n");
-        }
-        init();
+Matcher::Matcher()
+{
+    if (_self) {
+        fprintf(stderr, "Matcher::self already initialized\n");
     }
+    pattern_tree = new TokenTree;
+    init();
+}
 
-    void init()
-    {
-        TokenTree::nodes.clear();
-
-        ignore_tree.initNull();
-        pattern_tree.initNull();
-
-        // typical comment and markup - have to be single tokens!
-        static const char* ignored_tokens[] = {
-            "/", "//", "%", "%%", "dnl",
-            "#~", ";;", "\"\"", "--", "#:",
-            "\\", ">", "==", "::", "##", 0
-        };
+void Matcher::init()
+{
+    TokenTree::nodes.clear();
+    pattern_tree->initNull();
+    ignored_tokens.clear();
 
-        static TokenTree dummy_next;
+    // typical comment and markup - have to be single tokens!
+    static const char* _ignored_tokens[] = {
+        "dnl", "\\n", "\\r", 0
+    };
 
-        int index = 0;
-        while (ignored_tokens[index]) {
-            int len = strlen(ignored_tokens[index]);
-            uint64_t h = SpookyHash::Hash64(ignored_tokens[index], len, 1);
-            ignore_tree.insert(h, &dummy_next);
-            index++;
-        }
-        longest_pattern = 0;
+    int index = 0;
+    while (_ignored_tokens[index]) {
+        int len = strlen(_ignored_tokens[index]);
+        uint64_t h = SpookyHash::Hash64(_ignored_tokens[index], len, 1);
+        ignored_tokens.insert(h);
+        index++;
     }
-};
-
-Matcher* Matcher::_self = 0;
+    longest_pattern = 0;
+}
 
-Matcher* pattern_init_matcher()
+// check if the token is purely non alpha numeric
+bool Matcher::to_ignore(const char* text, unsigned int len) const
 {
-    if (!Matcher::_self)
-        Matcher::_self = new Matcher;
-
-    Matcher::_self->init();
-
-    return Matcher::_self;
+    if (!len)
+        return true;
+    uint64_t index = 0;
+    while (index < len) {
+        if (isalnum(text[index]))
+            return false;
+        index++;
+    }
+    //cerr << "ignore '" << string(text, len) << endl;
+    return true;
 }
 
-void destroy_matcher(Matcher* m)
+bool Matcher::to_ignore(uint64_t t) const
 {
-    // do nothing, we reuse the self
+    return ignored_tokens.find(t) != ignored_tokens.end();
 }
 
-static void add_token(Matcher* m, TokenList& result, const char* start, size_t 
len, int line)
+void Matcher::add_token(TokenList& result, const char* start, size_t len, int 
line) const
 {
-    if (!len)
+    if (to_ignore(start, len))
         return;
 
     Token t;
-    t.text = std::string(start, len);
     t.linenumber = line;
     t.hash = 0;
     if (!line && len > 5 && len < 9 && !strncmp(start, "$skip", 5)) {
@@ -137,21 +116,30 @@
         if (*endptr || t.hash > MAX_SKIP) // more than just a number
             t.hash = 0;
     }
+    // very special case
+    if (start[len - 1] == '.') {
+        len--;
+    }
+    if (start[0] == '+' || start[0] == '-') {
+        start++;
+        len--;
+    }
+    t.text = std::string(start, len);
     if (!t.hash) {
         // hash64 has no collisions on our patterns and is very fast
         // *and* 0-3000 (at least) are "free"
         t.hash = SpookyHash::Hash64(start, len, 1);
         assert(t.hash > MAX_SKIP);
-        if (m->to_ignore(t.hash))
+        if (to_ignore(t.hash))
             return;
     }
     result.push_back(t);
 }
 
-void tokenize(Matcher* m, TokenList& result, char* str, int linenumber = 0)
+void Matcher::tokenize(TokenList& result, char* str, int linenumber)
 {
-    static const char* ignore_seps = " \r\n\t*;,:!#{}()[]|";
-    static const char* single_seps = "-.+?\"\'`=<>";
+    static const char* ignore_seps = " \r\n\t*;,:!#{}()[]|><";
+    static const char* single_seps = "?\"\'`'=";
 
     const char* start = str;
 
@@ -162,14 +150,14 @@
         *str = tolower(*str);
         bool ignored = (strchr(ignore_seps, *str) != NULL);
         if (ignored || strchr(single_seps, *str)) {
-            add_token(m, result, start, str - start, linenumber);
+            add_token(result, start, str - start, linenumber);
             //fprintf(stderr, "TO %d:'%s'\n", ignored, str);
             if (!ignored)
-                add_token(m, result, str, 1, linenumber);
+                add_token(result, str, 1, linenumber);
             start = str + 1;
         }
     }
-    add_token(m, result, start, str - start, linenumber);
+    add_token(result, start, str - start, linenumber);
 }
 
 AV* pattern_parse(const char* str)
@@ -182,7 +170,7 @@
         fprintf(stderr, "Need a Matcher - call init_matcher\n");
         return ret;
     }
-    tokenize(m, t, copy);
+    m->tokenize(t, copy);
     free(copy);
     av_extend(ret, t.size());
     int index = 0;
@@ -197,7 +185,7 @@
         ++index;
     }
     // do not end with an expansion variable either
-    if (last_hash <= MAX_SKIP)  {
+    if (last_hash <= MAX_SKIP) {
         av_pop(ret);
     }
 
@@ -229,13 +217,13 @@
 
 void pattern_add(Matcher* m, unsigned int id, av* tokens)
 {
-    SSize_t len = av_top_index(tokens) + 1;
+    ssize_t len = av_top_index(tokens) + 1;
     if (!len) {
         std::cerr << "add failed for id " << id << std::endl;
         return;
     }
 
-    TokenTree* current = &m->pattern_tree;
+    TokenTree* current = m->pattern_tree;
 
     for (SSize_t i = 0; i < len; ++i) {
         SV* sv = *av_fetch(tokens, i, 0);
@@ -260,23 +248,24 @@
         m->longest_pattern = len;
 }
 
-void add_match(const TokenList& ts, Matches& ms, int tokenlist_offset, int 
tokenlist_index, unsigned int matched, int pid) {
-  Match m;
-  m.start = tokenlist_offset + tokenlist_index;
-  m.matched = matched - tokenlist_index;
+void add_match(const TokenList& ts, Matches& ms, int tokenlist_offset, int 
tokenlist_index, unsigned int matched, int pid)
+{
+    Match m;
+    m.start = tokenlist_offset + tokenlist_index;
+    m.matched = matched - tokenlist_index;
 
-  m.sline = ts[tokenlist_index].linenumber;
-  m.eline = ts[matched - 1].linenumber;
+    m.sline = ts[tokenlist_index].linenumber;
+    m.eline = ts[matched - 1].linenumber;
 
-  m.pattern = pid;
+    m.pattern = pid;
 #if DEBUG
-  fprintf(stderr, "L %d(%d)-%d(%d) id:%d\n", ts[tokenlist_index].linenumber,
-      tokenlist_offset + tokenlist_index, ts[matched - 1].linenumber, 
m.matched, m.pattern);
+    fprintf(stderr, "L %d(%d)-%d(%d) id:%d\n", ts[tokenlist_index].linenumber,
+        tokenlist_offset + tokenlist_index, ts[matched - 1].linenumber, 
m.matched, m.pattern);
 #endif
-  ms.push_back(m);
+    ms.push_back(m);
 }
 
-void check_token_matches(const TokenList& tokens, Matches &ms, int 
tokenlist_offset, int tokenlist_index, unsigned int offset, const TokenTree* 
patterns)
+void check_token_matches(const TokenList& tokens, Matches& ms, int 
tokenlist_offset, int tokenlist_index, unsigned int offset, const TokenTree* 
patterns)
 {
     if (offset >= tokens.size())
         return;
@@ -309,7 +298,6 @@
     }
 }
 
-
 // if either the start or the end of one region is within the other
 bool match_overlap(int s1, int e1, int s2, int e2)
 {
@@ -322,7 +310,7 @@
 
 void find_tokens(Matcher* m, TokenList& ts, Matches& ms, int tokenlist_offset, 
int tokenlist_index)
 {
-    TokenTree* patterns = m->pattern_tree.find(ts[tokenlist_index].hash);
+    TokenTree* patterns = m->pattern_tree->find(ts[tokenlist_index].hash);
     if (!patterns)
         return;
     check_token_matches(ts, ms, tokenlist_offset, tokenlist_index, 
tokenlist_index + 1, patterns);
@@ -344,7 +332,7 @@
     Matches ms;
     int token_offset = 0;
     while (fgets(line, sizeof(line) - 1, input)) {
-        tokenize(m, ts, line, linenumber++);
+        m->tokenize(ts, line, linenumber++);
         // preserve memory
         if (SSize_t(ts.size()) > m->longest_pattern * 100) {
             unsigned int erasing = ts.size() - m->longest_pattern - 1;
@@ -401,7 +389,7 @@
 
     SerializeInfo si;
 
-    m->pattern_tree.mark_elements(si);
+    m->pattern_tree->mark_elements(si);
     fwrite(&si.tree_count, sizeof(si.tree_count), 1, file);
     uint32_t count = TokenTree::nodes.size();
     fwrite(&count, sizeof(count), 1, file);
@@ -461,7 +449,7 @@
         fwrite(&index, sizeof(int32_t), 1, file);
     }
 
-    uint32_t index = m->pattern_tree.root;
+    uint32_t index = m->pattern_tree->root;
     fwrite(&index, sizeof(uint32_t), 1, file);
 
     fclose(file);
@@ -527,7 +515,7 @@
 
     TokenTree::nodes.clear();
     TokenTree::nodes.reserve(node_count);
-    m->pattern_tree.initNull();
+    m->pattern_tree->initNull();
 
     for (unsigned int i = 1; i < node_count; i++) {
         uint64_t element = *reinterpret_cast<uint64_t*>(dump);
@@ -545,7 +533,7 @@
 
     //delete [] trees;
 
-    m->pattern_tree.root = *reinterpret_cast<uint32_t*>(dump);
+    m->pattern_tree->root = *reinterpret_cast<uint32_t*>(dump);
     dump += sizeof(uint32_t);
 
     munmap(dump, attr.st_size);
@@ -572,7 +560,7 @@
             // fgets makes sure we have a 0 at the end
             size_t len = strlen(line);
             // chop
-            if (len && line[len-1] == '\n') {
+            if (len && line[len - 1] == '\n') {
                 line[--len] = 0;
             }
             AV* row = newAV();
@@ -674,10 +662,6 @@
 {
     AV* ret = newAV();
     Matcher* m = Matcher::self();
-    if (!m) {
-        fprintf(stderr, "Need a Matcher - call init_matcher\n");
-        return ret;
-    }
     TokenList t;
     int line = 1;
     while (true) {
@@ -687,7 +671,7 @@
             copy = strndup(p, nl - p);
         else
             copy = strdup(p);
-        tokenize(m, t, copy, line++);
+        m->tokenize(t, copy, line++);
         free(copy);
         if (!nl)
             break;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/patterns_impl.h new/Spooky-Patterns-XS-1.55/patterns_impl.h
--- old/Spooky-Patterns-XS-1.53/patterns_impl.h 2020-01-14 09:13:11.444514270 +0100
+++ new/Spooky-Patterns-XS-1.55/patterns_impl.h 2020-01-24 08:27:50.792375119 +0100
@@ -22,16 +22,26 @@
 AV* pattern_parse(const char* str);
 AV* pattern_normalize(const char* str);
 int pattern_distance(AV* a1, AV* a2);
+AV* pattern_read_lines(const char* filename, HV* needed);
+
 struct Matcher;
-class SpookyHash;
 Matcher* pattern_init_matcher();
 void pattern_add(Matcher* m, unsigned id, AV* tokens);
 AV* pattern_find_matches(Matcher* m, const char* filename);
 void pattern_dump(Matcher* m, const char* filename);
 void pattern_load(Matcher* m, const char* filename);
 void destroy_matcher(Matcher* m);
-AV* pattern_read_lines(const char* filename, HV* needed);
+
+class SpookyHash;
 SpookyHash* pattern_init_hash(UV seed1, UV seed2);
 void pattern_add_to_hash(SpookyHash* s, SV* sv);
 void destroy_hash(SpookyHash* s);
 AV* pattern_hash128(SpookyHash* s);
+
+class BagOfPatterns;
+BagOfPatterns* pattern_init_bag_of_patterns();
+void destroy_bag_of_patterns(BagOfPatterns *b);
+void pattern_bag_set_patterns(BagOfPatterns *b, HV *patterns);
+AV *pattern_bag_best_for(BagOfPatterns *b, const char *str, int count);
+void pattern_bag_dump(BagOfPatterns* b, const char* filename);
+bool pattern_bag_load(BagOfPatterns* b, const char* filename);
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/04license.13.txt new/Spooky-Patterns-XS-1.55/t/04license.13.txt
--- old/Spooky-Patterns-XS-1.53/t/04license.13.txt      1970-01-01 01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/04license.13.txt      2020-01-25 12:12:15.365695264 +0100
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: EPL-1.0+
+
+Hallo
+
+// SPDX-License-Identifier: EPL-1.0
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/04license.30.pattern new/Spooky-Patterns-XS-1.55/t/04license.30.pattern
--- old/Spooky-Patterns-XS-1.53/t/04license.30.pattern  1970-01-01 01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/04license.30.pattern  2020-01-25 12:11:43.419589102 +0100
@@ -0,0 +1 @@
+SPDX-License-Identifier: EPL-1.0
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/04license.31.pattern new/Spooky-Patterns-XS-1.55/t/04license.31.pattern
--- old/Spooky-Patterns-XS-1.53/t/04license.31.pattern  1970-01-01 01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/04license.31.pattern  2020-01-25 12:11:51.204102326 +0100
@@ -0,0 +1 @@
+SPDX-License-Identifier: EPL-1.0+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/04license.t new/Spooky-Patterns-XS-1.55/t/04license.t
--- old/Spooky-Patterns-XS-1.53/t/04license.t   2020-01-14 09:13:11.444514270 +0100
+++ new/Spooky-Patterns-XS-1.55/t/04license.t   2020-01-25 14:14:23.333200814 +0100
@@ -31,7 +31,8 @@
     9  => [ [ 20, 1,   29 ] ],
     10 => [ [ 22, 113, 114 ], [ 4, 112, 112 ], [ 3, 109, 109 ] ],
     11 => [ [ 25, 4,   15 ], [ 23, 2, 2 ], [ 4, 2, 2 ] ],
-    12 => [ [ 29, 1, 115 ] ]
+    12 => [ [ 29, 1,   115 ] ],
+    13 => [ [ 31, 1,   1 ], [ 30, 5, 5 ] ],
 );
 
 for my $fn ( glob("t/04license.*.txt") ) {
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/05readlines.t new/Spooky-Patterns-XS-1.55/t/05readlines.t
--- old/Spooky-Patterns-XS-1.53/t/05readlines.t 2020-01-14 09:13:11.444514270 +0100
+++ new/Spooky-Patterns-XS-1.55/t/05readlines.t 2020-01-25 14:14:23.349201868 +0100
@@ -29,6 +29,10 @@
 $ret = Spooky::Patterns::XS::read_lines( 't/05readlines.2.raw', { 1 => 1 } );
 
 $ret = Spooky::Patterns::XS::read_lines( 't/04license.12.txt', { 115 => 1 } );
-cmp_deeply($ret, [ [ 115, 1, 'END OF TERMS AND CONDITIONS' ] ], "end of file returned" );
+cmp_deeply(
+    $ret,
+    [ [ 115, 1, 'END OF TERMS AND CONDITIONS' ] ],
+    "end of file returned"
+);
 
 done_testing();
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/06hash.t new/Spooky-Patterns-XS-1.55/t/06hash.t
--- old/Spooky-Patterns-XS-1.53/t/06hash.t      2019-07-27 14:53:24.498286975 +0200
+++ new/Spooky-Patterns-XS-1.55/t/06hash.t      2020-01-25 14:14:23.361202659 +0100
@@ -10,8 +10,8 @@
 my $h = Spooky::Patterns::XS::init_hash( 0, 0 );
 $h->add("Hállöchen\n");
 $h->add("abc\x{300}");
-is( $h->hex, 'd6d58320114a2d3c1d6dd671ab0383ec', "Hhex correct" );
-is( $h->hash64, 15480423467908214076, "Hash64 correct" );
+is( $h->hex,    'd6d58320114a2d3c1d6dd671ab0383ec', "Hhex correct" );
+is( $h->hash64, 15480423467908214076,               "Hash64 correct" );
 
 $h = Spooky::Patterns::XS::init_hash( 0, 0 );
 $h->add("/* \n");
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/07close.t new/Spooky-Patterns-XS-1.55/t/07close.t
--- old/Spooky-Patterns-XS-1.53/t/07close.t     2020-01-14 09:13:11.444514270 +0100
+++ new/Spooky-Patterns-XS-1.55/t/07close.t     2020-01-25 14:14:23.385204241 +0100
@@ -13,7 +13,7 @@
 Spooky::Patterns::XS::init_matcher();
 my $p1 = Spooky::Patterns::XS::normalize( read_file('t/07close.p1') );
 my $p2 = Spooky::Patterns::XS::normalize( read_file('t/07close.p2') );
-is( Spooky::Patterns::XS::distance( $p1, $p2 ), 4, "Distance is 2" );
+is( Spooky::Patterns::XS::distance( $p1, $p2 ), 4, "Distance is right" );
 
 my @words1 = map { $_->[1] } @$p1;
 my @words2 = map { $_->[1] } @$p2;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/08bag.t new/Spooky-Patterns-XS-1.55/t/08bag.t
--- old/Spooky-Patterns-XS-1.53/t/08bag.t       1970-01-01 01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/08bag.t       2020-01-25 14:56:51.782687801 +0100
@@ -0,0 +1,59 @@
+#! /usr/bin/perl
+
+use 5.012;
+use strict;
+use warnings;
+use Test::More;
+use Spooky::Patterns::XS;
+
+my %patterns;
+$patterns{42} = "This is the great GPL";
+$patterns{17} = "Artistic is great too";
+
+my $bag = Spooky::Patterns::XS::init_bag_of_patterns;
+$bag->set_patterns( \%patterns );
+$bag->dump('t/08bag.dump');
+$bag->load('t/08bag.dump');
+my $result = $bag->best_for( 'GPL is great', 2 );
+
+is( scalar @$result, 2, 'right number' );
+is_deeply( $result->[0], { pattern => 42, match => 0.5773 }, 'fits GPL' );
+
+done_testing();
+
+=benchmark
+  use Mojo::File;
+  use Mojo::JSON 'decode_json';
+  
+  # https://stephan.kulow.org/test.json.xz
+  my $json = Mojo::File->new('test.json')->slurp;
+  $json = decode_json($json);
+  $bag  = Spooky::Patterns::XS::init_bag_of_patterns;
+  $bag->set_patterns( $json->{patterns} );
+  
+  #$bag->dump('test.dump');
+  #$bag->load('cavil.pattern.bag');
+  $result = $bag->best_for( $json->{snippets}{2061026}, 10 );
+  use Data::Dumper;
+  diag Dumper($result);
+  is_deeply( $result->[0], { pattern => 2430, match => 0.9228 }, 'fits' );
+  
+  diag "lookup:";
+  diag $json->{snippets}{2061026};
+  
+  diag "33214:";
+  diag $json->{patterns}{33214};
+  
+  diag "2430:";
+  diag $json->{patterns}{2430};
+  
+  my $stime = time;
+  my $count = 0;
+  for my $snippet ( keys %{ $json->{snippets} } ) {
+      $result = $bag->best_for( $json->{snippets}{$snippet}, 1 )->[0];
+      $count++;
+      my $delta = time - $stime;
+      diag "$snippet: $count/$delta $result->{pattern}/$result->{match}";
+      last if ( $delta > 10 || $count > 1000 );
+  }
+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/09normalize.1.in new/Spooky-Patterns-XS-1.55/t/09normalize.1.in
--- old/Spooky-Patterns-XS-1.53/t/09normalize.1.in      1970-01-01 01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/09normalize.1.in      2020-01-25 12:45:04.789174947 +0100
@@ -0,0 +1,27 @@
+**  LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+**  INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.  IN NO
+**  EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+**  CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+**  USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+**  OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+**  PERFORMANCE OF THIS SOFTWARE.
+**
+**  Copyright (C) 1990, RSA Data Security, Inc.  All rights reserved.
+**
+**  License to copy and use this software is granted provided that it is
+**  identified as the "RSA Data Security, Inc. MD5 Message-Digest Algorithm"
+**  in all material mentioning or referencing this software or this
+**  function.
+**
+**  License is also granted to make and use derivative works provided that
+**  such works are identified as "derived from the RSA Data Security,
+**  Inc. MD5 Message-Digest Algorithm" in all material mentioning or
+**  referencing the derived work.
+**
+**  RSA Data Security, Inc. makes no representations concerning either the
+**  merchantability of this software or the suitability of this software for
+**  any particular purpose.  It is provided "as is" without express or
+**  implied warranty of any kind.
+**
+**  These notices must be retained in any copies of any part of this
+**  documentation and/or software.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/09normalize.1.out new/Spooky-Patterns-XS-1.55/t/09normalize.1.out
--- old/Spooky-Patterns-XS-1.53/t/09normalize.1.out     1970-01-01 01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/09normalize.1.out     2020-01-25 13:50:23.721816938 +0100
@@ -0,0 +1,27 @@
+landon curt noll disclaims all warranties with regard to this software 
+including all implied warranties of merchantability and fitness in no 
+event shall landon curt noll be liable for any special indirect or 
+consequential damages or any damages whatsoever resulting from loss of 
+use data or profits whether in an action of contract negligence or 
+other tortious action arising out of or in connection with the use or 
+performance of this software 
+
+copyright c 1990 rsa data security inc all rights reserved 
+
+license to copy and use this software is granted provided that it is 
+identified as the rsa data security inc md5 message-digest algorithm 
+in all material mentioning or referencing this software or this 
+function 
+
+license is also granted to make and use derivative works provided that 
+such works are identified as derived from the rsa data security 
+inc md5 message-digest algorithm in all material mentioning or 
+referencing the derived work 
+
+rsa data security inc makes no representations concerning either the 
+merchantability of this software or the suitability of this software for 
+any particular purpose it is provided as is without express or 
+implied warranty of any kind 
+
+these notices must be retained in any copies of any part of this 
+documentation and/or software 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/09normalize.2.in new/Spooky-Patterns-XS-1.55/t/09normalize.2.in
--- old/Spooky-Patterns-XS-1.53/t/09normalize.2.in      1970-01-01 01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/09normalize.2.in      2020-01-25 15:11:42.346670056 +0100
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: EPL-1.0+
+
+Hallo
+
+// SPDX-License-Identifier: EPL-1.0
+
+<p>SPDX-License-Identifier: EPL-1.0</p>
+
++Hallo World!
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/09normalize.2.out new/Spooky-Patterns-XS-1.55/t/09normalize.2.out
--- old/Spooky-Patterns-XS-1.53/t/09normalize.2.out     1970-01-01 01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/09normalize.2.out     2020-01-25 15:12:57.075596854 +0100
@@ -0,0 +1,9 @@
+spdx-license-identifier epl-1.0+ 
+
+hallo 
+
+spdx-license-identifier epl-1.0 
+
+p spdx-license-identifier epl-1.0 /p 
+
+hallo world 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/09normalize.t new/Spooky-Patterns-XS-1.55/t/09normalize.t
--- old/Spooky-Patterns-XS-1.53/t/09normalize.t 1970-01-01 01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/09normalize.t 2020-01-25 14:14:23.413206088 +0100
@@ -0,0 +1,35 @@
+#! /usr/bin/perl
+
+use 5.012;
+use strict;
+use warnings;
+use Test::More;
+use File::Slurp;
+use Spooky::Patterns::XS;
+
+sub compare_case {
+    my $case = shift;
+    my $p1 =
+      Spooky::Patterns::XS::normalize( read_file("t/09normalize.$case.in") );
+    my @outlines;
+
+    my $txt;
+    my $prevline = 1;
+    for my $token (@$p1) {
+        while ( $prevline < $token->[0] ) {
+            $txt .= "\n";
+            $prevline++;
+        }
+        $txt .= $token->[1] . " ";
+    }
+    chomp $txt;
+    my $exp = read_file("t/09normalize.$case.out");
+    chomp $exp;
+
+    is( $txt, $exp );
+}
+
+compare_case(1);
+compare_case(2);
+
+done_testing();
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/typemap new/Spooky-Patterns-XS-1.55/typemap
--- old/Spooky-Patterns-XS-1.53/typemap 2017-03-15 06:58:08.298117819 +0100
+++ new/Spooky-Patterns-XS-1.55/typemap 2020-01-23 17:12:57.262561953 +0100
@@ -1,3 +1,4 @@
-Spooky::Patterns::XS::Matcher     T_PTROBJ
-Spooky::Patterns::XS::Hash        T_PTROBJ
-AV*                              T_AVREF_REFCOUNT_FIXED
\ No newline at end of file
+Spooky::Patterns::XS::Matcher       T_PTROBJ
+Spooky::Patterns::XS::Hash          T_PTROBJ
+Spooky::Patterns::XS::BagOfPatterns T_PTROBJ
+AV*                                T_AVREF_REFCOUNT_FIXED


Reply via email to