Hello community,
here is the log from the commit of package perl-Spooky-Patterns-XS for
openSUSE:Factory checked in at 2020-01-28 10:54:36
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/perl-Spooky-Patterns-XS (Old)
and /work/SRC/openSUSE:Factory/.perl-Spooky-Patterns-XS.new.26092 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "perl-Spooky-Patterns-XS"
Tue Jan 28 10:54:36 2020 rev:5 rq:767807 version:1.55
Changes:
--------
---
/work/SRC/openSUSE:Factory/perl-Spooky-Patterns-XS/perl-Spooky-Patterns-XS.changes
2020-01-16 18:22:27.565016170 +0100
+++
/work/SRC/openSUSE:Factory/.perl-Spooky-Patterns-XS.new.26092/perl-Spooky-Patterns-XS.changes
2020-01-28 10:54:54.789012811 +0100
@@ -1,0 +2,10 @@
+Sat Jan 25 14:49:03 UTC 2020 - Stephan Kulow <[email protected]>
+
+- 1.55: Ignore more tokens on matching
+
+-------------------------------------------------------------------
+Thu Jan 23 16:00:47 UTC 2020 - Stephan Kulow <[email protected]>
+
+- 1.54: Add BagOfPatterns to calculate closest pattern
+
+-------------------------------------------------------------------
Old:
----
Spooky-Patterns-XS-1.53.tar.gz
New:
----
Spooky-Patterns-XS-1.55.tar.gz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ perl-Spooky-Patterns-XS.spec ++++++
--- /var/tmp/diff_new_pack.DC364B/_old 2020-01-28 10:54:55.757013944 +0100
+++ /var/tmp/diff_new_pack.DC364B/_new 2020-01-28 10:54:55.757013944 +0100
@@ -17,7 +17,7 @@
Name: perl-Spooky-Patterns-XS
-Version: 1.53
+Version: 1.55
Release: 0
%define cpan_name Spooky-Patterns-XS
Summary: Spooky::Patterns::XS Perl module
++++++ Spooky-Patterns-XS-1.53.tar.gz -> Spooky-Patterns-XS-1.55.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/Changes
new/Spooky-Patterns-XS-1.55/Changes
--- old/Spooky-Patterns-XS-1.53/Changes 2020-01-14 09:13:11.440514006 +0100
+++ new/Spooky-Patterns-XS-1.55/Changes 2020-01-25 15:44:14.236636977 +0100
@@ -1,5 +1,12 @@
Revision history for Perl extension Spooky::Patterns::XS
+1.55 2020-01-25
+ - Way stronger strategy on ignoring characters that
+ do not add value
+
+1.54 2020-01-23
+ - Add Bag of Patterns to calculate nearest pattern
+
1.53 2020-01-14
- Fix read_lines to return the last line in file correctly
if it doesn't end with a newline
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/MANIFEST
new/Spooky-Patterns-XS-1.55/MANIFEST
--- old/Spooky-Patterns-XS-1.53/MANIFEST 2020-01-14 09:19:41.701153998
+0100
+++ new/Spooky-Patterns-XS-1.55/MANIFEST 2020-01-25 15:48:09.797617393
+0100
@@ -1,7 +1,9 @@
+bag_impl.cc
Changes
COPYING
Makefile.PL
MANIFEST This list of files
+Matcher.h
patterns_impl.cc
patterns_impl.h
SpookyV2.cpp
@@ -10,8 +12,6 @@
t/02compile.t
t/03match.t
t/03match.txt
-t/04license.1.pattern
-t/04license.1.txt
t/04license.10.pattern
t/04license.10.txt
t/04license.11.pattern
@@ -19,14 +19,15 @@
t/04license.12.pattern
t/04license.12.txt
t/04license.13.pattern
+t/04license.13.txt
t/04license.14.pattern
t/04license.15.pattern
t/04license.16.pattern
t/04license.17.pattern
t/04license.18.pattern
t/04license.19.pattern
-t/04license.2.pattern
-t/04license.2.txt
+t/04license.1.pattern
+t/04license.1.txt
t/04license.20.pattern
t/04license.21.pattern
t/04license.22.pattern
@@ -37,6 +38,10 @@
t/04license.27.pattern
t/04license.28.pattern
t/04license.29.pattern
+t/04license.2.pattern
+t/04license.2.txt
+t/04license.30.pattern
+t/04license.31.pattern
t/04license.3.pattern
t/04license.3.txt
t/04license.4.pattern
@@ -59,8 +64,14 @@
t/07close.p1
t/07close.p2
t/07close.t
-t/test.t
+t/08bag.t
+t/09normalize.1.in
+t/09normalize.1.out
+t/09normalize.t
+t/09normalize.2.in
+t/09normalize.2.out
TokenTree.h
+t/test.t
typemap
XS.pm
XS.xs
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/META.json
new/Spooky-Patterns-XS-1.55/META.json
--- old/Spooky-Patterns-XS-1.53/META.json 2020-01-14 09:19:41.649150569
+0100
+++ new/Spooky-Patterns-XS-1.55/META.json 2020-01-25 15:48:09.593603943
+0100
@@ -6,7 +6,7 @@
"dynamic_config" : 1,
"generated_by" : "ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter
version 2.150010",
"license" : [
- "unknown"
+ "gpl_2"
],
"meta-spec" : {
"url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
@@ -45,6 +45,6 @@
"url" : "https://github.com/coolo/spooky-pattern-xs"
}
},
- "version" : "1.53",
+ "version" : "1.55",
"x_serialization_backend" : "JSON::PP version 4.02"
}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/META.yml
new/Spooky-Patterns-XS-1.55/META.yml
--- old/Spooky-Patterns-XS-1.53/META.yml 2020-01-14 09:19:41.581146085
+0100
+++ new/Spooky-Patterns-XS-1.55/META.yml 2020-01-25 15:48:09.353588119
+0100
@@ -8,7 +8,7 @@
ExtUtils::MakeMaker: '0'
dynamic_config: 1
generated_by: 'ExtUtils::MakeMaker version 7.34, CPAN::Meta::Converter version
2.150010'
-license: unknown
+license: gpl
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
version: '1.4'
@@ -22,5 +22,5 @@
resources:
license: https://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
repository: https://github.com/coolo/spooky-pattern-xs
-version: '1.53'
+version: '1.55'
x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/Makefile.PL
new/Spooky-Patterns-XS-1.55/Makefile.PL
--- old/Spooky-Patterns-XS-1.53/Makefile.PL 2017-04-21 19:30:14.967869967
+0200
+++ new/Spooky-Patterns-XS-1.55/Makefile.PL 2020-01-23 17:12:57.258561689
+0100
@@ -19,7 +19,7 @@
},
LD => 'g++',
XSOPT => '-C++',
- LICENSE => 'GPL-2.0+',
+ LICENSE => 'GPL_2',
AUTHOR => 'Stephan Kulow <[email protected]>',
INC => join(' ', @INC),
LIBS => [ join(' ', @LIBPATH, @LIBS) ],
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/Matcher.h
new/Spooky-Patterns-XS-1.55/Matcher.h
--- old/Spooky-Patterns-XS-1.53/Matcher.h 1970-01-01 01:00:00.000000000
+0100
+++ new/Spooky-Patterns-XS-1.55/Matcher.h 2020-01-25 11:57:55.925056935
+0100
@@ -0,0 +1,48 @@
+#include <cstdint>
+#include <list>
+#include <vector>
+#include <string>
+#include <set>
+
+struct Match {
+ int start;
+ int matched;
+ int pattern;
+ int sline;
+ int eline;
+};
+
+typedef std::list<Match> Matches;
+
+struct Token {
+ int linenumber;
+ uint64_t hash;
+ std::string text;
+};
+
+typedef std::vector<Token> TokenList;
+
+class TokenTree;
+
+struct Matcher {
+ std::set<uint64_t> ignored_tokens;
+ TokenTree *pattern_tree;
+
+ ssize_t longest_pattern;
+
+ static Matcher* _self;
+ static Matcher* self() {
+ if (!_self) {
+ _self = new Matcher();
+ }
+
+ return _self;
+ }
+
+ Matcher();
+ bool to_ignore(uint64_t t) const;
+ bool to_ignore(const char *t, unsigned int len) const;
+ void init();
+ void add_token(TokenList& result, const char* start, size_t len, int line)
const;
+ void tokenize(TokenList& result, char* str, int linenumber = 0);
+};
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/XS.pm
new/Spooky-Patterns-XS-1.55/XS.pm
--- old/Spooky-Patterns-XS-1.53/XS.pm 2020-01-14 09:13:11.444514270 +0100
+++ new/Spooky-Patterns-XS-1.55/XS.pm 2020-01-25 15:43:26.253473510 +0100
@@ -23,7 +23,7 @@
our @ISA = qw(Exporter);
our @EXPORT_OK = qw();
-our $VERSION = '1.53';
+our $VERSION = '1.55';
require XSLoader;
XSLoader::load( 'Spooky::Patterns::XS', $VERSION );
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/XS.xs
new/Spooky-Patterns-XS-1.55/XS.xs
--- old/Spooky-Patterns-XS-1.53/XS.xs 2020-01-14 09:13:11.444514270 +0100
+++ new/Spooky-Patterns-XS-1.55/XS.xs 2020-01-24 08:29:00.768988607 +0100
@@ -8,6 +8,7 @@
typedef Matcher *Spooky__Patterns__XS__Matcher;
typedef SpookyHash *Spooky__Patterns__XS__Hash;
+typedef BagOfPatterns *Spooky__Patterns__XS__BagOfPatterns;
MODULE = Spooky::Patterns::XS PACKAGE = Spooky::Patterns::XS
@@ -19,7 +20,15 @@
OUTPUT:
RETVAL
-
+
+# pass a hash of integer index to string here
+Spooky::Patterns::XS::BagOfPatterns init_bag_of_patterns()
+ CODE:
+ RETVAL = pattern_init_bag_of_patterns();
+
+ OUTPUT:
+ RETVAL
+
AV *parse_tokens(const char *str)
CODE:
RETVAL = pattern_parse(str);
@@ -62,29 +71,29 @@
pattern_add(self, id, tokens);
AV *find_matches(Spooky::Patterns::XS::Matcher self, const char *filename)
- CODE:
- RETVAL = pattern_find_matches(self, filename);
+ CODE:
+ RETVAL = pattern_find_matches(self, filename);
- OUTPUT:
- RETVAL
+ OUTPUT:
+ RETVAL
void dump(Spooky::Patterns::XS::Matcher self, const char *filename)
- CODE:
- pattern_dump(self, filename);
+ CODE:
+ pattern_dump(self, filename);
void load(Spooky::Patterns::XS::Matcher self, const char *filename)
- CODE:
- pattern_load(self, filename);
+ CODE:
+ pattern_load(self, filename);
void DESTROY(Spooky::Patterns::XS::Matcher self)
- CODE:
- destroy_matcher(self);
+ CODE:
+ destroy_matcher(self);
MODULE = Spooky::Patterns::XS PACKAGE = Spooky::Patterns::XS::Hash PREFIX =
Hash
void DESTROY(Spooky::Patterns::XS::Hash self)
- CODE:
- destroy_hash(self);
+ CODE:
+ destroy_hash(self);
void add(Spooky::Patterns::XS::Hash self, SV *s)
CODE:
@@ -96,3 +105,31 @@
OUTPUT:
RETVAL
+
+MODULE = Spooky::Patterns::XS PACKAGE = Spooky::Patterns::XS::BagOfPatterns
PREFIX = BagOfPatterns
+
+void DESTROY(Spooky::Patterns::XS::BagOfPatterns self)
+ CODE:
+ destroy_bag_of_patterns(self);
+
+void set_patterns(Spooky::Patterns::XS::BagOfPatterns self, HV *patterns)
+ CODE:
+ pattern_bag_set_patterns(self, patterns);
+
+AV *best_for(Spooky::Patterns::XS::BagOfPatterns self, const char *str, int
count)
+ CODE:
+ RETVAL = pattern_bag_best_for(self, str, count);
+
+ OUTPUT:
+ RETVAL
+
+void dump(Spooky::Patterns::XS::BagOfPatterns self, const char *filename)
+ CODE:
+ pattern_bag_dump(self, filename);
+
+bool load(Spooky::Patterns::XS::BagOfPatterns self, const char *filename)
+ CODE:
+ RETVAL = pattern_bag_load(self, filename);
+
+ OUTPUT:
+ RETVAL
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/bag_impl.cc
new/Spooky-Patterns-XS-1.55/bag_impl.cc
--- old/Spooky-Patterns-XS-1.53/bag_impl.cc 1970-01-01 01:00:00.000000000
+0100
+++ new/Spooky-Patterns-XS-1.55/bag_impl.cc 2020-01-24 08:30:01.568751360
+0100
@@ -0,0 +1,350 @@
+// Copyright © 2020 SUSE LLC
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "Matcher.h"
+#include "patterns_impl.h"
+#include <EXTERN.h>
+#include <XSUB.h>
+// work around seed
+#undef seed
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <perl.h>
+#include <set>
+
+#define DEBUG 0
+
+using namespace std;
+
+typedef map<uint64_t, uint64_t> wordmap;
+
+// https://en.wikipedia.org/wiki/Tf%E2%80%93idf
+struct TfIdf {
+ uint64_t hash;
+ double value;
+ TfIdf(uint64_t _hash, double _value)
+ {
+ hash = _hash;
+ value = _value;
+ }
+ bool operator<(const TfIdf& str) const
+ {
+ return (hash < str.hash);
+ }
+};
+
+struct Pattern {
+ uint64_t index;
+ double square_sum;
+ vector<TfIdf> tf_idfs;
+};
+
+class BagOfPatterns {
+public:
+ BagOfPatterns() {}
+ void set_patterns(HV* patterns);
+ AV* best_for(const string& snippet, unsigned int count);
+ void dump(const char* filename) const;
+ bool load(const char* filename);
+
+private:
+ void tokenize(const char* str, wordmap& localwords);
+ double compare2(const vector<TfIdf>& tfdif1, const Pattern& pattern) const;
+ double tf_idf(const wordmap& l1, vector<TfIdf>& tfdif1);
+
+ map<uint64_t, double> idfs;
+ vector<Pattern> patterns;
+};
+
+BagOfPatterns* pattern_init_bag_of_patterns()
+{
+ return new BagOfPatterns();
+}
+
+void pattern_bag_set_patterns(BagOfPatterns* b, HV* patterns)
+{
+ b->set_patterns(patterns);
+}
+
+void destroy_bag_of_patterns(BagOfPatterns* b)
+{
+ delete b;
+}
+
+AV* pattern_bag_best_for(BagOfPatterns* b, const char* str, int count)
+{
+ return b->best_for(str, count);
+}
+
+void pattern_bag_dump(BagOfPatterns* b, const char* filename)
+{
+ b->dump(filename);
+}
+
+bool pattern_bag_load(BagOfPatterns* b, const char* filename)
+{
+ return b->load(filename);
+}
+
+void BagOfPatterns::set_patterns(HV* hv_patterns)
+{
+ idfs.clear();
+ patterns.clear();
+
+ wordmap words;
+ vector<wordmap> wordcounts;
+ vector<uint64_t> indexes;
+
+ hv_iterinit(hv_patterns);
+ HE* he;
+ while ((he = hv_iternext(hv_patterns)) != 0) {
+ I32 len;
+ char* key = hv_iterkey(he, &len);
+ unsigned int index = strtoul(key, 0, 10);
+
+ SV* svp = hv_iterval(hv_patterns, he);
+ if (!svp)
+ continue;
+
+ wordmap localwords;
+ tokenize(SvPV_nolen(svp), localwords);
+ indexes.push_back(index);
+ wordcounts.push_back(localwords);
+
+ for (wordmap::const_iterator it = localwords.begin(); it !=
localwords.end(); ++it) {
+ wordmap::iterator word_it = words.find(it->first);
+ if (word_it == words.end())
+ words[it->first] = 1;
+ else
+ word_it->second++;
+ }
+ }
+ for (wordmap::const_iterator it = words.begin(); it != words.end(); ++it) {
+ idfs[it->first] = log(double(indexes.size()) / it->second);
+ }
+
+ vector<uint64_t>::const_iterator index_it = indexes.begin();
+ vector<wordmap>::const_iterator words_it = wordcounts.begin();
+ for (; words_it != wordcounts.end(); ++words_it, ++index_it) {
+ Pattern p;
+ p.index = *index_it;
+ p.square_sum = tf_idf(*words_it, p.tf_idfs);
+ patterns.push_back(p);
+ }
+}
+
+void BagOfPatterns::tokenize(const char* str, wordmap& localwords)
+{
+ char* copy = strdup(str);
+ TokenList t;
+ Matcher::self()->tokenize(t, copy, 1);
+ free(copy);
+
+ // avoid '=======' dominating matches
+ uint64_t last_hash = 0;
+ for (TokenList::const_iterator it = t.begin(); it != t.end(); ++it) {
+ if (it->hash == last_hash)
+ continue;
+ last_hash = it->hash;
+ wordmap::iterator word_it = localwords.find(it->hash);
+ if (word_it == localwords.end()) {
+ localwords[it->hash] = 1;
+ } else {
+ word_it->second++;
+ }
+ }
+}
+
+double BagOfPatterns::tf_idf(const wordmap& words, vector<TfIdf>& tf_idfs)
+{
+ double square_sum = 0;
+ for (wordmap::const_iterator it = words.begin(); it != words.end(); ++it) {
+ double value = it->second * idfs[it->first];
+ square_sum += value * value;
+ tf_idfs.emplace_back(it->first, value);
+ }
+ sort(tf_idfs.begin(), tf_idfs.end());
+ return sqrt(square_sum);
+}
+
+double BagOfPatterns::compare2(const vector<TfIdf>& tf_idfs1, const Pattern&
pattern) const
+{
+ // both vectors are assumed to be sorted by hash
+ double sum = 0;
+ vector<TfIdf>::const_iterator it1 = pattern.tf_idfs.begin();
+ vector<TfIdf>::const_iterator it2 = tf_idfs1.begin();
+ while (it1 != pattern.tf_idfs.end() && it2 != tf_idfs1.end()) {
+ if (it1->hash == it2->hash) {
+ sum += it1->value * it2->value;
+ ++it1;
+ ++it2;
+ } else if (it1->hash > it2->hash) {
+ ++it2;
+ } else {
+ ++it1;
+ }
+ }
+ return sum / pattern.square_sum;
+}
+
+AV* BagOfPatterns::best_for(const string& snippet, unsigned int count)
+{
+ AV* result = newAV();
+
+ wordmap localwords;
+ tokenize(snippet.c_str(), localwords);
+
+ double highscore = -1;
+
+ vector<TfIdf> tfidf;
+ double square_sum = tf_idf(localwords, tfidf);
+
+ struct BagHit {
+ BagHit() {} // not used
+ BagHit(double _match, uint64_t _index)
+ {
+ match = _match;
+ index = _index;
+ }
+ bool operator<(const BagHit& rhs)
+ {
+ return match < rhs.match;
+ }
+ double match;
+ uint64_t index;
+ };
+ vector<BagHit> hits;
+ vector<Pattern>::const_iterator it = patterns.begin();
+ for (; it != patterns.end(); ++it) {
+ double match = compare2(tfidf, *it);
+ if (match > highscore) {
+ hits.emplace_back(match, it->index);
+ sort(hits.rbegin(), hits.rend());
+ if (hits.size() > count) {
+ hits.resize(count);
+ highscore = hits.back().match;
+ }
+ }
+ }
+ for (const auto& i : hits) {
+ HV* hv = (HV*)sv_2mortal((SV*)newHV());
+ hv_store(hv, "pattern", 7, newSVuv(i.index), 0);
+ hv_store(hv, "match", 5, newSVnv(int(i.match * 10000 / square_sum) /
10000.), 0);
+ av_push(result, newRV_inc((SV*)hv));
+ }
+
+ return result;
+}
+
+void BagOfPatterns::dump(const char* filename) const
+{
+ FILE* file = fopen(filename, "wb");
+
+ uint64_t count = idfs.size();
+ fwrite(&count, sizeof(count), 1, file);
+
+ map<uint64_t, double>::const_iterator it = idfs.begin();
+ for (; it != idfs.end(); ++it) {
+ uint64_t f1 = it->first;
+ double f2 = it->second;
+ fwrite(&f1, sizeof(f1), 1, file);
+ fwrite(&f2, sizeof(f2), 1, file);
+ }
+
+ count = patterns.size();
+ fwrite(&count, sizeof(count), 1, file);
+
+ for (const auto& i : patterns) {
+ uint64_t f1 = i.index;
+ fwrite(&f1, sizeof(f1), 1, file);
+ double f2 = i.square_sum;
+ fwrite(&f2, sizeof(f2), 1, file);
+
+ count = i.tf_idfs.size();
+ fwrite(&count, sizeof(count), 1, file);
+ for (const auto& t : i.tf_idfs) {
+ uint64_t f1 = t.hash;
+ double f2 = t.value;
+ fwrite(&f1, sizeof(f1), 1, file);
+ fwrite(&f2, sizeof(f2), 1, file);
+ }
+ }
+}
+
+bool BagOfPatterns::load(const char* filename)
+{
+ FILE* file = fopen(filename, "rb");
+ if (!file)
+ return false;
+
+ uint64_t count = 0;
+ if (fread(&count, sizeof(count), 1, file) != 1) {
+ fclose(file);
+ return false;
+ }
+
+ idfs.clear();
+ while (count--) {
+ uint64_t f1 = 0;
+ double f2 = 0;
+ int read = fread(&f1, sizeof(f1), 1, file);
+ read += fread(&f2, sizeof(f2), 1, file);
+ if (read != 2) {
+ fclose(file);
+ return false;
+ }
+ idfs[f1] = f2;
+ }
+
+ patterns.clear();
+ count = 0;
+ if (fread(&count, sizeof(count), 1, file) != 1) {
+ fclose(file);
+ return false;
+ }
+
+ while (count--) {
+ Pattern p;
+ uint64_t f1 = 0;
+ int read = fread(&f1, sizeof(f1), 1, file);
+ p.index = f1;
+ double f2 = 0;
+ read += fread(&f2, sizeof(f2), 1, file);
+ p.square_sum = f2;
+
+ uint64_t f3 = 0;
+ read += fread(&f3, sizeof(f3), 1, file);
+
+ if (read != 3) {
+ fclose(file);
+ return false;
+ }
+ while (f3--) {
+ uint64_t f1;
+ double f2;
+ read = fread(&f1, sizeof(f1), 1, file);
+ read += fread(&f2, sizeof(f2), 1, file);
+ if (read != 2) {
+ fclose(file);
+ return false;
+ }
+ p.tf_idfs.emplace_back(f1, f2);
+ }
+ patterns.push_back(p);
+ }
+
+ return true;
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/patterns_impl.cc
new/Spooky-Patterns-XS-1.55/patterns_impl.cc
--- old/Spooky-Patterns-XS-1.53/patterns_impl.cc 2020-01-14
09:13:11.444514270 +0100
+++ new/Spooky-Patterns-XS-1.55/patterns_impl.cc 2020-01-25
15:42:56.479510549 +0100
@@ -14,6 +14,7 @@
// with this program; if not, see <http://www.gnu.org/licenses/>.
#include "patterns_impl.h"
+#include "Matcher.h"
#include "SpookyV2.h"
#include "TokenTree.h"
#include <EXTERN.h>
@@ -30,102 +31,80 @@
using namespace std;
-struct Token {
- int linenumber;
- uint64_t hash;
- std::string text;
-};
-
-typedef std::vector<Token> TokenList;
-
std::vector<AANode> TokenTree::nodes;
const int MAX_TOKEN_LENGTH = 100;
const int MAX_LINE_SIZE = 8000;
-struct Match {
- int start;
- int matched;
- int pattern;
- int sline;
- int eline;
-};
-
-typedef std::list<Match> Matches;
-
-struct Matcher {
- TokenTree ignore_tree;
- TokenTree pattern_tree;
-
- SSize_t longest_pattern;
+Matcher* Matcher::_self = 0;
- static Matcher* _self;
- static Matcher* self() { return _self; }
+Matcher* pattern_init_matcher()
+{
+ Matcher::self()->init();
+ return Matcher::self();
+}
- bool to_ignore(uint64_t t)
- {
- return ignore_tree.find(t);
- }
+void destroy_matcher(Matcher* m)
+{
+ // do nothing, we reuse the self
+}
- Matcher()
- {
- if (_self) {
- fprintf(stderr, "Matcher::self already initialized\n");
- }
- init();
+Matcher::Matcher()
+{
+ if (_self) {
+ fprintf(stderr, "Matcher::self already initialized\n");
}
+ pattern_tree = new TokenTree;
+ init();
+}
- void init()
- {
- TokenTree::nodes.clear();
-
- ignore_tree.initNull();
- pattern_tree.initNull();
-
- // typical comment and markup - have to be single tokens!
- static const char* ignored_tokens[] = {
- "/", "//", "%", "%%", "dnl",
- "#~", ";;", "\"\"", "--", "#:",
- "\\", ">", "==", "::", "##", 0
- };
+void Matcher::init()
+{
+ TokenTree::nodes.clear();
+ pattern_tree->initNull();
+ ignored_tokens.clear();
- static TokenTree dummy_next;
+ // typical comment and markup - have to be single tokens!
+ static const char* _ignored_tokens[] = {
+ "dnl", "\\n", "\\r", 0
+ };
- int index = 0;
- while (ignored_tokens[index]) {
- int len = strlen(ignored_tokens[index]);
- uint64_t h = SpookyHash::Hash64(ignored_tokens[index], len, 1);
- ignore_tree.insert(h, &dummy_next);
- index++;
- }
- longest_pattern = 0;
+ int index = 0;
+ while (_ignored_tokens[index]) {
+ int len = strlen(_ignored_tokens[index]);
+ uint64_t h = SpookyHash::Hash64(_ignored_tokens[index], len, 1);
+ ignored_tokens.insert(h);
+ index++;
}
-};
-
-Matcher* Matcher::_self = 0;
+ longest_pattern = 0;
+}
-Matcher* pattern_init_matcher()
+// check if the token is purely non alpha numeric
+bool Matcher::to_ignore(const char* text, unsigned int len) const
{
- if (!Matcher::_self)
- Matcher::_self = new Matcher;
-
- Matcher::_self->init();
-
- return Matcher::_self;
+ if (!len)
+ return true;
+ uint64_t index = 0;
+ while (index < len) {
+ if (isalnum(text[index]))
+ return false;
+ index++;
+ }
+ //cerr << "ignore '" << string(text, len) << endl;
+ return true;
}
-void destroy_matcher(Matcher* m)
+bool Matcher::to_ignore(uint64_t t) const
{
- // do nothing, we reuse the self
+ return ignored_tokens.find(t) != ignored_tokens.end();
}
-static void add_token(Matcher* m, TokenList& result, const char* start, size_t
len, int line)
+void Matcher::add_token(TokenList& result, const char* start, size_t len, int
line) const
{
- if (!len)
+ if (to_ignore(start, len))
return;
Token t;
- t.text = std::string(start, len);
t.linenumber = line;
t.hash = 0;
if (!line && len > 5 && len < 9 && !strncmp(start, "$skip", 5)) {
@@ -137,21 +116,30 @@
if (*endptr || t.hash > MAX_SKIP) // more than just a number
t.hash = 0;
}
+ // very special case
+ if (start[len - 1] == '.') {
+ len--;
+ }
+ if (start[0] == '+' || start[0] == '-') {
+ start++;
+ len--;
+ }
+ t.text = std::string(start, len);
if (!t.hash) {
// hash64 has no collisions on our patterns and is very fast
// *and* 0-3000 (at least) are "free"
t.hash = SpookyHash::Hash64(start, len, 1);
assert(t.hash > MAX_SKIP);
- if (m->to_ignore(t.hash))
+ if (to_ignore(t.hash))
return;
}
result.push_back(t);
}
-void tokenize(Matcher* m, TokenList& result, char* str, int linenumber = 0)
+void Matcher::tokenize(TokenList& result, char* str, int linenumber)
{
- static const char* ignore_seps = " \r\n\t*;,:!#{}()[]|";
- static const char* single_seps = "-.+?\"\'`=<>";
+ static const char* ignore_seps = " \r\n\t*;,:!#{}()[]|><";
+ static const char* single_seps = "?\"\'`'=";
const char* start = str;
@@ -162,14 +150,14 @@
*str = tolower(*str);
bool ignored = (strchr(ignore_seps, *str) != NULL);
if (ignored || strchr(single_seps, *str)) {
- add_token(m, result, start, str - start, linenumber);
+ add_token(result, start, str - start, linenumber);
//fprintf(stderr, "TO %d:'%s'\n", ignored, str);
if (!ignored)
- add_token(m, result, str, 1, linenumber);
+ add_token(result, str, 1, linenumber);
start = str + 1;
}
}
- add_token(m, result, start, str - start, linenumber);
+ add_token(result, start, str - start, linenumber);
}
AV* pattern_parse(const char* str)
@@ -182,7 +170,7 @@
fprintf(stderr, "Need a Matcher - call init_matcher\n");
return ret;
}
- tokenize(m, t, copy);
+ m->tokenize(t, copy);
free(copy);
av_extend(ret, t.size());
int index = 0;
@@ -197,7 +185,7 @@
++index;
}
// do not end with an expansion variable either
- if (last_hash <= MAX_SKIP) {
+ if (last_hash <= MAX_SKIP) {
av_pop(ret);
}
@@ -229,13 +217,13 @@
void pattern_add(Matcher* m, unsigned int id, av* tokens)
{
- SSize_t len = av_top_index(tokens) + 1;
+ ssize_t len = av_top_index(tokens) + 1;
if (!len) {
std::cerr << "add failed for id " << id << std::endl;
return;
}
- TokenTree* current = &m->pattern_tree;
+ TokenTree* current = m->pattern_tree;
for (SSize_t i = 0; i < len; ++i) {
SV* sv = *av_fetch(tokens, i, 0);
@@ -260,23 +248,24 @@
m->longest_pattern = len;
}
-void add_match(const TokenList& ts, Matches& ms, int tokenlist_offset, int
tokenlist_index, unsigned int matched, int pid) {
- Match m;
- m.start = tokenlist_offset + tokenlist_index;
- m.matched = matched - tokenlist_index;
+void add_match(const TokenList& ts, Matches& ms, int tokenlist_offset, int
tokenlist_index, unsigned int matched, int pid)
+{
+ Match m;
+ m.start = tokenlist_offset + tokenlist_index;
+ m.matched = matched - tokenlist_index;
- m.sline = ts[tokenlist_index].linenumber;
- m.eline = ts[matched - 1].linenumber;
+ m.sline = ts[tokenlist_index].linenumber;
+ m.eline = ts[matched - 1].linenumber;
- m.pattern = pid;
+ m.pattern = pid;
#if DEBUG
- fprintf(stderr, "L %d(%d)-%d(%d) id:%d\n", ts[tokenlist_index].linenumber,
- tokenlist_offset + tokenlist_index, ts[matched - 1].linenumber,
m.matched, m.pattern);
+ fprintf(stderr, "L %d(%d)-%d(%d) id:%d\n", ts[tokenlist_index].linenumber,
+ tokenlist_offset + tokenlist_index, ts[matched - 1].linenumber,
m.matched, m.pattern);
#endif
- ms.push_back(m);
+ ms.push_back(m);
}
-void check_token_matches(const TokenList& tokens, Matches &ms, int
tokenlist_offset, int tokenlist_index, unsigned int offset, const TokenTree*
patterns)
+void check_token_matches(const TokenList& tokens, Matches& ms, int
tokenlist_offset, int tokenlist_index, unsigned int offset, const TokenTree*
patterns)
{
if (offset >= tokens.size())
return;
@@ -309,7 +298,6 @@
}
}
-
// if either the start or the end of one region is within the other
bool match_overlap(int s1, int e1, int s2, int e2)
{
@@ -322,7 +310,7 @@
void find_tokens(Matcher* m, TokenList& ts, Matches& ms, int tokenlist_offset,
int tokenlist_index)
{
- TokenTree* patterns = m->pattern_tree.find(ts[tokenlist_index].hash);
+ TokenTree* patterns = m->pattern_tree->find(ts[tokenlist_index].hash);
if (!patterns)
return;
check_token_matches(ts, ms, tokenlist_offset, tokenlist_index,
tokenlist_index + 1, patterns);
@@ -344,7 +332,7 @@
Matches ms;
int token_offset = 0;
while (fgets(line, sizeof(line) - 1, input)) {
- tokenize(m, ts, line, linenumber++);
+ m->tokenize(ts, line, linenumber++);
// preserve memory
if (SSize_t(ts.size()) > m->longest_pattern * 100) {
unsigned int erasing = ts.size() - m->longest_pattern - 1;
@@ -401,7 +389,7 @@
SerializeInfo si;
- m->pattern_tree.mark_elements(si);
+ m->pattern_tree->mark_elements(si);
fwrite(&si.tree_count, sizeof(si.tree_count), 1, file);
uint32_t count = TokenTree::nodes.size();
fwrite(&count, sizeof(count), 1, file);
@@ -461,7 +449,7 @@
fwrite(&index, sizeof(int32_t), 1, file);
}
- uint32_t index = m->pattern_tree.root;
+ uint32_t index = m->pattern_tree->root;
fwrite(&index, sizeof(uint32_t), 1, file);
fclose(file);
@@ -527,7 +515,7 @@
TokenTree::nodes.clear();
TokenTree::nodes.reserve(node_count);
- m->pattern_tree.initNull();
+ m->pattern_tree->initNull();
for (unsigned int i = 1; i < node_count; i++) {
uint64_t element = *reinterpret_cast<uint64_t*>(dump);
@@ -545,7 +533,7 @@
//delete [] trees;
- m->pattern_tree.root = *reinterpret_cast<uint32_t*>(dump);
+ m->pattern_tree->root = *reinterpret_cast<uint32_t*>(dump);
dump += sizeof(uint32_t);
munmap(dump, attr.st_size);
@@ -572,7 +560,7 @@
// fgets makes sure we have a 0 at the end
size_t len = strlen(line);
// chop
- if (len && line[len-1] == '\n') {
+ if (len && line[len - 1] == '\n') {
line[--len] = 0;
}
AV* row = newAV();
@@ -674,10 +662,6 @@
{
AV* ret = newAV();
Matcher* m = Matcher::self();
- if (!m) {
- fprintf(stderr, "Need a Matcher - call init_matcher\n");
- return ret;
- }
TokenList t;
int line = 1;
while (true) {
@@ -687,7 +671,7 @@
copy = strndup(p, nl - p);
else
copy = strdup(p);
- tokenize(m, t, copy, line++);
+ m->tokenize(t, copy, line++);
free(copy);
if (!nl)
break;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/patterns_impl.h
new/Spooky-Patterns-XS-1.55/patterns_impl.h
--- old/Spooky-Patterns-XS-1.53/patterns_impl.h 2020-01-14 09:13:11.444514270
+0100
+++ new/Spooky-Patterns-XS-1.55/patterns_impl.h 2020-01-24 08:27:50.792375119
+0100
@@ -22,16 +22,26 @@
AV* pattern_parse(const char* str);
AV* pattern_normalize(const char* str);
int pattern_distance(AV* a1, AV* a2);
+AV* pattern_read_lines(const char* filename, HV* needed);
+
struct Matcher;
-class SpookyHash;
Matcher* pattern_init_matcher();
void pattern_add(Matcher* m, unsigned id, AV* tokens);
AV* pattern_find_matches(Matcher* m, const char* filename);
void pattern_dump(Matcher* m, const char* filename);
void pattern_load(Matcher* m, const char* filename);
void destroy_matcher(Matcher* m);
-AV* pattern_read_lines(const char* filename, HV* needed);
+
+class SpookyHash;
SpookyHash* pattern_init_hash(UV seed1, UV seed2);
void pattern_add_to_hash(SpookyHash* s, SV* sv);
void destroy_hash(SpookyHash* s);
AV* pattern_hash128(SpookyHash* s);
+
+class BagOfPatterns;
+BagOfPatterns* pattern_init_bag_of_patterns();
+void destroy_bag_of_patterns(BagOfPatterns *b);
+void pattern_bag_set_patterns(BagOfPatterns *b, HV *patterns);
+AV *pattern_bag_best_for(BagOfPatterns *b, const char *str, int count);
+void pattern_bag_dump(BagOfPatterns* b, const char* filename);
+bool pattern_bag_load(BagOfPatterns* b, const char* filename);
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/04license.13.txt
new/Spooky-Patterns-XS-1.55/t/04license.13.txt
--- old/Spooky-Patterns-XS-1.53/t/04license.13.txt 1970-01-01
01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/04license.13.txt 2020-01-25
12:12:15.365695264 +0100
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: EPL-1.0+
+
+Hallo
+
+// SPDX-License-Identifier: EPL-1.0
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/04license.30.pattern
new/Spooky-Patterns-XS-1.55/t/04license.30.pattern
--- old/Spooky-Patterns-XS-1.53/t/04license.30.pattern 1970-01-01
01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/04license.30.pattern 2020-01-25
12:11:43.419589102 +0100
@@ -0,0 +1 @@
+SPDX-License-Identifier: EPL-1.0
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/04license.31.pattern
new/Spooky-Patterns-XS-1.55/t/04license.31.pattern
--- old/Spooky-Patterns-XS-1.53/t/04license.31.pattern 1970-01-01
01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/04license.31.pattern 2020-01-25
12:11:51.204102326 +0100
@@ -0,0 +1 @@
+SPDX-License-Identifier: EPL-1.0+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/04license.t
new/Spooky-Patterns-XS-1.55/t/04license.t
--- old/Spooky-Patterns-XS-1.53/t/04license.t 2020-01-14 09:13:11.444514270
+0100
+++ new/Spooky-Patterns-XS-1.55/t/04license.t 2020-01-25 14:14:23.333200814
+0100
@@ -31,7 +31,8 @@
9 => [ [ 20, 1, 29 ] ],
10 => [ [ 22, 113, 114 ], [ 4, 112, 112 ], [ 3, 109, 109 ] ],
11 => [ [ 25, 4, 15 ], [ 23, 2, 2 ], [ 4, 2, 2 ] ],
- 12 => [ [ 29, 1, 115 ] ]
+ 12 => [ [ 29, 1, 115 ] ],
+ 13 => [ [ 31, 1, 1 ], [ 30, 5, 5 ] ],
);
for my $fn ( glob("t/04license.*.txt") ) {
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/05readlines.t
new/Spooky-Patterns-XS-1.55/t/05readlines.t
--- old/Spooky-Patterns-XS-1.53/t/05readlines.t 2020-01-14 09:13:11.444514270
+0100
+++ new/Spooky-Patterns-XS-1.55/t/05readlines.t 2020-01-25 14:14:23.349201868
+0100
@@ -29,6 +29,10 @@
$ret = Spooky::Patterns::XS::read_lines( 't/05readlines.2.raw', { 1 => 1 } );
$ret = Spooky::Patterns::XS::read_lines( 't/04license.12.txt', { 115 => 1 } );
-cmp_deeply($ret, [ [ 115, 1, 'END OF TERMS AND CONDITIONS' ] ], "end of file
returned" );
+cmp_deeply(
+ $ret,
+ [ [ 115, 1, 'END OF TERMS AND CONDITIONS' ] ],
+ "end of file returned"
+);
done_testing();
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/06hash.t
new/Spooky-Patterns-XS-1.55/t/06hash.t
--- old/Spooky-Patterns-XS-1.53/t/06hash.t 2019-07-27 14:53:24.498286975
+0200
+++ new/Spooky-Patterns-XS-1.55/t/06hash.t 2020-01-25 14:14:23.361202659
+0100
@@ -10,8 +10,8 @@
my $h = Spooky::Patterns::XS::init_hash( 0, 0 );
$h->add("Hállöchen\n");
$h->add("abc\x{300}");
-is( $h->hex, 'd6d58320114a2d3c1d6dd671ab0383ec', "Hhex correct" );
-is( $h->hash64, 15480423467908214076, "Hash64 correct" );
+is( $h->hex, 'd6d58320114a2d3c1d6dd671ab0383ec', "Hhex correct" );
+is( $h->hash64, 15480423467908214076, "Hash64 correct" );
$h = Spooky::Patterns::XS::init_hash( 0, 0 );
$h->add("/* \n");
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/07close.t
new/Spooky-Patterns-XS-1.55/t/07close.t
--- old/Spooky-Patterns-XS-1.53/t/07close.t 2020-01-14 09:13:11.444514270
+0100
+++ new/Spooky-Patterns-XS-1.55/t/07close.t 2020-01-25 14:14:23.385204241
+0100
@@ -13,7 +13,7 @@
Spooky::Patterns::XS::init_matcher();
my $p1 = Spooky::Patterns::XS::normalize( read_file('t/07close.p1') );
my $p2 = Spooky::Patterns::XS::normalize( read_file('t/07close.p2') );
-is( Spooky::Patterns::XS::distance( $p1, $p2 ), 4, "Distance is 2" );
+is( Spooky::Patterns::XS::distance( $p1, $p2 ), 4, "Distance is right" );
my @words1 = map { $_->[1] } @$p1;
my @words2 = map { $_->[1] } @$p2;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/08bag.t
new/Spooky-Patterns-XS-1.55/t/08bag.t
--- old/Spooky-Patterns-XS-1.53/t/08bag.t 1970-01-01 01:00:00.000000000
+0100
+++ new/Spooky-Patterns-XS-1.55/t/08bag.t 2020-01-25 14:56:51.782687801
+0100
@@ -0,0 +1,59 @@
+#! /usr/bin/perl
+
+use 5.012;
+use strict;
+use warnings;
+use Test::More;
+use Spooky::Patterns::XS;
+
+my %patterns;
+$patterns{42} = "This is the great GPL";
+$patterns{17} = "Artistic is great too";
+
+my $bag = Spooky::Patterns::XS::init_bag_of_patterns;
+$bag->set_patterns( \%patterns );
+$bag->dump('t/08bag.dump');
+$bag->load('t/08bag.dump');
+my $result = $bag->best_for( 'GPL is great', 2 );
+
+is( scalar @$result, 2, 'right number' );
+is_deeply( $result->[0], { pattern => 42, match => 0.5773 }, 'fits GPL' );
+
+done_testing();
+
+=benchmark
+ use Mojo::File;
+ use Mojo::JSON 'decode_json';
+
+ # https://stephan.kulow.org/test.json.xz
+ my $json = Mojo::File->new('test.json')->slurp;
+ $json = decode_json($json);
+ $bag = Spooky::Patterns::XS::init_bag_of_patterns;
+ $bag->set_patterns( $json->{patterns} );
+
+ #$bag->dump('test.dump');
+ #$bag->load('cavil.pattern.bag');
+ $result = $bag->best_for( $json->{snippets}{2061026}, 10 );
+ use Data::Dumper;
+ diag Dumper($result);
+ is_deeply( $result->[0], { pattern => 2430, match => 0.9228 }, 'fits' );
+
+ diag "lookup:";
+ diag $json->{snippets}{2061026};
+
+ diag "33214:";
+ diag $json->{patterns}{33214};
+
+ diag "2430:";
+ diag $json->{patterns}{2430};
+
+ my $stime = time;
+ my $count = 0;
+ for my $snippet ( keys %{ $json->{snippets} } ) {
+ $result = $bag->best_for( $json->{snippets}{$snippet}, 1 )->[0];
+ $count++;
+ my $delta = time - $stime;
+ diag "$snippet: $count/$delta $result->{pattern}/$result->{match}";
+ last if ( $delta > 10 || $count > 1000 );
+ }
+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/09normalize.1.in
new/Spooky-Patterns-XS-1.55/t/09normalize.1.in
--- old/Spooky-Patterns-XS-1.53/t/09normalize.1.in 1970-01-01
01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/09normalize.1.in 2020-01-25
12:45:04.789174947 +0100
@@ -0,0 +1,27 @@
+** LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+** INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
+** EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+** CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+** USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+** OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+** PERFORMANCE OF THIS SOFTWARE.
+**
+** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved.
+**
+** License to copy and use this software is granted provided that it is
+** identified as the "RSA Data Security, Inc. MD5 Message-Digest Algorithm"
+** in all material mentioning or referencing this software or this
+** function.
+**
+** License is also granted to make and use derivative works provided that
+** such works are identified as "derived from the RSA Data Security,
+** Inc. MD5 Message-Digest Algorithm" in all material mentioning or
+** referencing the derived work.
+**
+** RSA Data Security, Inc. makes no representations concerning either the
+** merchantability of this software or the suitability of this software for
+** any particular purpose. It is provided "as is" without express or
+** implied warranty of any kind.
+**
+** These notices must be retained in any copies of any part of this
+** documentation and/or software.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/09normalize.1.out
new/Spooky-Patterns-XS-1.55/t/09normalize.1.out
--- old/Spooky-Patterns-XS-1.53/t/09normalize.1.out 1970-01-01
01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/09normalize.1.out 2020-01-25
13:50:23.721816938 +0100
@@ -0,0 +1,27 @@
+landon curt noll disclaims all warranties with regard to this software
+including all implied warranties of merchantability and fitness in no
+event shall landon curt noll be liable for any special indirect or
+consequential damages or any damages whatsoever resulting from loss of
+use data or profits whether in an action of contract negligence or
+other tortious action arising out of or in connection with the use or
+performance of this software
+
+copyright c 1990 rsa data security inc all rights reserved
+
+license to copy and use this software is granted provided that it is
+identified as the rsa data security inc md5 message-digest algorithm
+in all material mentioning or referencing this software or this
+function
+
+license is also granted to make and use derivative works provided that
+such works are identified as derived from the rsa data security
+inc md5 message-digest algorithm in all material mentioning or
+referencing the derived work
+
+rsa data security inc makes no representations concerning either the
+merchantability of this software or the suitability of this software for
+any particular purpose it is provided as is without express or
+implied warranty of any kind
+
+these notices must be retained in any copies of any part of this
+documentation and/or software
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/09normalize.2.in
new/Spooky-Patterns-XS-1.55/t/09normalize.2.in
--- old/Spooky-Patterns-XS-1.53/t/09normalize.2.in 1970-01-01
01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/09normalize.2.in 2020-01-25
15:11:42.346670056 +0100
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: EPL-1.0+
+
+Hallo
+
+// SPDX-License-Identifier: EPL-1.0
+
+<p>SPDX-License-Identifier: EPL-1.0</p>
+
++Hallo World!
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/09normalize.2.out
new/Spooky-Patterns-XS-1.55/t/09normalize.2.out
--- old/Spooky-Patterns-XS-1.53/t/09normalize.2.out 1970-01-01
01:00:00.000000000 +0100
+++ new/Spooky-Patterns-XS-1.55/t/09normalize.2.out 2020-01-25
15:12:57.075596854 +0100
@@ -0,0 +1,9 @@
+spdx-license-identifier epl-1.0+
+
+hallo
+
+spdx-license-identifier epl-1.0
+
+p spdx-license-identifier epl-1.0 /p
+
+hallo world
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/t/09normalize.t
new/Spooky-Patterns-XS-1.55/t/09normalize.t
--- old/Spooky-Patterns-XS-1.53/t/09normalize.t 1970-01-01 01:00:00.000000000
+0100
+++ new/Spooky-Patterns-XS-1.55/t/09normalize.t 2020-01-25 14:14:23.413206088
+0100
@@ -0,0 +1,35 @@
+#! /usr/bin/perl
+
+use 5.012;
+use strict;
+use warnings;
+use Test::More;
+use File::Slurp;
+use Spooky::Patterns::XS;
+
+sub compare_case {
+ my $case = shift;
+ my $p1 =
+ Spooky::Patterns::XS::normalize( read_file("t/09normalize.$case.in") );
+ my @outlines;
+
+ my $txt;
+ my $prevline = 1;
+ for my $token (@$p1) {
+ while ( $prevline < $token->[0] ) {
+ $txt .= "\n";
+ $prevline++;
+ }
+ $txt .= $token->[1] . " ";
+ }
+ chomp $txt;
+ my $exp = read_file("t/09normalize.$case.out");
+ chomp $exp;
+
+ is( $txt, $exp );
+}
+
+compare_case(1);
+compare_case(2);
+
+done_testing();
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore' old/Spooky-Patterns-XS-1.53/typemap
new/Spooky-Patterns-XS-1.55/typemap
--- old/Spooky-Patterns-XS-1.53/typemap 2017-03-15 06:58:08.298117819 +0100
+++ new/Spooky-Patterns-XS-1.55/typemap 2020-01-23 17:12:57.262561953 +0100
@@ -1,3 +1,4 @@
-Spooky::Patterns::XS::Matcher T_PTROBJ
-Spooky::Patterns::XS::Hash T_PTROBJ
-AV* T_AVREF_REFCOUNT_FIXED
\ No newline at end of file
+Spooky::Patterns::XS::Matcher T_PTROBJ
+Spooky::Patterns::XS::Hash T_PTROBJ
+Spooky::Patterns::XS::BagOfPatterns T_PTROBJ
+AV* T_AVREF_REFCOUNT_FIXED