Move StringHelper from Clownfish to Lucy
Project: http://git-wip-us.apache.org/repos/asf/lucy/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/aa48a9e6 Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/aa48a9e6 Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/aa48a9e6 Branch: refs/heads/master Commit: aa48a9e68c2dd51a7c4391cab1bcd5f77bb39d8a Parents: f257a45 Author: Nick Wellnhofer <[email protected]> Authored: Tue Aug 2 19:33:01 2016 +0200 Committer: Nick Wellnhofer <[email protected]> Committed: Tue Aug 2 19:57:01 2016 +0200 ---------------------------------------------------------------------- c/src/Lucy/Analysis/RegexTokenizer.c | 2 +- core/Lucy/Analysis/StandardTokenizer.c | 1 + core/Lucy/Index/HighlightWriter.c | 1 + core/Lucy/Index/IndexManager.c | 2 +- core/Lucy/Index/Indexer.c | 1 + core/Lucy/Index/PolyReader.c | 1 - core/Lucy/Index/Posting/RawPosting.c | 1 - core/Lucy/Index/Segment.c | 2 +- core/Lucy/Index/Snapshot.c | 2 +- core/Lucy/Index/TermInfo.c | 1 - core/Lucy/Index/TermStepper.c | 1 - core/Lucy/Plan/TextType.c | 2 +- core/Lucy/Store/CompoundFileReader.c | 1 - core/Lucy/Util/IndexFileNames.c | 1 - core/Lucy/Util/Json.c | 3 +- core/Lucy/Util/StringHelper.c | 83 +++++++++++++++++++++ core/Lucy/Util/StringHelper.cfh | 59 +++++++++++++++ core/Lucy/Util/ToolSet.h | 1 - go/cfext/lucy.c | 1 - go/lucy/lucy.go | 4 +- perl/buildlib/Lucy/Build/Binding/Util.pm | 94 +++++++++++++++++++++++ perl/lib/Lucy.pm | 17 +++++ perl/lib/Lucy/Util/StringHelper.pm | 25 +++++++ perl/lib/LucyX/Index/ZlibDocReader.pm | 2 +- perl/lib/LucyX/Index/ZlibDocWriter.pm | 2 +- perl/t/105-folder.t | 2 +- perl/t/601-queryparser.t | 2 +- perl/t/binding/101-simple_io.t | 2 +- perl/t/core/032-string_helper.t | 25 +++++++ perl/xs/Lucy/Analysis/RegexTokenizer.c | 6 +- perl/xs/Lucy/Index/Inverter.c | 1 - test/Lucy/Test.c | 2 + test/Lucy/Test/Util/TestStringHelper.c | 103 ++++++++++++++++++++++++++ test/Lucy/Test/Util/TestStringHelper.cfh | 29 ++++++++ 34 files changed, 457 insertions(+), 25 deletions(-) 
---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/c/src/Lucy/Analysis/RegexTokenizer.c ---------------------------------------------------------------------- diff --git a/c/src/Lucy/Analysis/RegexTokenizer.c b/c/src/Lucy/Analysis/RegexTokenizer.c index d47b3ea..9d534ff 100644 --- a/c/src/Lucy/Analysis/RegexTokenizer.c +++ b/c/src/Lucy/Analysis/RegexTokenizer.c @@ -26,9 +26,9 @@ #include "Clownfish/String.h" #include "Clownfish/Err.h" #include "Clownfish/Util/Memory.h" -#include "Clownfish/Util/StringHelper.h" #include "Lucy/Analysis/Token.h" #include "Lucy/Analysis/Inversion.h" +#include "Lucy/Util/StringHelper.h" #if defined(CHY_HAS_PCRE_H) http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Analysis/StandardTokenizer.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Analysis/StandardTokenizer.c b/core/Lucy/Analysis/StandardTokenizer.c index 012d428..b44e481 100644 --- a/core/Lucy/Analysis/StandardTokenizer.c +++ b/core/Lucy/Analysis/StandardTokenizer.c @@ -21,6 +21,7 @@ #include "Lucy/Analysis/StandardTokenizer.h" #include "Lucy/Analysis/Token.h" #include "Lucy/Analysis/Inversion.h" +#include "Lucy/Util/StringHelper.h" /* * We use a modified version of the Word_Break property defined in UAX #29. 
http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/HighlightWriter.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/HighlightWriter.c b/core/Lucy/Index/HighlightWriter.c index 64b1a4c..b38a251 100644 --- a/core/Lucy/Index/HighlightWriter.c +++ b/core/Lucy/Index/HighlightWriter.c @@ -38,6 +38,7 @@ #include "Lucy/Store/InStream.h" #include "Lucy/Util/Freezer.h" #include "Lucy/Util/NumberUtils.h" +#include "Lucy/Util/StringHelper.h" static OutStream* S_lazy_init(HighlightWriter *self); http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/IndexManager.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/IndexManager.c b/core/Lucy/Index/IndexManager.c index 8bc80fe..348717a 100644 --- a/core/Lucy/Index/IndexManager.c +++ b/core/Lucy/Index/IndexManager.c @@ -29,7 +29,7 @@ #include "Lucy/Store/LockFactory.h" #include "Lucy/Util/IndexFileNames.h" #include "Lucy/Util/Json.h" -#include "Clownfish/Util/StringHelper.h" +#include "Lucy/Util/StringHelper.h" #include <stdlib.h> http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/Indexer.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/Indexer.c b/core/Lucy/Index/Indexer.c index 977cf38..16b915b 100644 --- a/core/Lucy/Index/Indexer.c +++ b/core/Lucy/Index/Indexer.c @@ -42,6 +42,7 @@ #include "Lucy/Util/Freezer.h" #include "Lucy/Util/IndexFileNames.h" #include "Lucy/Util/Json.h" +#include "Lucy/Util/StringHelper.h" int32_t Indexer_CREATE = 0x00000001; int32_t Indexer_TRUNCATE = 0x00000002; http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/PolyReader.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/PolyReader.c b/core/Lucy/Index/PolyReader.c index 63d23b5..e843bcd 100644 --- a/core/Lucy/Index/PolyReader.c +++ b/core/Lucy/Index/PolyReader.c 
@@ -33,7 +33,6 @@ #include "Lucy/Util/Json.h" #include "Lucy/Util/Freezer.h" #include "Lucy/Util/IndexFileNames.h" -#include "Clownfish/Util/StringHelper.h" // Obtain/release read locks and commit locks. static bool http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/Posting/RawPosting.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/Posting/RawPosting.c b/core/Lucy/Index/Posting/RawPosting.c index db2e5b7..a4b9e5d 100644 --- a/core/Lucy/Index/Posting/RawPosting.c +++ b/core/Lucy/Index/Posting/RawPosting.c @@ -28,7 +28,6 @@ #include "Lucy/Index/TermInfo.h" #include "Lucy/Plan/Schema.h" #include "Lucy/Store/OutStream.h" -#include "Clownfish/Util/StringHelper.h" RawPosting* RawPost_new(void *pre_allocated_memory, int32_t doc_id, uint32_t freq, http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/Segment.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/Segment.c b/core/Lucy/Index/Segment.c index 8507d4d..a3312a8 100644 --- a/core/Lucy/Index/Segment.c +++ b/core/Lucy/Index/Segment.c @@ -23,7 +23,7 @@ #include "Clownfish/Num.h" #include "Lucy/Store/Folder.h" #include "Lucy/Util/Json.h" -#include "Clownfish/Util/StringHelper.h" +#include "Lucy/Util/StringHelper.h" #include "Lucy/Util/IndexFileNames.h" Segment* http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/Snapshot.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/Snapshot.c b/core/Lucy/Index/Snapshot.c index c1005b2..fe2a89e 100644 --- a/core/Lucy/Index/Snapshot.c +++ b/core/Lucy/Index/Snapshot.c @@ -21,7 +21,7 @@ #include "Clownfish/Boolean.h" #include "Lucy/Index/Segment.h" #include "Lucy/Store/Folder.h" -#include "Clownfish/Util/StringHelper.h" +#include "Lucy/Util/StringHelper.h" #include "Lucy/Util/IndexFileNames.h" #include "Lucy/Util/Json.h" 
http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/TermInfo.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/TermInfo.c b/core/Lucy/Index/TermInfo.c index 697d604..34c1689 100644 --- a/core/Lucy/Index/TermInfo.c +++ b/core/Lucy/Index/TermInfo.c @@ -18,7 +18,6 @@ #include "Lucy/Util/ToolSet.h" #include "Lucy/Index/TermInfo.h" -#include "Clownfish/Util/StringHelper.h" TermInfo* TInfo_new(int32_t doc_freq) { http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Index/TermStepper.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Index/TermStepper.c b/core/Lucy/Index/TermStepper.c index 26cb876..6691184 100644 --- a/core/Lucy/Index/TermStepper.c +++ b/core/Lucy/Index/TermStepper.c @@ -21,7 +21,6 @@ #include "Lucy/Plan/Schema.h" #include "Lucy/Store/InStream.h" #include "Lucy/Store/OutStream.h" -#include "Clownfish/Util/StringHelper.h" TermStepper* TermStepper_init(TermStepper *self) { http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Plan/TextType.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Plan/TextType.c b/core/Lucy/Plan/TextType.c index a0e2759..12fb5ca 100644 --- a/core/Lucy/Plan/TextType.c +++ b/core/Lucy/Plan/TextType.c @@ -21,8 +21,8 @@ #include "Lucy/Plan/TextType.h" #include "Lucy/Store/InStream.h" #include "Lucy/Store/OutStream.h" +#include "Lucy/Util/StringHelper.h" #include "Clownfish/ByteBuf.h" -#include "Clownfish/Util/StringHelper.h" TermStepper* TextType_Make_Term_Stepper_IMP(TextType *self) { http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Store/CompoundFileReader.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Store/CompoundFileReader.c b/core/Lucy/Store/CompoundFileReader.c index 06b79a3..ad25272 100644 --- a/core/Lucy/Store/CompoundFileReader.c +++ 
b/core/Lucy/Store/CompoundFileReader.c @@ -26,7 +26,6 @@ #include "Lucy/Store/InStream.h" #include "Lucy/Util/IndexFileNames.h" #include "Lucy/Util/Json.h" -#include "Clownfish/Util/StringHelper.h" CompoundFileReader* CFReader_open(Folder *folder) { http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Util/IndexFileNames.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Util/IndexFileNames.c b/core/Lucy/Util/IndexFileNames.c index 550995e..a1a063b 100644 --- a/core/Lucy/Util/IndexFileNames.c +++ b/core/Lucy/Util/IndexFileNames.c @@ -20,7 +20,6 @@ #include "Lucy/Util/IndexFileNames.h" #include "Lucy/Store/DirHandle.h" #include "Lucy/Store/Folder.h" -#include "Clownfish/Util/StringHelper.h" String* IxFileNames_latest_snapshot(Folder *folder) { http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Util/Json.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Util/Json.c b/core/Lucy/Util/Json.c index a601f98..46a0b89 100644 --- a/core/Lucy/Util/Json.c +++ b/core/Lucy/Util/Json.c @@ -24,11 +24,12 @@ #include "Clownfish/Boolean.h" #include "Clownfish/CharBuf.h" #include "Clownfish/Num.h" +#include "Clownfish/Util/Memory.h" #include "Lucy/Store/Folder.h" #include "Lucy/Store/InStream.h" #include "Lucy/Store/OutStream.h" -#include "Clownfish/Util/Memory.h" #include "Lucy/Util/Json/JsonParser.h" +#include "Lucy/Util/StringHelper.h" /* Routines generated by Lemon. */ void* http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Util/StringHelper.c ---------------------------------------------------------------------- diff --git a/core/Lucy/Util/StringHelper.c b/core/Lucy/Util/StringHelper.c new file mode 100644 index 0000000..2331901 --- /dev/null +++ b/core/Lucy/Util/StringHelper.c @@ -0,0 +1,83 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <string.h> + +#define C_LUCY_STRINGHELPER +#define LUCY_USE_SHORT_NAMES + +#include "Lucy/Util/StringHelper.h" + +const uint8_t lucy_StrHelp_UTF8_COUNT[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +size_t +StrHelp_overlap(const char *a, const char *b, size_t a_len, size_t b_len) { + size_t i; + const size_t len = a_len <= b_len ? 
a_len : b_len; + + for (i = 0; i < len; i++) { + if (*a++ != *b++) { break; } + } + return i; +} + +static const char base36_chars[] = "0123456789abcdefghijklmnopqrstuvwxyz"; + +size_t +StrHelp_to_base36(uint64_t num, void *buffer) { + char my_buf[StrHelp_MAX_BASE36_BYTES]; + char *buf = my_buf + StrHelp_MAX_BASE36_BYTES - 1; + char *end = buf; + + // Null terminate. + *buf = '\0'; + + // Convert to base 36 characters. + do { + *(--buf) = base36_chars[num % 36]; + num /= 36; + } while (num > 0); + + size_t size = (size_t)(end - buf); + memcpy(buffer, buf, size + 1); + return size; +} + +const char* +StrHelp_back_utf8_char(const char *ptr, const char *start) { + while (--ptr >= start) { + if ((*ptr & 0xC0) != 0x80) { return ptr; } + } + return NULL; +} + http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Util/StringHelper.cfh ---------------------------------------------------------------------- diff --git a/core/Lucy/Util/StringHelper.cfh b/core/Lucy/Util/StringHelper.cfh new file mode 100644 index 0000000..f78f0a8 --- /dev/null +++ b/core/Lucy/Util/StringHelper.cfh @@ -0,0 +1,59 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +parcel Lucy; + +inert class Lucy::Util::StringHelper nickname StrHelp { + + /* A table where the values indicate the number of bytes in a UTF-8 + * sequence implied by the leading utf8 byte. + */ + inert const uint8_t[] UTF8_COUNT; + + /** Return the number of bytes that two strings have in common. + */ + inert size_t + overlap(const char *a, const char *b, size_t a_len, size_t b_len); + + /** Encode a NULL-terminated string representation of a value in base 36 + * into `buffer`. + * + * @param value The number to be encoded. + * @param buffer A buffer at least MAX_BASE36_BYTES bytes long. + * @return the number of digits encoded (not including the terminating + * NULL). + */ + inert size_t + to_base36(uint64_t value, void *buffer); + + /** Return the first non-continuation byte before the supplied pointer. + * If backtracking progresses beyond the supplied start, return NULL. + */ + inert nullable const char* + back_utf8_char(const char *utf8, const char *start); +} + +__C__ +/** The maximum number of bytes encoded by to_base36(), including the + * terminating NULL. 
+ */ +#define lucy_StrHelp_MAX_BASE36_BYTES 14 +#ifdef LUCY_USE_SHORT_NAMES + #define StrHelp_MAX_BASE36_BYTES lucy_StrHelp_MAX_BASE36_BYTES +#endif +__END_C__ + + http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/core/Lucy/Util/ToolSet.h ---------------------------------------------------------------------- diff --git a/core/Lucy/Util/ToolSet.h b/core/Lucy/Util/ToolSet.h index 2d7a7a9..5fbc2be 100644 --- a/core/Lucy/Util/ToolSet.h +++ b/core/Lucy/Util/ToolSet.h @@ -45,7 +45,6 @@ extern "C" { #include "Clownfish/Vector.h" #include "Clownfish/Class.h" #include "Clownfish/Util/Memory.h" -#include "Clownfish/Util/StringHelper.h" #ifdef __cplusplus } http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/go/cfext/lucy.c ---------------------------------------------------------------------- diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c index fc2df3b..85778fb 100644 --- a/go/cfext/lucy.c +++ b/go/cfext/lucy.c @@ -44,7 +44,6 @@ #include "Clownfish/Vector.h" #include "Clownfish/Class.h" #include "Clownfish/Util/Memory.h" -#include "Clownfish/Util/StringHelper.h" #include "Lucy/Analysis/Token.h" #include "Lucy/Analysis/Inversion.h" #include "Lucy/Document/HitDoc.h" http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/go/lucy/lucy.go ---------------------------------------------------------------------- diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go index 4a4fc71..3c7f06c 100644 --- a/go/lucy/lucy.go +++ b/go/lucy/lucy.go @@ -41,7 +41,6 @@ package lucy #include "Clownfish/HashIterator.h" #include "Clownfish/Vector.h" #include "Clownfish/Err.h" -#include "Clownfish/Util/StringHelper.h" #include "Lucy/Analysis/Analyzer.h" #include "Lucy/Analysis/Inversion.h" #include "Lucy/Analysis/Token.h" @@ -55,6 +54,7 @@ package lucy #include "Lucy/Store/OutStream.h" #include "Lucy/Object/I32Array.h" #include "Lucy/Util/Freezer.h" +#include "Lucy/Util/StringHelper.h" extern lucy_RegexTokenizer* GOLUCY_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern); 
@@ -154,7 +154,7 @@ S_count_code_points(const char *string, size_t len) { size_t i = 0; while (i < len) { - i += cfish_StrHelp_UTF8_COUNT[(uint8_t)(string[i])]; + i += lucy_StrHelp_UTF8_COUNT[(uint8_t)(string[i])]; ++num_code_points; } http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/buildlib/Lucy/Build/Binding/Util.pm ---------------------------------------------------------------------- diff --git a/perl/buildlib/Lucy/Build/Binding/Util.pm b/perl/buildlib/Lucy/Build/Binding/Util.pm index 2b09173..10efaa6 100644 --- a/perl/buildlib/Lucy/Build/Binding/Util.pm +++ b/perl/buildlib/Lucy/Build/Binding/Util.pm @@ -25,6 +25,7 @@ sub bind_all { $class->bind_freezer; $class->bind_indexfilenames; $class->bind_sortexternal; + $class->bind_stringhelper; } sub bind_debug { @@ -180,4 +181,97 @@ sub bind_sortexternal { Clownfish::CFC::Binding::Perl::Class->register($binding); } +sub bind_stringhelper { + my $xs_code = <<'END_XS_CODE'; +MODULE = Lucy PACKAGE = Lucy::Util::StringHelper + +=for comment + +Turn an SV's UTF8 flag on. Equivalent to Encode::_utf8_on, but we don't have +to load Encode. + +=cut + +void +utf8_flag_on(sv) + SV *sv; +PPCODE: + SvUTF8_on(sv); + +=for comment + +Turn an SV's UTF8 flag off. + +=cut + +void +utf8_flag_off(sv) + SV *sv; +PPCODE: + SvUTF8_off(sv); + +SV* +to_base36(num) + uint64_t num; +CODE: +{ + char base36[lucy_StrHelp_MAX_BASE36_BYTES]; + size_t size = lucy_StrHelp_to_base36(num, &base36); + RETVAL = newSVpvn(base36, size); +} +OUTPUT: RETVAL + +=for comment + +Upgrade an SV to UTF8, converting Latin1 if necessary. Equivalent to +utf8::upgrade(). + +=cut + +void +utf8ify(sv) + SV *sv; +PPCODE: + sv_utf8_upgrade(sv); + +bool +utf8_valid(sv) + SV *sv; +CODE: +{ + STRLEN len; + char *ptr = SvPV(sv, len); + RETVAL = cfish_Str_utf8_valid(ptr, len); +} +OUTPUT: RETVAL + +=for comment + +Concatenate one scalar onto the end of the other, ignoring UTF-8 status of the +second scalar. This is necessary because $not_utf8 . 
$utf8 results in a +scalar which has been infected by the UTF-8 flag of the second argument. + +=cut + +void +cat_bytes(sv, catted) + SV *sv; + SV *catted; +PPCODE: +{ + STRLEN len; + char *ptr = SvPV(catted, len); + if (SvUTF8(sv)) { CFISH_THROW(CFISH_ERR, "Can't cat_bytes onto a UTF-8 SV"); } + sv_catpvn(sv, ptr, len); +} +END_XS_CODE + + my $binding = Clownfish::CFC::Binding::Perl::Class->new( + class_name => "Lucy::Util::StringHelper", + ); + $binding->append_xs($xs_code); + + Clownfish::CFC::Binding::Perl::Class->register($binding); +} + 1; http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/lib/Lucy.pm ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy.pm b/perl/lib/Lucy.pm index 4c812ce..cdc2b54 100644 --- a/perl/lib/Lucy.pm +++ b/perl/lib/Lucy.pm @@ -305,6 +305,23 @@ BEGIN { } } +{ + package Lucy::Util::StringHelper; + our $VERSION = '0.005000'; + $VERSION = eval $VERSION; + BEGIN { + push our @ISA, 'Exporter'; + our @EXPORT_OK = qw( + utf8_flag_on + utf8_flag_off + to_base36 + utf8ify + utf8_valid + cat_bytes + ); + } +} + 1; __END__ http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/lib/Lucy/Util/StringHelper.pm ---------------------------------------------------------------------- diff --git a/perl/lib/Lucy/Util/StringHelper.pm b/perl/lib/Lucy/Util/StringHelper.pm new file mode 100644 index 0000000..098855f --- /dev/null +++ b/perl/lib/Lucy/Util/StringHelper.pm @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +package Lucy::Util::StringHelper; +use Lucy; +our $VERSION = '0.005000'; +$VERSION = eval $VERSION; + +1; + +__END__ + + http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/lib/LucyX/Index/ZlibDocReader.pm ---------------------------------------------------------------------- diff --git a/perl/lib/LucyX/Index/ZlibDocReader.pm b/perl/lib/LucyX/Index/ZlibDocReader.pm index 45ff9fc..6727e01 100644 --- a/perl/lib/LucyX/Index/ZlibDocReader.pm +++ b/perl/lib/LucyX/Index/ZlibDocReader.pm @@ -20,7 +20,7 @@ package LucyX::Index::ZlibDocReader; use base qw( Lucy::Index::DocReader ); our $VERSION = '0.005000'; $VERSION = eval $VERSION; -use Clownfish::Util::StringHelper qw( utf8_valid utf8_flag_on ); +use Lucy::Util::StringHelper qw( utf8_valid utf8_flag_on ); use Compress::Zlib qw( uncompress ); use Carp; http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/lib/LucyX/Index/ZlibDocWriter.pm ---------------------------------------------------------------------- diff --git a/perl/lib/LucyX/Index/ZlibDocWriter.pm b/perl/lib/LucyX/Index/ZlibDocWriter.pm index 884dee0..2104f25 100644 --- a/perl/lib/LucyX/Index/ZlibDocWriter.pm +++ b/perl/lib/LucyX/Index/ZlibDocWriter.pm @@ -20,7 +20,7 @@ use base qw( Lucy::Index::DataWriter ); use Carp; use Scalar::Util qw( blessed ); use Compress::Zlib qw( compress ); -use Clownfish::Util::StringHelper qw( cat_bytes ); +use Lucy::Util::StringHelper qw( cat_bytes ); use Clownfish; use bytes; no bytes; http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/t/105-folder.t 
---------------------------------------------------------------------- diff --git a/perl/t/105-folder.t b/perl/t/105-folder.t index ef94518..735c162 100644 --- a/perl/t/105-folder.t +++ b/perl/t/105-folder.t @@ -21,7 +21,7 @@ use Test::More tests => 25; use File::Spec::Functions qw( catfile ); use Fcntl; use Lucy::Test::TestUtils qw( init_test_index_loc ); -use Clownfish::Util::StringHelper qw( to_base36 ); +use Lucy::Util::StringHelper qw( to_base36 ); my $fs_index_loc = init_test_index_loc(); my $fs_folder = Lucy::Store::FSFolder->new( path => $fs_index_loc, ); http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/t/601-queryparser.t ---------------------------------------------------------------------- diff --git a/perl/t/601-queryparser.t b/perl/t/601-queryparser.t index 5c2275c..6e29bcd 100644 --- a/perl/t/601-queryparser.t +++ b/perl/t/601-queryparser.t @@ -77,7 +77,7 @@ sub make_req_opt_query { shift; MyReqOptQuery->new(@_) } package main; use Test::More tests => 224; -use Clownfish::Util::StringHelper qw( utf8_flag_on utf8ify ); +use Lucy::Util::StringHelper qw( utf8_flag_on utf8ify ); use Lucy::Test::TestUtils qw( create_index ); my $folder = Lucy::Store::RAMFolder->new; http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/t/binding/101-simple_io.t ---------------------------------------------------------------------- diff --git a/perl/t/binding/101-simple_io.t b/perl/t/binding/101-simple_io.t index 85db57a..e2a03e9 100644 --- a/perl/t/binding/101-simple_io.t +++ b/perl/t/binding/101-simple_io.t @@ -19,7 +19,7 @@ use lib 'buildlib'; use Test::More tests => 28; use Lucy::Test::TestUtils qw( utf8_test_strings ); -use Clownfish::Util::StringHelper qw( utf8ify utf8_flag_off ); +use Lucy::Util::StringHelper qw( utf8ify utf8_flag_off ); use bytes; no bytes; http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/t/core/032-string_helper.t ---------------------------------------------------------------------- diff --git 
a/perl/t/core/032-string_helper.t b/perl/t/core/032-string_helper.t new file mode 100644 index 0000000..b8ddea6 --- /dev/null +++ b/perl/t/core/032-string_helper.t @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; + +use Lucy::Test; +my $success = Lucy::Test::run_tests( + "Lucy::Test::Util::TestStringHelper" +); + +exit($success ? 0 : 1); + http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/xs/Lucy/Analysis/RegexTokenizer.c ---------------------------------------------------------------------- diff --git a/perl/xs/Lucy/Analysis/RegexTokenizer.c b/perl/xs/Lucy/Analysis/RegexTokenizer.c index 408acf1..a9ee264 100644 --- a/perl/xs/Lucy/Analysis/RegexTokenizer.c +++ b/perl/xs/Lucy/Analysis/RegexTokenizer.c @@ -22,8 +22,8 @@ #include "Lucy/Analysis/RegexTokenizer.h" #include "Lucy/Analysis/Token.h" #include "Lucy/Analysis/Inversion.h" +#include "Lucy/Util/StringHelper.h" #include "Clownfish/Util/Memory.h" -#include "Clownfish/Util/StringHelper.h" static SV* S_compile_token_re(pTHX_ cfish_String *pattern); @@ -154,14 +154,14 @@ LUCY_RegexTokenizer_Tokenize_Utf8_IMP(lucy_RegexTokenizer *self, // Get start and end offsets in Unicode code points. 
for (; string_arg < start_ptr; num_code_points++) { - string_arg += cfish_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)]; + string_arg += lucy_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)]; if (string_arg > string_end) { THROW(CFISH_ERR, "scanned past end of '%s'", string_beg); } } start = num_code_points; for (; string_arg < end_ptr; num_code_points++) { - string_arg += cfish_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)]; + string_arg += lucy_StrHelp_UTF8_COUNT[(uint8_t)(*string_arg)]; if (string_arg > string_end) { THROW(CFISH_ERR, "scanned past end of '%s'", string_beg); } http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/perl/xs/Lucy/Index/Inverter.c ---------------------------------------------------------------------- diff --git a/perl/xs/Lucy/Index/Inverter.c b/perl/xs/Lucy/Index/Inverter.c index 28c8e90..2dfbd17 100644 --- a/perl/xs/Lucy/Index/Inverter.c +++ b/perl/xs/Lucy/Index/Inverter.c @@ -27,7 +27,6 @@ #include "Lucy/Plan/NumericType.h" #include "Lucy/Plan/Schema.h" #include "Lucy/Plan/TextType.h" -#include "Clownfish/Util/StringHelper.h" static lucy_InverterEntry* S_fetch_entry(pTHX_ lucy_Inverter *self, HE *hash_entry) { http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/test/Lucy/Test.c ---------------------------------------------------------------------- diff --git a/test/Lucy/Test.c b/test/Lucy/Test.c index 0edda50..943c5ab 100644 --- a/test/Lucy/Test.c +++ b/test/Lucy/Test.c @@ -85,6 +85,7 @@ #include "Lucy/Test/Util/TestNumberUtils.h" #include "Lucy/Test/Util/TestPriorityQueue.h" #include "Lucy/Test/Util/TestSortExternal.h" +#include "Lucy/Test/Util/TestStringHelper.h" TestSuite* Test_create_test_suite() { @@ -95,6 +96,7 @@ Test_create_test_suite() { TestSuite_Add_Batch(suite, (TestBatch*)TestSortExternal_new()); TestSuite_Add_Batch(suite, (TestBatch*)TestMemPool_new()); TestSuite_Add_Batch(suite, (TestBatch*)TestNumUtil_new()); + TestSuite_Add_Batch(suite, (TestBatch*)TestStrHelp_new()); TestSuite_Add_Batch(suite, 
(TestBatch*)TestIxFileNames_new()); TestSuite_Add_Batch(suite, (TestBatch*)TestJson_new()); TestSuite_Add_Batch(suite, (TestBatch*)TestFreezer_new()); http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/test/Lucy/Test/Util/TestStringHelper.c ---------------------------------------------------------------------- diff --git a/test/Lucy/Test/Util/TestStringHelper.c b/test/Lucy/Test/Util/TestStringHelper.c new file mode 100644 index 0000000..0cf44e8 --- /dev/null +++ b/test/Lucy/Test/Util/TestStringHelper.c @@ -0,0 +1,103 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define CFISH_USE_SHORT_NAMES +#define LUCY_USE_SHORT_NAMES +#define TESTLUCY_USE_SHORT_NAMES + +#include "Lucy/Test/Util/TestStringHelper.h" +#include "Lucy/Util/StringHelper.h" + +#include "Clownfish/Class.h" +#include "Clownfish/String.h" +#include "Clownfish/TestHarness/TestBatchRunner.h" + +TestStringHelper* +TestStrHelp_new() { + return (TestStringHelper*)Class_Make_Obj(TESTSTRINGHELPER); +} + +static void +test_overlap(TestBatchRunner *runner) { + size_t result; + result = StrHelp_overlap("", "", 0, 0); + TEST_UINT_EQ(runner, result, 0, "two empty strings"); + result = StrHelp_overlap("", "foo", 0, 3); + TEST_UINT_EQ(runner, result, 0, "first string is empty"); + result = StrHelp_overlap("foo", "", 3, 0); + TEST_UINT_EQ(runner, result, 0, "second string is empty"); + result = StrHelp_overlap("foo", "foo", 3, 3); + TEST_UINT_EQ(runner, result, 3, "equal strings"); + result = StrHelp_overlap("foo bar", "foo", 7, 3); + TEST_UINT_EQ(runner, result, 3, "first string is longer"); + result = StrHelp_overlap("foo", "foo bar", 3, 7); + TEST_UINT_EQ(runner, result, 3, "second string is longer"); + result = StrHelp_overlap("bar", "baz", 3, 3); + TEST_UINT_EQ(runner, result, 2, "different byte"); +} + + +static void +test_to_base36(TestBatchRunner *runner) { + char buffer[StrHelp_MAX_BASE36_BYTES]; + StrHelp_to_base36(UINT64_MAX, buffer); + TEST_STR_EQ(runner, "3w5e11264sgsf", buffer, "base36 UINT64_MAX"); + StrHelp_to_base36(1, buffer); + TEST_STR_EQ(runner, "1", buffer, "base36 1"); + TEST_INT_EQ(runner, buffer[1], 0, "base36 NULL termination"); +} + +static void +test_back_utf8_char(TestBatchRunner *runner) { + char buffer[4]; + char *buf = buffer + 1; + uint32_t len = Str_encode_utf8_char(0x263A, buffer); + char *end = buffer + len; + TEST_TRUE(runner, StrHelp_back_utf8_char(end, buffer) == buffer, + "back_utf8_char"); + TEST_TRUE(runner, StrHelp_back_utf8_char(end, buf) == NULL, + "back_utf8_char returns NULL rather than back up beyond start"); + 
TEST_TRUE(runner, StrHelp_back_utf8_char(buffer, buffer) == NULL, + "back_utf8_char returns NULL when end == start"); + + int32_t code_point; + for (code_point = 0; code_point <= 0x10FFFF; code_point++) { + uint32_t size = Str_encode_utf8_char(code_point, buffer); + char *start = buffer; + char *end = start + size; + + if (StrHelp_back_utf8_char(end, start) != start) { + break; + } + } + if (code_point == 0x110000) { + PASS(runner, "back_utf8_char works for code points 0 - 0x10FFFF"); + } + else { + FAIL(runner, "Failed back_utf8_char at 0x%.1X", (unsigned)code_point); + } +} + +void +TestStrHelp_Run_IMP(TestStringHelper *self, TestBatchRunner *runner) { + TestBatchRunner_Plan(runner, (TestBatch*)self, 14); + test_overlap(runner); + test_to_base36(runner); + test_back_utf8_char(runner); +} + + + http://git-wip-us.apache.org/repos/asf/lucy/blob/aa48a9e6/test/Lucy/Test/Util/TestStringHelper.cfh ---------------------------------------------------------------------- diff --git a/test/Lucy/Test/Util/TestStringHelper.cfh b/test/Lucy/Test/Util/TestStringHelper.cfh new file mode 100644 index 0000000..d0be8ec --- /dev/null +++ b/test/Lucy/Test/Util/TestStringHelper.cfh @@ -0,0 +1,29 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +parcel TestLucy; + +class Lucy::Test::Util::TestStringHelper nickname TestStrHelp + inherits Clownfish::TestHarness::TestBatch { + + inert incremented TestStringHelper* + new(); + + void + Run(TestStringHelper *self, TestBatchRunner *runner); +} + +
