Rework Str_Find interface Make Str_Find return a StringIterator. Add Str_Contains.
Optimize substring search to use memchr. Unfortunately, memmem isn't widely available. Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/2f6b2393 Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/2f6b2393 Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/2f6b2393 Branch: refs/heads/master Commit: 2f6b2393c84402b296b3081265f64c350529e96f Parents: f126bf3 Author: Nick Wellnhofer <[email protected]> Authored: Sat Oct 24 15:14:59 2015 +0200 Committer: Nick Wellnhofer <[email protected]> Committed: Wed Oct 28 16:10:35 2015 +0100 ---------------------------------------------------------------------- runtime/core/Clownfish/String.c | 45 ++++++++++++++++++++------- runtime/core/Clownfish/String.cfh | 23 +++++++++++--- runtime/core/Clownfish/Test/TestObj.c | 4 +-- runtime/core/Clownfish/Test/TestString.c | 37 ++++++++++++++++------ runtime/go/clownfish/string_test.go | 26 ++++++++++++---- 5 files changed, 101 insertions(+), 34 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/2f6b2393/runtime/core/Clownfish/String.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/String.c b/runtime/core/Clownfish/String.c index f4732bd..090bda6 100644 --- a/runtime/core/Clownfish/String.c +++ b/runtime/core/Clownfish/String.c @@ -44,6 +44,9 @@ static void S_die_invalid_utf8(const char *text, size_t size, const char *file, int line, const char *func); +static const char* +S_memmem(String *self, const char *substring, size_t size); + static StringIterator* S_new_stack_iter(void *allocation, String *string, size_t byte_offset); @@ -388,25 +391,43 @@ Str_Ends_With_Utf8_IMP(String *self, const char *suffix, size_t suffix_len) { return false; } -int64_t +bool +Str_Contains_IMP(String *self, String *substring) { + return !!S_memmem(self, substring->ptr, substring->size); +} + +bool +Str_Contains_Utf8_IMP(String *self, const char *substring, size_t size) { + return !!S_memmem(self, substring, size); +} + +StringIterator* Str_Find_IMP(String *self, String *substring) { return Str_Find_Utf8(self, substring->ptr, substring->size); } -int64_t -Str_Find_Utf8_IMP(String *self, const char *ptr, size_t size) { - StringIterator *iter = STACK_ITER(self, 0); - int64_t location = 0; +StringIterator* +Str_Find_Utf8_IMP(String *self, const char *substring, size_t size) { + const char *ptr = S_memmem(self, substring, size); + return ptr ? StrIter_new(self, ptr - self->ptr) : NULL; +} - while (iter->byte_offset + size <= self->size) { - if (memcmp(self->ptr + iter->byte_offset, ptr, size) == 0) { - return location; - } - StrIter_Advance(iter, 1); - location++; +static const char* +S_memmem(String *self, const char *substring, size_t size) { + if (size == 0) { return self->ptr; } + if (size > self->size) { return NULL; } + + const char *ptr = self->ptr; + const char *end = ptr + self->size - size + 1; + char first_char = substring[0]; + + // Naive string search. + while (NULL != (ptr = (const char*)memchr(ptr, first_char, end - ptr))) { + if (memcmp(ptr, substring, size) == 0) { break; } + ptr++; } - return -1; + return ptr; } String* http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/2f6b2393/runtime/core/Clownfish/String.cfh ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/String.cfh b/runtime/core/Clownfish/String.cfh index b54f80d..f675aff 100644 --- a/runtime/core/Clownfish/String.cfh +++ b/runtime/core/Clownfish/String.cfh @@ -158,13 +158,28 @@ public final class Clownfish::String nickname Str public bool Ends_With_Utf8(String *self, const char *suffix, size_t size); - /** Return the location of the substring within the String (measured in - * code points), or -1 if the substring does not match. + /** Test whether the String contains `substring`. */ - public int64_t + public bool + Contains(String *self, String *substring); + + /** Test whether the String contains `substring`. + */ + public bool + Contains_Utf8(String *self, const char *ptr, size_t size); + + /** Return a [](StringIterator) pointing to the first occurrence of the + * substring within the String, or [](@null) if the substring does not + * match. + */ + public incremented nullable StringIterator* Find(String *self, String *substring); - public int64_t + /** Return a [](StringIterator) pointing to the first occurrence of the + * substring within the String, or [](@null) if the substring does not + * match. + */ + public incremented nullable StringIterator* Find_Utf8(String *self, const char *ptr, size_t size); /** Test whether the String matches the supplied UTF-8 character data. http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/2f6b2393/runtime/core/Clownfish/Test/TestObj.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/Test/TestObj.c b/runtime/core/Clownfish/Test/TestObj.c index 1bac642..b37ad71 100644 --- a/runtime/core/Clownfish/Test/TestObj.c +++ b/runtime/core/Clownfish/Test/TestObj.c @@ -66,7 +66,7 @@ static void test_To_String(TestBatchRunner *runner) { Obj *testobj = S_new_testobj(); String *string = Obj_To_String(testobj); - TEST_TRUE(runner, Str_Find_Utf8(string, "TestObj", 7) >= 0, "To_String"); + TEST_TRUE(runner, Str_Contains_Utf8(string, "TestObj", 7), "To_String"); DECREF(string); DECREF(testobj); } @@ -123,7 +123,7 @@ S_verify_abstract_error(TestBatchRunner *runner, Err_Attempt_t routine, Err *error = Err_trap(routine, context); TEST_TRUE(runner, error != NULL && Err_is_a(error, ERR) - && Str_Find_Utf8(Err_Get_Mess(error), "bstract", 7) != -1, + && Str_Contains_Utf8(Err_Get_Mess(error), "bstract", 7), message); DECREF(error); } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/2f6b2393/runtime/core/Clownfish/Test/TestString.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/Test/TestString.c b/runtime/core/Clownfish/Test/TestString.c index 629763d..291aee8 100644 --- a/runtime/core/Clownfish/Test/TestString.c +++ b/runtime/core/Clownfish/Test/TestString.c @@ -150,28 +150,45 @@ test_Clone(TestBatchRunner *runner) { DECREF(wanted); } +static int64_t +S_find(String *string, String *substring) { + StringIterator *iter = Str_Find(string, substring); + if (iter == NULL) { return -1; } + size_t tick = StrIter_Recede(iter, SIZE_MAX); + DECREF(iter); + return (int64_t)tick; +} + static void -test_Find(TestBatchRunner *runner) { +test_Contains_and_Find(TestBatchRunner *runner) { String *string; String *substring = S_get_str("foo"); string = S_get_str(""); - TEST_TRUE(runner, Str_Find(string, substring) == -1, "Not in empty string"); + TEST_FALSE(runner, Str_Contains(string, substring), + "Not contained in empty string"); + TEST_INT_EQ(runner, S_find(string, substring), -1, + "Not found in empty string"); DECREF(string); string = S_get_str("foo"); - TEST_TRUE(runner, Str_Find(string, substring) == 0, "Find complete string"); + TEST_TRUE(runner, Str_Contains(string, substring), + "Contained in complete string"); + TEST_INT_EQ(runner, S_find(string, substring), 0, "Find complete string"); DECREF(string); string = S_get_str("afoo"); - TEST_TRUE(runner, Str_Find(string, substring) == 1, "Find after first"); - // TODO: Enable this test when we have real substrings. - /*Str_Set_Size(string, 3); - TEST_TRUE(runner, Str_Find(string, substring) == -1, "Don't overrun");*/ + TEST_TRUE(runner, Str_Contains(string, substring), + "Contained after first"); + TEST_INT_EQ(runner, S_find(string, substring), 1, "Find after first"); + String *prefix = Str_SubString(string, 0, 3); + TEST_FALSE(runner, Str_Contains(prefix, substring), "Don't overrun"); + DECREF(prefix); DECREF(string); string = S_get_str("afood"); - TEST_TRUE(runner, Str_Find(string, substring) == 1, "Find in middle"); + TEST_TRUE(runner, Str_Contains(string, substring), "Contained in middle"); + TEST_INT_EQ(runner, S_find(string, substring), 1, "Find in middle"); DECREF(string); DECREF(substring); @@ -676,12 +693,12 @@ test_iterator_substring(TestBatchRunner *runner) { void TestStr_Run_IMP(TestString *self, TestBatchRunner *runner) { - TestBatchRunner_Plan(runner, (TestBatch*)self, 133); + TestBatchRunner_Plan(runner, (TestBatch*)self, 138); test_new(runner); test_Cat(runner); test_Clone(runner); test_Code_Point_At_and_From(runner); - test_Find(runner); + test_Contains_and_Find(runner); test_SubString(runner); test_Trim(runner); test_To_F64(runner); http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/2f6b2393/runtime/go/clownfish/string_test.go ---------------------------------------------------------------------- diff --git a/runtime/go/clownfish/string_test.go b/runtime/go/clownfish/string_test.go index 6f8c641..fb67aae 100644 --- a/runtime/go/clownfish/string_test.go +++ b/runtime/go/clownfish/string_test.go @@ -63,15 +63,29 @@ func TestStringBaseXToI64(t *testing.T) { } } +func TestStringContains(t *testing.T) { + s := NewString("foobarbaz") + if !s.Contains("bar") { + t.Error("Contains yes") + } + if s.Contains("banana") { + t.Error("Contains no") + } +} + func TestStringFind(t *testing.T) { s := NewString("foobarbaz") - var got int64 = s.Find("bar") - if got != 3 { - t.Error("Find yes", got) + iter := s.Find("bar") + var pos int = -1 + if iter != nil { + pos = int(iter.Recede(100)) + } + if pos != 3 { + t.Error("Find yes", pos) } - got = s.Find("banana") - if got != -1 { - t.Error("Find no", got) + iter = s.Find("banana") + if iter != nil { + t.Error("Find no") } }
