This is an automated email from the ASF dual-hosted git repository.

willholley pushed a commit to branch mango-beginswith
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 85e8a9d647ae5f3a11d43df311176ec2e7490507
Author: Will Holley <[email protected]>
AuthorDate: Mon Oct 16 14:07:55 2023 +0000

    mango: add $beginsWith operator
    
    Adds a `$beginsWith` operator to selectors, with json and text index
    support. This is a complement / precursor to optimising `$regex`
    support as proposed in https://github.com/apache/couchdb/pull/4776.
    
    For `json` indexes, a $beginsWith operator translates into a key
    range query, as is common practice for _view queries. For example,
    to find all rows with a key beginning with "W", we can use a range
    `start_key="W", end_key="W\ufff0"`. Given Mango uses compound keys,
    this is slightly more complex in practice, but the idea is the same.
    As with other range operators (`$gt`, `$gte`, etc), `$beginsWith`
    can be used in combination with equality operators and result sorting
    but must result in a contiguous key range. That is, a range of
    `start_key=[10, "W"], end_key=[10, "W\ufff0", {}]` would be valid,
    but `start_key=["W", 10], end_key=["W\ufff0", 10, {}]` would not,
    because the second element of the key may result in a non-contiguous
    range.
    
    For text indexes, `$beginsWith` translates to a Lucene query on
    the specified field of `W*`.
    
    If a non-string operand is provided to `$beginsWith`, the request will
    fail with a 400 / `invalid_operator` error.
---
 src/mango/src/mango_idx_view.erl      |   6 ++
 src/mango/src/mango_selector.erl      |  32 ++++++++++
 src/mango/src/mango_selector_text.erl |   3 +
 src/mango/test/03-operator-test.py    |  16 +++++
 src/mango/test/25-beginswith-test.py  | 112 ++++++++++++++++++++++++++++++++++
 5 files changed, 169 insertions(+)

diff --git a/src/mango/src/mango_idx_view.erl b/src/mango/src/mango_idx_view.erl
index 25d75d55d..d1650e987 100644
--- a/src/mango/src/mango_idx_view.erl
+++ b/src/mango/src/mango_idx_view.erl
@@ -306,6 +306,8 @@ indexable({[{<<"$gt">>, _}]}) ->
     true;
 indexable({[{<<"$gte">>, _}]}) ->
     true;
+indexable({[{<<"$beginsWith">>, _}]}) ->
+    true;
 % This is required to improve index selection for covering indexes.
 % Making `$exists` indexable should not cause problems in other cases.
 indexable({[{<<"$exists">>, _}]}) ->
@@ -412,6 +414,10 @@ range(_, _, LCmp, Low, HCmp, High) ->
 % operators but its all straight forward once you figure out how
 % we're basically just narrowing our logical ranges.
 
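+% beginsWith requires both a high and low bound. NOTE(review): 16#10FFFF has no /utf8 specifier, so this 8-bit segment appends the single byte 16#FF - confirm that is the intended high sentinel.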
+range({[{<<"$beginsWith">>, Arg}]}, LCmp, Low, HCmp, High) ->
+    {LCmp0, Low0, HCmp0, High0} = range({[{<<"$gte">>, Arg}]}, LCmp, Low, HCmp, High),
+    range({[{<<"$lte">>, <<Arg/binary, 16#10FFFF>>}]}, LCmp0, Low0, HCmp0, High0);
 range({[{<<"$lt">>, Arg}]}, LCmp, Low, HCmp, High) ->
     case range_pos(Low, Arg, High) of
         min ->
diff --git a/src/mango/src/mango_selector.erl b/src/mango/src/mango_selector.erl
index 59be7a6eb..c1b4d7c28 100644
--- a/src/mango/src/mango_selector.erl
+++ b/src/mango/src/mango_selector.erl
@@ -135,6 +135,8 @@ norm_ops({[{<<"$text">>, Arg}]}) when
     {[{<<"$default">>, {[{<<"$text">>, Arg}]}}]};
 norm_ops({[{<<"$text">>, Arg}]}) ->
     ?MANGO_ERROR({bad_arg, '$text', Arg});
+norm_ops({[{<<"$beginsWith">>, Arg}]} = Cond) when is_binary(Arg) ->
+    Cond;
 % Not technically an operator but we pass it through here
 % so that this function accepts its own output. This exists
 % so that $text can have a field name value which simplifies
@@ -514,6 +516,11 @@ match({[{<<"$mod">>, [D, R]}]}, Value, _Cmp) when is_integer(Value) ->
     Value rem D == R;
 match({[{<<"$mod">>, _}]}, _Value, _Cmp) ->
     false;
+match({[{<<"$beginsWith">>, Prefix}]}, Value, _Cmp) when is_binary(Prefix), is_binary(Value) ->
+    string:prefix(Value, Prefix) /= nomatch;
+% When Value is not a string, do not match
+match({[{<<"$beginsWith">>, Prefix}]}, _, _Cmp) when is_binary(Prefix) ->
+    false;
 match({[{<<"$regex">>, Regex}]}, Value, _Cmp) when is_binary(Value) ->
     try
         match == re:run(Value, Regex, [{capture, none}])
@@ -1054,4 +1061,29 @@ fields_nor_test() ->
     },
     ?assertEqual([<<"field1">>, <<"field2">>], fields_of(Selector2)).
 
+% $beginsWith: matches string prefixes, never non-string values; non-string operands throw.
+match_beginswith_test() ->
+    Doc =
+        {[
+            {<<"_id">>, <<"foo">>},
+            {<<"_rev">>, <<"bar">>},
+            {<<"user_id">>, 11}
+        ]},
+    Check = fun(Field, Prefix) ->
+        Selector = {[{Field, {[{<<"$beginsWith">>, Prefix}]}}]},
+        % match_int/2 avoids the confusing missing-metric ERROR in the test output.
+        match_int(mango_selector:normalize(Selector), Doc)
+    end,
+    [
+        % matching
+        ?assertEqual(true, Check(<<"_id">>, <<"f">>)),
+        % no match (user_id is not a binary string)
+        ?assertEqual(false, Check(<<"user_id">>, <<"f">>)),
+        % invalid (prefix is not a binary string)
+        ?assertThrow(
+            {mango_error, mango_selector, {invalid_operator, <<"$beginsWith">>}},
+            Check(<<"user_id">>, 1)
+        )
+    ].
+
 -endif.
diff --git a/src/mango/src/mango_selector_text.erl b/src/mango/src/mango_selector_text.erl
index 1f8609ac2..fc9280d85 100644
--- a/src/mango/src/mango_selector_text.erl
+++ b/src/mango/src/mango_selector_text.erl
@@ -142,6 +142,9 @@ convert(Path, {[{<<"$exists">>, ShouldExist}]}) ->
         true -> FieldExists;
         false -> {op_not, {FieldExists, false}}
     end;
+convert(Path, {[{<<"$beginsWith">>, Arg}]}) ->
+    PrefixSearch = [value_str(Arg), <<"*">>],
+    {op_field, {make_field(Path, Arg), PrefixSearch}};
 % We're not checking the actual type here, just looking for
 % anything that has a possibility of matching by checking
 % for the field name. We use the same logic for $exists on
diff --git a/src/mango/test/03-operator-test.py b/src/mango/test/03-operator-test.py
index 70e3fbc5f..6de11e006 100644
--- a/src/mango/test/03-operator-test.py
+++ b/src/mango/test/03-operator-test.py
@@ -141,6 +141,22 @@ class BaseOperatorTests:
             for d in docs:
                 self.assertNotIn("twitter", d)
 
+        def test_beginswith(self):
+            docs = self.db.find({"location.state": {"$beginsWith": "New"}})
+            self.assertEqual(len(docs), 2)
+            self.assertUserIds([2, 10], docs)
+
+        # non-string prefixes should return an error
+        def test_beginswith_invalid_prefix(self):
+            with self.assertRaises(Exception) as ctx:
+                self.db.find({"location.state": {"$beginsWith": 123}})
+            self.assertEqual(ctx.exception.response.status_code, 400)
+
+        # non-string values in documents should not match the prefix, nor error
+        def test_beginswith_non_string_value(self):
+            docs = self.db.find({"user_id": {"$beginsWith": "Foo"}})
+            self.assertEqual(len(docs), 0)
+
 
 class OperatorJSONTests(mango.UserDocsTests, BaseOperatorTests.Common):
     # START: text indexes do not support range queries across type boundaries so only
diff --git a/src/mango/test/25-beginswith-test.py b/src/mango/test/25-beginswith-test.py
new file mode 100644
index 000000000..76772c243
--- /dev/null
+++ b/src/mango/test/25-beginswith-test.py
@@ -0,0 +1,112 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+
+import copy
+import mango
+
+DOCS = [
+    {"_id": "aaa", "name": "Jimi", "location": "AUS", "age": 27},
+    {"_id": "abc", "name": "Eddie", "location": "AND", "age": 65},
+    {"_id": "bbb", "name": "Harry", "location": "CAN", "age": 21},
+    {"_id": "ccc", "name": "Eddie", "location": "DEN", "age": 37},
+    {"_id": "ddd", "name": "Jones", "location": "ETH", "age": 49},
+]
+
+
+def to_utf8_bytes(list):  # UTF-8 encode each item (each character when given a str)
+    return [text.encode("utf-8") for text in list]
+
+
+class BeginsWithOperator(mango.DbPerClass):
+    """Checks `$beginsWith` results and the key ranges reported by explain."""
+
+    def setUp(self):
+        self.db.recreate()
+        self.db.save_docs(copy.deepcopy(DOCS))
+        self.db.create_index(["location"])
+        self.db.create_index(["name", "location"])
+
+    def assertDocIds(self, user_ids, docs):
+        # compare order-insensitively, without sorting the caller's list in place
+        self.assertEqual(sorted(user_ids), sorted(d["_id"] for d in docs))
+
+    def test_basic(self):
+        docs = self.db.find({"location": {"$beginsWith": "A"}})
+        # DOCS holds two locations starting with "A": AUS ("aaa") and AND ("abc")
+        self.assertEqual(len(docs), 2)
+        self.assertDocIds(["aaa", "abc"], docs)
+
+    def test_json_range(self):
+        explain = self.db.find({"location": {"$beginsWith": "A"}}, explain=True)
+        self.assertEqual(explain["mrargs"]["start_key"], ["A"])
+        end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"])
+        self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b"<MAX>"])
+
+    def test_compound_key(self):
+        selector = {"name": "Eddie", "location": {"$beginsWith": "A"}}
+        explain = self.db.find(selector, explain=True)
+        # the equality on "name" fills the first key element; the prefix bounds the second
+        self.assertEqual(explain["mrargs"]["start_key"], ["Eddie", "A"])
+        end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"])
+        self.assertEqual(end_key_bytes, [b"Eddie", b"A\xef\xbf\xbd", b"<MAX>"])
+        # only "abc" is named Eddie with a location starting with "A"
+        docs = self.db.find(selector)
+        self.assertEqual(len(docs), 1)
+        self.assertDocIds(["abc"], docs)
+
+    def test_sort_asc(self):
+        selector = {"location": {"$beginsWith": "A"}}
+        explain = self.db.find(selector, sort=["location"], explain=True)
+        # ascending scan: start_key is the prefix, end_key the prefix plus the high sentinel
+        self.assertEqual(explain["mrargs"]["start_key"], ["A"])
+        end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"])
+        self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b"<MAX>"])
+        self.assertEqual(explain["mrargs"]["direction"], "fwd")
+
+    def test_sort_desc(self):
+        selector = {"location": {"$beginsWith": "A"}}
+        explain = self.db.find(selector, sort=[{"location": "desc"}], explain=True)
+        # descending scan: the bounds swap, so start_key must carry the upper bound
+        start_key_bytes = to_utf8_bytes(explain["mrargs"]["start_key"])
+        self.assertEqual(start_key_bytes, [b"A\xef\xbf\xbd", b"<MAX>"])
+        self.assertEqual(explain["mrargs"]["end_key"], ["A"])
+        self.assertEqual(explain["mrargs"]["direction"], "rev")
+
+    def test_all_docs_range(self):
+        explain = self.db.find({"_id": {"$beginsWith": "a"}}, explain=True)
+        self.assertEqual(explain["mrargs"]["start_key"], "a")
+        end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"])  # str: one item per char
+        self.assertEqual(end_key_bytes, [b"a", b"\xef\xbf\xbd"])
+
+    def test_no_index(self):
+        selector = {"foo": {"$beginsWith": "a"}}
+        resp_explain = self.db.find(selector, explain=True)
+        # nothing indexes "foo": expect the special index and an unbounded key range
+        self.assertEqual(resp_explain["index"]["type"], "special")
+        self.assertEqual(resp_explain["mrargs"]["start_key"], None)
+        self.assertEqual(resp_explain["mrargs"]["end_key"], "<MAX>")
+
+    def test_invalid_operand(self):
+        try:
+            self.db.find({"_id": {"$beginsWith": True}})
+        except Exception as e:
+            self.assertEqual(e.response.status_code, 400)
+            resp = e.response.json()
+            self.assertEqual(resp["error"], "invalid_operator")
+        else:
+            raise AssertionError("expected find error")
+
+    def test_does_not_match_non_string_value(self):
+        selector = {"age": {"$beginsWith": "a"}}
+        docs = self.db.find(selector)
+        # "age" is an integer in every DOCS entry, so a string prefix matches nothing
+        self.assertEqual(len(docs), 0)

Reply via email to