This is an automated email from the ASF dual-hosted git repository. hanahmily pushed a commit to branch analyzer-url in repository https://gitbox.apache.org/repos/asf/skywalking-banyandb.git
commit 42f8062ebac8cd382a2f6985b4474de54f0def70 Author: Gao Hongtao <[email protected]> AuthorDate: Tue Sep 24 04:57:08 2024 +0800 Improve the match analyzer Signed-off-by: Gao Hongtao <[email protected]> --- CHANGES.md | 2 + api/proto/banyandb/database/v1/schema.proto | 21 +-- api/proto/banyandb/model/v1/query.proto | 10 ++ docs/api-reference.md | 49 ++++-- docs/interacting/bydbctl/query/filter-operation.md | 35 ++++ docs/interacting/bydbctl/schema/index-rule.md | 6 +- pkg/index/index.go | 17 +- pkg/index/inverted/analyzer.go | 56 +++++++ pkg/index/inverted/analyzer_test.go | 184 +++++++++++++++++++++ pkg/index/inverted/inverted.go | 40 +++-- pkg/index/inverted/inverted_test.go | 60 +++++-- pkg/index/inverted/query.go | 5 +- pkg/query/logical/stream/index_filter.go | 7 +- .../testdata/index_rules/endpoint_name.json | 2 +- .../testdata/index_rules/searchable_name.json | 2 +- .../stream/testdata/index_rules/db.instance.json | 2 +- test/cases/measure/data/input/entity_match.yaml | 4 +- .../measure/data/testdata/endpoint_traffic.json | 8 +- test/cases/measure/data/want/entity_match.yaml | 2 +- .../index-rules/measure-default-index-rule.yaml | 112 ++++++------- .../index-rules/measure-minute-index-rule.yaml | 44 ++--- ui/src/components/IndexRule/Editor.vue | 16 +- 22 files changed, 529 insertions(+), 155 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index d08ca4fa..bc5f17de 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -27,6 +27,8 @@ Release Notes. - Add HTTP health check endpoint for the data node. - Add slow query log for the distributed query and local query. - Support applying the index rule to the tag belonging to the entity. +- Add search analyzer "url" which breaks text into tokens at any non-letter and non-digit character. +- Introduce "match_option" to the "match" query. 
### Bugs diff --git a/api/proto/banyandb/database/v1/schema.proto b/api/proto/banyandb/database/v1/schema.proto index b390acf5..db7f2d50 100644 --- a/api/proto/banyandb/database/v1/schema.proto +++ b/api/proto/banyandb/database/v1/schema.proto @@ -168,19 +168,16 @@ message IndexRule { Type type = 3 [(validate.rules).enum.defined_only = true]; // updated_at indicates when the IndexRule is updated google.protobuf.Timestamp updated_at = 4; - enum Analyzer { - ANALYZER_UNSPECIFIED = 0; - // Keyword analyzer is a “noop” analyzer which returns the entire input string as a single token. - ANALYZER_KEYWORD = 1; - // Standard analyzer provides grammar based tokenization - ANALYZER_STANDARD = 2; - // Simple analyzer breaks text into tokens at any non-letter character, - // such as numbers, spaces, hyphens and apostrophes, discards non-letter characters, - // and changes uppercase to lowercase. - ANALYZER_SIMPLE = 3; - } + // analyzer analyzes tag value to support the full-text searching for TYPE_INVERTED indices. - Analyzer analyzer = 5; + // available analyzers are: + // - "standard" provides grammar based tokenization + // - "simple" breaks text into tokens at any non-letter character, + // such as numbers, spaces, hyphens and apostrophes, discards non-letter characters, + // and changes uppercase to lowercase. + // - "keyword" is a “noop” analyzer which returns the entire input string as a single token. + // - "url" breaks text into tokens at any non-letter and non-digit character. + string analyzer = 5; // no_sort indicates whether the index is not for sorting. 
bool no_sort = 6; } diff --git a/api/proto/banyandb/model/v1/query.proto b/api/proto/banyandb/model/v1/query.proto index 2b650ec3..16a3219a 100644 --- a/api/proto/banyandb/model/v1/query.proto +++ b/api/proto/banyandb/model/v1/query.proto @@ -67,6 +67,16 @@ message Condition { string name = 1; BinaryOp op = 2; TagValue value = 3; + message MatchOption { + string analyzer = 1; + enum Operator { + OPERATOR_UNSPECIFIED = 0; + OPERATOR_AND = 1; + OPERATOR_OR = 2; + } + Operator operator = 2; + } + MatchOption match_option = 4; } // tag_families are indexed. diff --git a/docs/api-reference.md b/docs/api-reference.md index f50a82f3..552b999c 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -44,6 +44,7 @@ - [banyandb/model/v1/query.proto](#banyandb_model_v1_query-proto) - [Condition](#banyandb-model-v1-Condition) + - [Condition.MatchOption](#banyandb-model-v1-Condition-MatchOption) - [Criteria](#banyandb-model-v1-Criteria) - [LogicalExpression](#banyandb-model-v1-LogicalExpression) - [QueryOrder](#banyandb-model-v1-QueryOrder) @@ -54,6 +55,7 @@ - [TimeRange](#banyandb-model-v1-TimeRange) - [Condition.BinaryOp](#banyandb-model-v1-Condition-BinaryOp) + - [Condition.MatchOption.Operator](#banyandb-model-v1-Condition-MatchOption-Operator) - [LogicalExpression.LogicalOp](#banyandb-model-v1-LogicalExpression-LogicalOp) - [Sort](#banyandb-model-v1-Sort) @@ -73,7 +75,6 @@ - [CompressionMethod](#banyandb-database-v1-CompressionMethod) - [EncodingMethod](#banyandb-database-v1-EncodingMethod) - [FieldType](#banyandb-database-v1-FieldType) - - [IndexRule.Analyzer](#banyandb-database-v1-IndexRule-Analyzer) - [IndexRule.Type](#banyandb-database-v1-IndexRule-Type) - [TagType](#banyandb-database-v1-TagType) @@ -745,6 +746,23 @@ while for 1:N BinaryOp, values can be an array with length >= 1. 
| name | [string](#string) | | | | op | [Condition.BinaryOp](#banyandb-model-v1-Condition-BinaryOp) | | | | value | [TagValue](#banyandb-model-v1-TagValue) | | | +| match_option | [Condition.MatchOption](#banyandb-model-v1-Condition-MatchOption) | | | + + + + + + +<a name="banyandb-model-v1-Condition-MatchOption"></a> + +### Condition.MatchOption + + + +| Field | Type | Label | Description | +| ----- | ---- | ----- | ----------- | +| analyzer | [string](#string) | | | +| operator | [Condition.MatchOption.Operator](#banyandb-model-v1-Condition-MatchOption-Operator) | | | @@ -914,6 +932,19 @@ Each item in a string array is seen as a token instead of a query expression. +<a name="banyandb-model-v1-Condition-MatchOption-Operator"></a> + +### Condition.MatchOption.Operator + + +| Name | Number | Description | +| ---- | ------ | ----------- | +| OPERATOR_UNSPECIFIED | 0 | | +| OPERATOR_AND | 1 | | +| OPERATOR_OR | 2 | | + + + <a name="banyandb-model-v1-LogicalExpression-LogicalOp"></a> ### LogicalExpression.LogicalOp @@ -1001,7 +1032,7 @@ IndexRule should bind to a subject through an IndexRuleBinding to generate prope | tags | [string](#string) | repeated | tags are the combination that refers to an indexed object If the elements in tags are more than 1, the object will generate a multi-tag index Caveat: All tags in a multi-tag MUST have an identical IndexType | | type | [IndexRule.Type](#banyandb-database-v1-IndexRule-Type) | | type is the IndexType of this IndexObject. | | updated_at | [google.protobuf.Timestamp](#google-protobuf-Timestamp) | | updated_at indicates when the IndexRule is updated | -| analyzer | [IndexRule.Analyzer](#banyandb-database-v1-IndexRule-Analyzer) | | analyzer analyzes tag value to support the full-text searching for TYPE_INVERTED indices. | +| analyzer | [string](#string) | | analyzer analyzes tag value to support the full-text searching for TYPE_INVERTED indices. 
available analyzers are: - "standard" provides grammar based tokenization - "simple" breaks text into tokens at any non-letter character, such as numbers, spaces, hyphens and apostrophes, discards non-letter characters, and changes uppercase to lowercase. - "keyword" is a “noop” analyzer which returns the entire input string as a single token. - "url" breaks text into tokens at any non-letter and non-digit character. | | no_sort | [bool](#bool) | | no_sort indicates whether the index is not for sorting. | @@ -1198,20 +1229,6 @@ TopNAggregation generates offline TopN statistics for a measure's TopN appro -<a name="banyandb-database-v1-IndexRule-Analyzer"></a> - -### IndexRule.Analyzer - - -| Name | Number | Description | -| ---- | ------ | ----------- | -| ANALYZER_UNSPECIFIED | 0 | | -| ANALYZER_KEYWORD | 1 | Keyword analyzer is a “noop” analyzer which returns the entire input string as a single token. | -| ANALYZER_STANDARD | 2 | Standard analyzer provides grammar based tokenization | -| ANALYZER_SIMPLE | 3 | Simple analyzer breaks text into tokens at any non-letter character, such as numbers, spaces, hyphens and apostrophes, discards non-letter characters, and changes uppercase to lowercase. | - - - <a name="banyandb-database-v1-IndexRule-Type"></a> ### IndexRule.Type diff --git a/docs/interacting/bydbctl/query/filter-operation.md b/docs/interacting/bydbctl/query/filter-operation.md index 7283c7e5..76c6423e 100644 --- a/docs/interacting/bydbctl/query/filter-operation.md +++ b/docs/interacting/bydbctl/query/filter-operation.md @@ -65,6 +65,41 @@ criteria: value: "us" ``` +You can set a `match_option` to control the behavior of the match operation. The following are the available options: + +- `analyzer`: The analyzer to use for the match operation. If not set, the analyzer defined in the index rule will be used. Available options are defined in the [IndexRules](../schema/index-rule.md). +- `operator`: The operator to use for the match operation. The default value is `OPERATOR_OR`. Available options are `OPERATOR_OR` and `OPERATOR_AND`. 
+ +If you want to use a different analyzer and operator, you can set the `match_option` as follows: + +```shell +criteria: + condition: + name: "name" + op: "BINARY_OP_MATCH" + value: + str: + value: "service-1" + match_option: + analyzer: "url" + operator: "OPERATOR_AND" +``` + +Considering the data with the following tags: + +```shell +{ + "name": "service-1" +} +{ + "name": "service-2" +} +``` + +The above query will return the data with the tag `name` that contains both `service` and `1`, which is `service-1`. + +If you set the `operator` to `OPERATOR_OR`, the query will return the data with the tag `name` that contains either `service` or `1`, which are `service-1` and `service-2`. + ## [LogicalExpression.LogicalOp](../../../api-reference.md#logicalexpressionlogicalop) Logical operation is used to combine multiple conditions. diff --git a/docs/interacting/bydbctl/schema/index-rule.md b/docs/interacting/bydbctl/schema/index-rule.md index c15d8ff4..9f5cc41f 100644 --- a/docs/interacting/bydbctl/schema/index-rule.md +++ b/docs/interacting/bydbctl/schema/index-rule.md @@ -53,8 +53,8 @@ EOF This YAML creates an index rule which uses the tag `trace_id` to generate a `TYPE_INVERTED` index. -The `analyzer` field is optional. If it is not set, the default value is `ANALYZER_UNSPECIFIED`. -We can set it to `ANALYZER_KEYWORD` to specify the analyzer. More analyzers can refer to the [API Reference](../../../api-reference.md#indexruleanalyzer). +The `analyzer` field is optional. If it is not set, the default value is an empty string. +We can set it to `url` to specify the analyzer. The other available analyzers are listed in the [API Reference](../../../api-reference.md#indexrule). 
```shell bydbctl indexRule create -f - <<EOF metadata: @@ -63,7 +63,7 @@ metadata: tags: - trace_id type: TYPE_INVERTED -analyzer: ANALYZER_KEYWORD +analyzer: url EOF ``` diff --git a/pkg/index/index.go b/pkg/index/index.go index 08c20615..e209b36c 100644 --- a/pkg/index/index.go +++ b/pkg/index/index.go @@ -33,11 +33,24 @@ import ( "github.com/apache/skywalking-banyandb/pkg/timestamp" ) +const ( + // AnalyzerUnspecified represents an unspecified analyzer. + AnalyzerUnspecified = "" + // AnalyzerKeyword is a “noop” analyzer which returns the entire input string as a single token. + AnalyzerKeyword = "keyword" + // AnalyzerSimple breaks text into tokens at any non-letter character. + AnalyzerSimple = "simple" + // AnalyzerStandard provides grammar based tokenization. + AnalyzerStandard = "standard" + // AnalyzerURL breaks text into tokens at any non-letter and non-digit character. + AnalyzerURL = "url" +) + // FieldKey is the key of field in a document. type FieldKey struct { + Analyzer string SeriesID common.SeriesID IndexRuleID uint32 - Analyzer databasev1.IndexRule_Analyzer } // Marshal encodes f to string. @@ -168,7 +181,7 @@ type FieldIterable interface { // Searcher allows searching a field either by its key or by its key and term. type Searcher interface { FieldIterable - Match(fieldKey FieldKey, match []string) (list posting.List, err error) + Match(fieldKey FieldKey, match []string, opts *modelv1.Condition_MatchOption) (list posting.List, err error) MatchField(fieldKey FieldKey) (list posting.List, err error) MatchTerms(field Field) (list posting.List, err error) Range(fieldKey FieldKey, opts RangeOpts) (list posting.List, err error) diff --git a/pkg/index/inverted/analyzer.go b/pkg/index/inverted/analyzer.go new file mode 100644 index 00000000..7c9c4423 --- /dev/null +++ b/pkg/index/inverted/analyzer.go @@ -0,0 +1,56 @@ +// Licensed to Apache Software Foundation (ASF) under one or more contributor +// license agreements. 
See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Apache Software Foundation (ASF) licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package inverted + +import ( + "bytes" + "unicode" + + "github.com/blugelabs/bluge/analysis" + "github.com/blugelabs/bluge/analysis/tokenizer" +) + +func newURLAnalyzer() *analysis.Analyzer { + return &analysis.Analyzer{ + Tokenizer: tokenizer.NewCharacterTokenizer(func(r rune) bool { + return unicode.IsLetter(r) || unicode.IsNumber(r) + }), + TokenFilters: []analysis.TokenFilter{ + newAlphanumericFilter(), + }, + } +} + +type alphanumericFilter struct{} + +func newAlphanumericFilter() *alphanumericFilter { + return &alphanumericFilter{} +} + +func (f *alphanumericFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + termRunes := []rune{} + for _, r := range bytes.Runes(token.Term) { + if unicode.IsLetter(r) || unicode.IsNumber(r) { + termRunes = append(termRunes, r) + } + } + token.Term = analysis.BuildTermFromRunes(termRunes) + } + return input +} diff --git a/pkg/index/inverted/analyzer_test.go b/pkg/index/inverted/analyzer_test.go new file mode 100644 index 00000000..0deeb400 --- /dev/null +++ b/pkg/index/inverted/analyzer_test.go @@ -0,0 +1,184 @@ +// Licensed to Apache Software Foundation (ASF) under one or more contributor +// license agreements. 
See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Apache Software Foundation (ASF) licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package inverted + +import ( + "testing" + + "github.com/blugelabs/bluge/analysis" + "github.com/stretchr/testify/assert" +) + +func TestAlphanumericFilter(t *testing.T) { + filter := newAlphanumericFilter() + + tests := []struct { + input analysis.TokenStream + expected analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("hello123"), + }, + }, + expected: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("hello123"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("hello!@#"), + }, + }, + expected: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("hello"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("123!@#"), + }, + }, + expected: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("123"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("!@#"), + }, + }, + expected: analysis.TokenStream{ + &analysis.Token{ + Term: []byte(""), + }, + }, + }, + } + + for _, tt := range tests { + t.Run(string(tt.input[0].Term), func(t *testing.T) { + output := filter.Filter(tt.input) + assert.Equal(t, tt.expected, output) + }) + } +} + +func TestNewURLAnalyzer(t *testing.T) { + analyzer := 
newURLAnalyzer() + + tests := []struct { + input string + expected analysis.TokenStream + }{ + { + input: "http://example.com", + expected: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("http"), + }, + &analysis.Token{ + Term: []byte("example"), + }, + &analysis.Token{ + Term: []byte("com"), + }, + }, + }, + { + input: "https://www.example.com/path?query=123", + expected: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("https"), + }, + &analysis.Token{ + Term: []byte("www"), + }, + &analysis.Token{ + Term: []byte("example"), + }, + &analysis.Token{ + Term: []byte("com"), + }, + &analysis.Token{ + Term: []byte("path"), + }, + &analysis.Token{ + Term: []byte("query"), + }, + &analysis.Token{ + Term: []byte("123"), + }, + }, + }, + { + input: "ftp://user:[email protected]:21/path", + expected: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("ftp"), + }, + &analysis.Token{ + Term: []byte("user"), + }, + &analysis.Token{ + Term: []byte("pass"), + }, + &analysis.Token{ + Term: []byte("ftp"), + }, + &analysis.Token{ + Term: []byte("example"), + }, + &analysis.Token{ + Term: []byte("com"), + }, + &analysis.Token{ + Term: []byte("21"), + }, + &analysis.Token{ + Term: []byte("path"), + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + tokenStream := analyzer.Analyze([]byte(tt.input)) + assert.Equal(t, extractTerms(tt.expected), extractTerms(tokenStream)) + }) + } +} + +func extractTerms(tokenStream analysis.TokenStream) [][]byte { + terms := make([][]byte, len(tokenStream)) + for i, token := range tokenStream { + terms[i] = token.Term + } + return terms +} diff --git a/pkg/index/inverted/inverted.go b/pkg/index/inverted/inverted.go index a3915bae..aa43d07c 100644 --- a/pkg/index/inverted/inverted.go +++ b/pkg/index/inverted/inverted.go @@ -35,7 +35,6 @@ import ( "go.uber.org/multierr" "github.com/apache/skywalking-banyandb/api/common" - databasev1 
"github.com/apache/skywalking-banyandb/api/proto/banyandb/database/v1" modelv1 "github.com/apache/skywalking-banyandb/api/proto/banyandb/model/v1" "github.com/apache/skywalking-banyandb/pkg/convert" "github.com/apache/skywalking-banyandb/pkg/index" @@ -63,13 +62,14 @@ var ( ) // Analyzers is a map that associates each IndexRule_Analyzer type with a corresponding Analyzer. -var Analyzers map[databasev1.IndexRule_Analyzer]*analysis.Analyzer +var Analyzers map[string]*analysis.Analyzer func init() { - Analyzers = map[databasev1.IndexRule_Analyzer]*analysis.Analyzer{ - databasev1.IndexRule_ANALYZER_KEYWORD: analyzer.NewKeywordAnalyzer(), - databasev1.IndexRule_ANALYZER_SIMPLE: analyzer.NewSimpleAnalyzer(), - databasev1.IndexRule_ANALYZER_STANDARD: analyzer.NewStandardAnalyzer(), + Analyzers = map[string]*analysis.Analyzer{ + index.AnalyzerKeyword: analyzer.NewKeywordAnalyzer(), + index.AnalyzerSimple: analyzer.NewSimpleAnalyzer(), + index.AnalyzerStandard: analyzer.NewStandardAnalyzer(), + index.AnalyzerURL: newURLAnalyzer(), } } @@ -126,7 +126,7 @@ func (s *store) Batch(batch index.Batch) error { if f.Store { tf.StoreValue() } - if f.Key.Analyzer != databasev1.IndexRule_ANALYZER_UNSPECIFIED { + if f.Key.Analyzer != index.AnalyzerUnspecified { tf = tf.WithAnalyzer(Analyzers[f.Key.Analyzer]) } doc.AddField(tf) @@ -156,7 +156,7 @@ func NewStore(opts StoreOpts) (index.SeriesStore, error) { WithPersisterNapTimeMSec(int(opts.BatchWaitSec * 1000)) } config := bluge.DefaultConfigWithIndexConfig(indexConfig) - config.DefaultSearchAnalyzer = Analyzers[databasev1.IndexRule_ANALYZER_KEYWORD] + config.DefaultSearchAnalyzer = Analyzers[index.AnalyzerKeyword] config.Logger = log.New(opts.Logger, opts.Logger.Module(), 0) w, err := bluge.OpenWriter(config) if err != nil { @@ -282,15 +282,15 @@ func (s *store) MatchTerms(field index.Field) (list posting.List, err error) { return list, err } -func (s *store) Match(fieldKey index.FieldKey, matches []string) (posting.List, error) { - if 
len(matches) == 0 || fieldKey.Analyzer == databasev1.IndexRule_ANALYZER_UNSPECIFIED { +func (s *store) Match(fieldKey index.FieldKey, matches []string, opts *modelv1.Condition_MatchOption) (posting.List, error) { + if len(matches) == 0 || fieldKey.Analyzer == index.AnalyzerUnspecified { return roaring.DummyPostingList, nil } reader, err := s.writer.Reader() if err != nil { return nil, err } - analyzer := Analyzers[fieldKey.Analyzer] + analyzer, operator := getMatchOptions(fieldKey.Analyzer, opts) fk := fieldKey.Marshal() query := bluge.NewBooleanQuery() if fieldKey.HasSeriesID() { @@ -298,7 +298,7 @@ func (s *store) Match(fieldKey index.FieldKey, matches []string) (posting.List, } for _, m := range matches { query.AddMust(bluge.NewMatchQuery(m).SetField(fk). - SetAnalyzer(analyzer)) + SetAnalyzer(analyzer).SetOperator(operator)) } documentMatchIterator, err := reader.Search(context.Background(), bluge.NewAllMatches(query)) if err != nil { @@ -315,6 +315,22 @@ func (s *store) Match(fieldKey index.FieldKey, matches []string) (posting.List, return list, err } +func getMatchOptions(analyzerOnIndexRule string, opts *modelv1.Condition_MatchOption) (*analysis.Analyzer, bluge.MatchQueryOperator) { + analyzer := Analyzers[analyzerOnIndexRule] + operator := bluge.MatchQueryOperatorOr + if opts != nil { + if opts.Analyzer != index.AnalyzerUnspecified { + analyzer = Analyzers[opts.Analyzer] + } + if opts.Operator != modelv1.Condition_MatchOption_OPERATOR_UNSPECIFIED { + if opts.Operator == modelv1.Condition_MatchOption_OPERATOR_AND { + operator = bluge.MatchQueryOperatorAnd + } + } + } + return analyzer, bluge.MatchQueryOperator(operator) +} + func (s *store) Range(fieldKey index.FieldKey, opts index.RangeOpts) (list posting.List, err error) { iter, err := s.Iterator(fieldKey, opts, modelv1.Sort_SORT_ASC, defaultRangePreloadSize, nil, nil) if err != nil { diff --git a/pkg/index/inverted/inverted_test.go b/pkg/index/inverted/inverted_test.go index 07b7ffa7..58b086ef 100644 --- 
a/pkg/index/inverted/inverted_test.go +++ b/pkg/index/inverted/inverted_test.go @@ -25,7 +25,7 @@ import ( "github.com/stretchr/testify/require" "github.com/apache/skywalking-banyandb/api/common" - databasev1 "github.com/apache/skywalking-banyandb/api/proto/banyandb/database/v1" + modelv1 "github.com/apache/skywalking-banyandb/api/proto/banyandb/model/v1" "github.com/apache/skywalking-banyandb/pkg/index" "github.com/apache/skywalking-banyandb/pkg/index/posting" "github.com/apache/skywalking-banyandb/pkg/index/posting/roaring" @@ -51,14 +51,15 @@ func TestStore_Match(t *testing.T) { // http_method IndexRuleID: 6, SeriesID: common.SeriesID(11), - Analyzer: databasev1.IndexRule_ANALYZER_SIMPLE, + Analyzer: index.AnalyzerURL, } setup(tester, s, serviceName) tests := []struct { - want posting.List - matches []string - wantErr bool + want posting.List + matches []string + operator modelv1.Condition_MatchOption_Operator + wantErr bool }{ { matches: []string{"root"}, @@ -76,10 +77,20 @@ func TestStore_Match(t *testing.T) { matches: []string{"/root/product"}, want: roaring.NewPostingListWithInitialData(1, 2), }, + { + matches: []string{"/root/product"}, + operator: modelv1.Condition_MatchOption_OPERATOR_AND, + want: roaring.NewPostingListWithInitialData(2), + }, { matches: []string{"/product/order"}, want: roaring.NewPostingListWithInitialData(1, 2, 3), }, + { + matches: []string{"/product/order"}, + operator: modelv1.Condition_MatchOption_OPERATOR_AND, + want: roaring.NewPostingListWithInitialData(1), + }, { matches: []string{"GET"}, want: roaring.NewPostingListWithInitialData(1, 2), @@ -116,12 +127,22 @@ func TestStore_Match(t *testing.T) { matches: []string{"test"}, want: roaring.NewPostingListWithInitialData(), }, + { + matches: []string{"v1"}, + want: roaring.NewPostingListWithInitialData(4), + }, + { + matches: []string{"v2"}, + want: roaring.NewPostingListWithInitialData(5), + }, } for _, tt := range tests { name := strings.Join(tt.matches, "-") t.Run(name, func(t 
*testing.T) { tester := assert.New(t) - list, err := s.Match(serviceName, tt.matches) + list, err := s.Match(serviceName, tt.matches, &modelv1.Condition_MatchOption{ + Operator: tt.operator, + }) if tt.wantErr { tester.Error(err) return @@ -148,14 +169,15 @@ func TestStore_SeriesMatch(t *testing.T) { serviceName := index.FieldKey{ // http_method IndexRuleID: 6, - Analyzer: databasev1.IndexRule_ANALYZER_SIMPLE, + Analyzer: index.AnalyzerURL, } setupSeries(tester, s, serviceName) tests := []struct { - want posting.List - matches []string - wantErr bool + want posting.List + matches []string + operator modelv1.Condition_MatchOption_Operator + wantErr bool }{ { matches: []string{"test"}, @@ -173,7 +195,9 @@ func TestStore_SeriesMatch(t *testing.T) { for _, tt := range tests { name := strings.Join(tt.matches, " and ") t.Run(name, func(t *testing.T) { - list, err := s.Match(serviceName, tt.matches) + list, err := s.Match(serviceName, tt.matches, &modelv1.Condition_MatchOption{ + Operator: tt.operator, + }) if tt.wantErr { tester.Error(err) return @@ -209,6 +233,20 @@ func setup(tester *require.Assertions, s index.Store, serviceName index.FieldKey }}, DocID: 3, }, + index.Document{ + Fields: []index.Field{{ + Key: serviceName, + Term: []byte("/svc1/v1/user"), + }}, + DocID: 4, + }, + index.Document{ + Fields: []index.Field{{ + Key: serviceName, + Term: []byte("/svc1/v2/user"), + }}, + DocID: 5, + }, ) tester.NoError(s.Batch(batch)) } diff --git a/pkg/index/inverted/query.go b/pkg/index/inverted/query.go index fdd5fc4b..f4487352 100644 --- a/pkg/index/inverted/query.go +++ b/pkg/index/inverted/query.go @@ -183,7 +183,8 @@ func parseConditionToQuery(cond *modelv1.Condition, indexRule *databasev1.IndexR node := newTermNode(str, indexRule) return &queryNode{query, node}, [][]*modelv1.TagValue{entity}, false, nil case modelv1.Condition_BINARY_OP_MATCH: - query := bluge.NewMatchQuery(term).SetField(field).SetAnalyzer(Analyzers[indexRule.Analyzer]) + analyzer, operator := 
getMatchOptions(indexRule.Analyzer, cond.MatchOption) + query := bluge.NewMatchQuery(term).SetField(field).SetAnalyzer(analyzer).SetOperator(operator) node := newMatchNode(str, indexRule) return &queryNode{query, node}, [][]*modelv1.TagValue{entity}, false, nil case modelv1.Condition_BINARY_OP_NE: @@ -416,7 +417,7 @@ func (m *matchNode) MarshalJSON() ([]byte, error) { inner := make(map[string]interface{}, 1) inner["index"] = m.indexRule.Metadata.Name + ":" + m.indexRule.Metadata.Group inner["value"] = m.match - inner["analyzer"] = databasev1.IndexRule_Analyzer_name[int32(m.indexRule.Analyzer)] + inner["analyzer"] = m.indexRule.Analyzer data := make(map[string]interface{}, 1) data["match"] = inner return json.Marshal(data) diff --git a/pkg/query/logical/stream/index_filter.go b/pkg/query/logical/stream/index_filter.go index 533000f1..fa8852b6 100644 --- a/pkg/query/logical/stream/index_filter.go +++ b/pkg/query/logical/stream/index_filter.go @@ -126,7 +126,7 @@ func parseConditionToFilter(cond *modelv1.Condition, indexRule *databasev1.Index case modelv1.Condition_BINARY_OP_EQ: return newEq(indexRule, expr), [][]*modelv1.TagValue{entity}, nil case modelv1.Condition_BINARY_OP_MATCH: - return newMatch(indexRule, expr), [][]*modelv1.TagValue{entity}, nil + return newMatch(indexRule, expr, cond.MatchOption), [][]*modelv1.TagValue{entity}, nil case modelv1.Condition_BINARY_OP_NE: return newNot(indexRule, newEq(indexRule, expr)), [][]*modelv1.TagValue{entity}, nil case modelv1.Condition_BINARY_OP_HAVING: @@ -409,14 +409,16 @@ func (eq *eq) String() string { type match struct { *leaf + opts *modelv1.Condition_MatchOption } -func newMatch(indexRule *databasev1.IndexRule, values logical.LiteralExpr) *match { +func newMatch(indexRule *databasev1.IndexRule, values logical.LiteralExpr, opts *modelv1.Condition_MatchOption) *match { return &match{ leaf: &leaf{ Key: newFieldKey(indexRule), Expr: values, }, + opts: opts, } } @@ -433,6 +435,7 @@ func (match *match) Execute(searcher 
index.GetSearcher, seriesID common.SeriesID return s.Match( match.Key.toIndex(seriesID), matches, + match.opts, ) } diff --git a/pkg/test/measure/testdata/index_rules/endpoint_name.json b/pkg/test/measure/testdata/index_rules/endpoint_name.json index 10738558..b75806a7 100644 --- a/pkg/test/measure/testdata/index_rules/endpoint_name.json +++ b/pkg/test/measure/testdata/index_rules/endpoint_name.json @@ -8,6 +8,6 @@ "endpoint_name" ], "type": "TYPE_INVERTED", - "analyzer": "ANALYZER_SIMPLE", + "analyzer": "url", "updated_at": "2021-04-15T01:30:15.01Z" } \ No newline at end of file diff --git a/pkg/test/measure/testdata/index_rules/searchable_name.json b/pkg/test/measure/testdata/index_rules/searchable_name.json index afbc37bf..73c3a2f2 100644 --- a/pkg/test/measure/testdata/index_rules/searchable_name.json +++ b/pkg/test/measure/testdata/index_rules/searchable_name.json @@ -8,6 +8,6 @@ "name" ], "type": "TYPE_INVERTED", - "analyzer": "ANALYZER_SIMPLE", + "analyzer": "url", "updated_at": "2021-04-15T01:30:15.01Z" } \ No newline at end of file diff --git a/pkg/test/stream/testdata/index_rules/db.instance.json b/pkg/test/stream/testdata/index_rules/db.instance.json index 691a703e..85c3c6ad 100644 --- a/pkg/test/stream/testdata/index_rules/db.instance.json +++ b/pkg/test/stream/testdata/index_rules/db.instance.json @@ -8,6 +8,6 @@ "db.instance" ], "type": "TYPE_INVERTED", - "analyzer": "ANALYZER_SIMPLE", + "analyzer": "url", "updated_at": "2021-04-15T01:30:15.01Z" } diff --git a/test/cases/measure/data/input/entity_match.yaml b/test/cases/measure/data/input/entity_match.yaml index ce37d3cc..39a41e34 100644 --- a/test/cases/measure/data/input/entity_match.yaml +++ b/test/cases/measure/data/input/entity_match.yaml @@ -28,9 +28,11 @@ criteria: condition: name: "endpoint_name" op: "BINARY_OP_MATCH" + match_option: + operator: "OPERATOR_AND" value: str: - value: "foo" + value: "endpoint-1" left: condition: name: "service_id" diff --git 
a/test/cases/measure/data/testdata/endpoint_traffic.json b/test/cases/measure/data/testdata/endpoint_traffic.json index bf43fb9d..881908c3 100644 --- a/test/cases/measure/data/testdata/endpoint_traffic.json +++ b/test/cases/measure/data/testdata/endpoint_traffic.json @@ -10,7 +10,7 @@ }, { "str": { - "value": "/api/v1/foo" + "value": "/api/v1/endpoint-1" } } ] @@ -28,7 +28,7 @@ }, { "str": { - "value": "/api/v1/bar" + "value": "/api/v1/endpoint-2" } } ] @@ -46,7 +46,7 @@ }, { "str": { - "value": "/api/v1/foo" + "value": "/api/v1/endpoint-1" } } ] @@ -64,7 +64,7 @@ }, { "str": { - "value": "/api/v1/bar" + "value": "/api/v1/endpoint-2" } } ] diff --git a/test/cases/measure/data/want/entity_match.yaml b/test/cases/measure/data/want/entity_match.yaml index ccd79306..a6617241 100644 --- a/test/cases/measure/data/want/entity_match.yaml +++ b/test/cases/measure/data/want/entity_match.yaml @@ -26,4 +26,4 @@ dataPoints: - key: endpoint_name value: str: - value: /api/v1/foo + value: /api/v1/endpoint-1 diff --git a/test/stress/istio/testdata/index-rules/measure-default-index-rule.yaml b/test/stress/istio/testdata/index-rules/measure-default-index-rule.yaml index cc95f2a3..31c789bb 100644 --- a/test/stress/istio/testdata/index-rules/measure-default-index-rule.yaml +++ b/test/stress/istio/testdata/index-rules/measure-default-index-rule.yaml @@ -1,4 +1,4 @@ -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "43" group: measure-default @@ -6,10 +6,10 @@ modRevision: "43" name: agent_id tags: - - agent_id + - agent_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "49" group: measure-default @@ -17,10 +17,10 @@ modRevision: "49" name: dest_endpoint tags: - - dest_endpoint + - dest_endpoint type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "62" group: measure-default @@ -28,10 +28,10 @@ modRevision: "62" name: dest_service_id tags: 
- - dest_service_id + - dest_service_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "63" group: measure-default @@ -39,10 +39,10 @@ modRevision: "63" name: dest_service_instance_id tags: - - dest_service_instance_id + - dest_service_instance_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "42" group: measure-default @@ -50,10 +50,10 @@ modRevision: "42" name: detect_type tags: - - detect_type + - detect_type type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "136" group: measure-default @@ -61,10 +61,10 @@ modRevision: "136" name: end_time tags: - - end_time + - end_time type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "131" group: measure-default @@ -72,10 +72,10 @@ modRevision: "131" name: endpoint tags: - - endpoint + - endpoint type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "21" group: measure-default @@ -83,10 +83,10 @@ modRevision: "21" name: id tags: - - id + - id type: TYPE_INVERTED updatedAt: "2023-05-22T18:03:28.011340226Z" -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "23" group: measure-default @@ -94,10 +94,10 @@ modRevision: "23" name: last_ping tags: - - last_ping + - last_ping type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "38" group: measure-default @@ -105,10 +105,10 @@ modRevision: "38" name: last_update_time_bucket tags: - - last_update_time_bucket + - last_update_time_bucket type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "137" group: measure-default @@ -116,10 +116,10 @@ modRevision: "137" name: layer tags: - - layer + - layer type: TYPE_INVERTED updatedAt: null -- 
analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "134" group: measure-default @@ -127,10 +127,10 @@ modRevision: "134" name: message tags: - - message + - message type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "132" group: measure-default @@ -138,10 +138,10 @@ modRevision: "132" name: name tags: - - name + - name type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "156" group: measure-default @@ -149,10 +149,10 @@ modRevision: "156" name: process_id tags: - - process_id + - process_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "44" group: measure-default @@ -160,10 +160,10 @@ modRevision: "44" name: profiling_support_status tags: - - profiling_support_status + - profiling_support_status type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "36" group: measure-default @@ -171,10 +171,10 @@ modRevision: "36" name: represent_service_id tags: - - represent_service_id + - represent_service_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "37" group: measure-default @@ -182,10 +182,10 @@ modRevision: "37" name: represent_service_instance_id tags: - - represent_service_instance_id + - represent_service_instance_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "129" group: measure-default @@ -193,10 +193,10 @@ modRevision: "129" name: service tags: - - service + - service type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "110" group: measure-default @@ -204,10 +204,10 @@ modRevision: "110" name: service_group tags: - - service_group + - service_group type: TYPE_INVERTED updatedAt: null -- analyzer: 
ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "41" group: measure-default @@ -215,10 +215,10 @@ modRevision: "41" name: service_id tags: - - service_id + - service_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "130" group: measure-default @@ -226,10 +226,10 @@ modRevision: "130" name: service_instance tags: - - service_instance + - service_instance type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "109" group: measure-default @@ -237,10 +237,10 @@ modRevision: "109" name: short_name tags: - - short_name + - short_name type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "48" group: measure-default @@ -248,10 +248,10 @@ modRevision: "48" name: source_endpoint tags: - - source_endpoint + - source_endpoint type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "60" group: measure-default @@ -259,10 +259,10 @@ modRevision: "60" name: source_service_id tags: - - source_service_id + - source_service_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "61" group: measure-default @@ -270,10 +270,10 @@ modRevision: "61" name: source_service_instance_id tags: - - source_service_instance_id + - source_service_instance_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "135" group: measure-default @@ -281,10 +281,10 @@ modRevision: "135" name: start_time tags: - - start_time + - start_time type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "155" group: measure-default @@ -292,10 +292,10 @@ modRevision: "155" name: task_id tags: - - task_id + - task_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: 
"" metadata: createRevision: "133" group: measure-default @@ -303,7 +303,7 @@ modRevision: "133" name: type tags: - - type + - type type: TYPE_INVERTED updatedAt: null diff --git a/test/stress/istio/testdata/index-rules/measure-minute-index-rule.yaml b/test/stress/istio/testdata/index-rules/measure-minute-index-rule.yaml index 4180da1d..7274d164 100644 --- a/test/stress/istio/testdata/index-rules/measure-minute-index-rule.yaml +++ b/test/stress/istio/testdata/index-rules/measure-minute-index-rule.yaml @@ -1,4 +1,4 @@ -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "57" group: measure-minute @@ -6,10 +6,10 @@ modRevision: "57" name: dest_endpoint tags: - - dest_endpoint + - dest_endpoint type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "82" group: measure-minute @@ -17,10 +17,10 @@ modRevision: "82" name: dest_process_id tags: - - dest_process_id + - dest_process_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "70" group: measure-minute @@ -28,10 +28,10 @@ modRevision: "70" name: dest_service_id tags: - - dest_service_id + - dest_service_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "71" group: measure-minute @@ -39,10 +39,10 @@ modRevision: "71" name: dest_service_instance_id tags: - - dest_service_instance_id + - dest_service_instance_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "55" group: measure-minute @@ -50,10 +50,10 @@ modRevision: "55" name: id tags: - - id + - id type: TYPE_INVERTED updatedAt: "2023-05-22T18:03:28.196387339Z" -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "412" group: measure-minute @@ -61,10 +61,10 @@ modRevision: "412" name: service_id tags: - - service_id + - service_id type: TYPE_INVERTED updatedAt: null -- 
analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "80" group: measure-minute @@ -72,10 +72,10 @@ modRevision: "80" name: service_instance_id tags: - - service_instance_id + - service_instance_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "56" group: measure-minute @@ -83,10 +83,10 @@ modRevision: "56" name: source_endpoint tags: - - source_endpoint + - source_endpoint type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "81" group: measure-minute @@ -94,10 +94,10 @@ modRevision: "81" name: source_process_id tags: - - source_process_id + - source_process_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "68" group: measure-minute @@ -105,10 +105,10 @@ modRevision: "68" name: source_service_id tags: - - source_service_id + - source_service_id type: TYPE_INVERTED updatedAt: null -- analyzer: ANALYZER_UNSPECIFIED +- analyzer: "" metadata: createRevision: "69" group: measure-minute @@ -116,7 +116,7 @@ modRevision: "69" name: source_service_instance_id tags: - - source_service_instance_id + - source_service_instance_id type: TYPE_INVERTED updatedAt: null diff --git a/ui/src/components/IndexRule/Editor.vue b/ui/src/components/IndexRule/Editor.vue index 56e2543c..b7dd174b 100644 --- a/ui/src/components/IndexRule/Editor.vue +++ b/ui/src/components/IndexRule/Editor.vue @@ -66,20 +66,20 @@ const typeList = [ ] const analyzerList = [ { - label: "ANALYZER_UNSPECIFIED", - value: "ANALYZER_UNSPECIFIED" + label: "unspecified", + value: "" }, { - label: "ANALYZER_KEYWORD", - value: "ANALYZER_KEYWORD" + label: "keyword", + value: "keyword" }, { - label: "ANALYZER_STANDARD", - value: "ANALYZER_STANDARD" + label: "standard", + value: "standard" }, { - label: "ANALYZER_SIMPLE", - value: "ANALYZER_SIMPLE" + label: "simple", + value: "simple" } ] const data = reactive({
