This is an automated email from the ASF dual-hosted git repository.
maedhroz pushed a commit to branch cep-7-sai
in repository https://gitbox.apache.org/repos/asf/cassandra.git
The following commit(s) were added to refs/heads/cep-7-sai by this push:
new 05dd58783a Add basic text analysis to SAI, including "case_sensitive",
"normalize", and "ascii" modes
05dd58783a is described below
commit 05dd58783a83c3bd2073f8f96924a99a73cc4f8a
Author: Mike Adamson <[email protected]>
AuthorDate: Tue May 9 12:29:01 2023 +0100
Add basic text analysis to SAI, including "case_sensitive", "normalize",
and "ascii" modes
patch by Mike Adamson; reviewed by Caleb Rackliffe and Andres de la Peña
for CASSANDRA-18479
---
.build/cassandra-deps-template.xml | 8 +-
.build/parent-pom-template.xml | 10 +-
.../apache/cassandra/index/sai/IndexContext.java | 31 ++--
.../cassandra/index/sai/StorageAttachedIndex.java | 6 +-
.../index/sai/analyzer/AbstractAnalyzer.java | 26 ++++
.../index/sai/analyzer/NonTokenizingAnalyzer.java | 147 +++++++++++++++++++
.../index/sai/analyzer/NonTokenizingOptions.java | 156 +++++++++++++++++++++
.../index/sai/analyzer/filter/BasicFilters.java | 82 +++++++++++
.../index/sai/analyzer/filter/FilterPipeline.java | 69 +++++++++
.../analyzer/filter/FilterPipelineExecutor.java | 40 ++++++
.../index/sai/disk/v1/SSTableIndexWriter.java | 2 +-
.../index/sai/memory/TrieMemoryIndex.java | 2 +-
.../cassandra/index/sai/plan/Expression.java | 2 +-
.../apache/cassandra/index/sai/plan/Operation.java | 2 +-
.../sai/plan/StorageAttachedIndexSearcher.java | 2 +-
.../index/sai/virtual/ColumnIndexesSystemView.java | 2 +-
.../index/sasi/analyzer/filter/StemmerFactory.java | 55 +++++---
.../test/sai/ReplicaFilteringProtectionTest.java | 67 +++++++++
.../sai/analyzer/NonTokenizingAnalyzerTest.java | 75 ++++++++++
.../sai/analyzer/filter/BasicFiltersTest.java | 73 ++++++++++
.../index/sai/cql/StorageAttachedIndexDDLTest.java | 106 ++++++++++++++
.../index/sai/virtual/IndexesSystemViewTest.java | 2 +-
22 files changed, 904 insertions(+), 61 deletions(-)
diff --git a/.build/cassandra-deps-template.xml
b/.build/cassandra-deps-template.xml
index c2194a0ab7..76e3bfdd4c 100644
--- a/.build/cassandra-deps-template.xml
+++ b/.build/cassandra-deps-template.xml
@@ -328,10 +328,6 @@
<groupId>de.jflex</groupId>
<artifactId>jflex</artifactId>
</dependency>
- <dependency>
- <groupId>com.github.rholder</groupId>
- <artifactId>snowball-stemmer</artifactId>
- </dependency>
<dependency>
<groupId>com.googlecode.concurrent-trees</groupId>
<artifactId>concurrent-trees</artifactId>
@@ -380,5 +376,9 @@
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analysis-common</artifactId>
+ </dependency>
</dependencies>
</project>
diff --git a/.build/parent-pom-template.xml b/.build/parent-pom-template.xml
index 6256f1339a..d4e09aafaf 100644
--- a/.build/parent-pom-template.xml
+++ b/.build/parent-pom-template.xml
@@ -874,11 +874,6 @@
</exclusion>
</exclusions>
</dependency>
- <dependency>
- <groupId>com.github.rholder</groupId>
- <artifactId>snowball-stemmer</artifactId>
- <version>1.3.0.581.1</version>
- </dependency>
<dependency>
<groupId>com.googlecode.concurrent-trees</groupId>
<artifactId>concurrent-trees</artifactId>
@@ -1047,6 +1042,11 @@
<artifactId>lucene-core</artifactId>
<version>9.7.0</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analysis-common</artifactId>
+ <version>9.7.0</version>
+ </dependency>
<dependency>
<groupId>com.carrotsearch.randomizedtesting</groupId>
<artifactId>randomizedtesting-runner</artifactId>
diff --git a/src/java/org/apache/cassandra/index/sai/IndexContext.java
b/src/java/org/apache/cassandra/index/sai/IndexContext.java
index 3e18afe346..59e25777bc 100644
--- a/src/java/org/apache/cassandra/index/sai/IndexContext.java
+++ b/src/java/org/apache/cassandra/index/sai/IndexContext.java
@@ -90,8 +90,7 @@ public class IndexContext
private final IndexViewManager viewManager;
private final IndexMetrics indexMetrics;
private final ColumnQueryMetrics columnQueryMetrics;
- private final AbstractAnalyzer.AnalyzerFactory indexAnalyzerFactory;
- private final AbstractAnalyzer.AnalyzerFactory queryAnalyzerFactory;
+ private final AbstractAnalyzer.AnalyzerFactory analyzerFactory;
private final PrimaryKey.Factory primaryKeyFactory;
public IndexContext(String keyspace,
@@ -119,9 +118,8 @@ public class IndexContext
this.columnQueryMetrics = isLiteral() ? new
ColumnQueryMetrics.TrieIndexMetrics(this)
: new
ColumnQueryMetrics.BalancedTreeIndexMetrics(this);
- // We currently only support the NoOpAnalyzer
- this.indexAnalyzerFactory =
AbstractAnalyzer.fromOptions(getValidator(), Collections.emptyMap());
- this.queryAnalyzerFactory =
AbstractAnalyzer.fromOptions(getValidator(), Collections.emptyMap());
+ this.analyzerFactory = indexMetadata == null ?
AbstractAnalyzer.fromOptions(getValidator(), Collections.emptyMap())
+ :
AbstractAnalyzer.fromOptions(getValidator(), indexMetadata.options);
}
public AbstractType<?> keyValidator()
@@ -199,23 +197,12 @@ public class IndexContext
}
/**
- * Returns an {@code AnalyzerFactory} for use by the write path to
transform incoming literal
- * during indexing. The analyzers can be tokenising or non-tokenising.
Tokenising analyzers
- * will split the incoming terms into multiple terms in the index while
non-tokenising analyzers
- * will not split the incoming term but will transform the term (e.g.
case-insensitive)
+ * Returns an {@link AbstractAnalyzer.AnalyzerFactory} for use by write
and query paths to transform
+ * literal values.
*/
- public AbstractAnalyzer.AnalyzerFactory getIndexAnalyzerFactory()
+ public AbstractAnalyzer.AnalyzerFactory getAnalyzerFactory()
{
- return indexAnalyzerFactory;
- }
-
- /**
- * Return an {@code AnalyzerFactory} for use by the query path to
transform query terms before
- * searching for them in the index. This can be the same as the
indexAnalyzerFactory.
- */
- public AbstractAnalyzer.AnalyzerFactory getQueryAnalyzerFactory()
- {
- return queryAnalyzerFactory;
+ return analyzerFactory;
}
public View getView()
@@ -256,9 +243,7 @@ public class IndexContext
public void invalidate()
{
viewManager.invalidate();
- indexAnalyzerFactory.close();
- if (queryAnalyzerFactory != indexAnalyzerFactory)
- queryAnalyzerFactory.close();
+ analyzerFactory.close();
if (memtableIndexManager != null)
memtableIndexManager.invalidate();
if (indexMetrics != null)
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
index 543c14e8a2..03f006dda7 100644
--- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
@@ -74,6 +74,7 @@ import org.apache.cassandra.index.IndexRegistry;
import org.apache.cassandra.index.SecondaryIndexBuilder;
import org.apache.cassandra.index.TargetParser;
import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
+import org.apache.cassandra.index.sai.analyzer.NonTokenizingOptions;
import org.apache.cassandra.index.sai.disk.SSTableIndex;
import org.apache.cassandra.index.sai.disk.format.IndexDescriptor;
import org.apache.cassandra.index.sai.disk.format.Version;
@@ -139,7 +140,10 @@ public class StorageAttachedIndex implements Index
private static final StorageAttachedIndexBuildingSupport
INDEX_BUILDER_SUPPORT = new StorageAttachedIndexBuildingSupport();
private static final Set<String> VALID_OPTIONS =
ImmutableSet.of(IndexTarget.TARGET_OPTION_NAME,
-
IndexTarget.CUSTOM_INDEX_OPTION_NAME);
+
IndexTarget.CUSTOM_INDEX_OPTION_NAME,
+
NonTokenizingOptions.CASE_SENSITIVE,
+
NonTokenizingOptions.NORMALIZE,
+
NonTokenizingOptions.ASCII);
public static final Set<CQL3Type> SUPPORTED_TYPES =
ImmutableSet.of(CQL3Type.Native.ASCII, CQL3Type.Native.BIGINT,
CQL3Type.Native.DATE,
CQL3Type.Native.DOUBLE, CQL3Type.Native.FLOAT, CQL3Type.Native.INT,
diff --git
a/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java
b/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java
index d1e7bae2fe..1f5e339657 100644
--- a/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java
@@ -24,10 +24,13 @@ import java.util.Map;
import java.util.NoSuchElementException;
import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.StringType;
+import org.apache.cassandra.exceptions.InvalidRequestException;
public abstract class AbstractAnalyzer implements Iterator<ByteBuffer>
{
protected ByteBuffer next = null;
+ protected String nextLiteral = null;
/**
* @return true if index value is transformed, e.g. normalized or
lower-cased or tokenized.
@@ -65,6 +68,7 @@ public abstract class AbstractAnalyzer implements
Iterator<ByteBuffer>
public void reset(ByteBuffer input)
{
this.next = null;
+ this.nextLiteral = null;
resetInternal(input);
}
@@ -80,6 +84,28 @@ public abstract class AbstractAnalyzer implements
Iterator<ByteBuffer>
public static AnalyzerFactory fromOptions(AbstractType<?> type,
Map<String, String> options)
{
+ if (hasNonTokenizingOptions(options))
+ {
+ if (type instanceof StringType)
+ {
+ // validate options
+ NonTokenizingOptions.fromMap(options);
+ return () -> new NonTokenizingAnalyzer(type, options);
+ }
+ else
+ {
+ throw new InvalidRequestException("CQL type " +
type.asCQL3Type() + " cannot be analyzed.");
+ }
+ }
+
return NoOpAnalyzer::new;
}
+
+ private static boolean hasNonTokenizingOptions(Map<String, String> options)
+ {
+ return options.get(NonTokenizingOptions.ASCII) != null ||
+ options.containsKey(NonTokenizingOptions.CASE_SENSITIVE) ||
+ options.containsKey(NonTokenizingOptions.NORMALIZE);
+ }
+
}
diff --git
a/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzer.java
b/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzer.java
new file mode 100644
index 0000000000..f58158a823
--- /dev/null
+++
b/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzer.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer;
+
+import java.nio.ByteBuffer;
+import java.util.Map;
+
+import com.google.common.base.MoreObjects;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.StringType;
+import org.apache.cassandra.index.sai.analyzer.filter.BasicFilters;
+import org.apache.cassandra.index.sai.analyzer.filter.FilterPipeline;
+import org.apache.cassandra.index.sai.analyzer.filter.FilterPipelineExecutor;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Analyzer that does *not* tokenize the input. Optionally will
+ * apply filters for the input based on {@link NonTokenizingOptions}.
+ */
+public class NonTokenizingAnalyzer extends AbstractAnalyzer
+{
+ private static final Logger logger =
LoggerFactory.getLogger(NonTokenizingAnalyzer.class);
+
+ private final AbstractType<?> type;
+ private final NonTokenizingOptions options;
+ private final FilterPipeline filterPipeline;
+
+ private ByteBuffer input;
+ private boolean hasNext = false;
+
+ NonTokenizingAnalyzer(AbstractType<?> type, Map<String, String> options)
+ {
+ this(type, NonTokenizingOptions.fromMap(options));
+ }
+
+ NonTokenizingAnalyzer(AbstractType<?> type, NonTokenizingOptions
tokenizerOptions)
+ {
+ this.type = type;
+ this.options = tokenizerOptions;
+ this.filterPipeline = getFilterPipeline();
+ }
+
+ @Override
+ public boolean hasNext()
+ {
+ // check that we know how to handle the input, otherwise bail
+ if (!(type instanceof StringType)) return false;
+
+ if (hasNext)
+ {
+ try
+ {
+ String input = type.getString(this.input);
+
+ if (input == null)
+ {
+ throw new MarshalException(String.format("'null'
deserialized value for %s with %s",
+
ByteBufferUtil.bytesToHex(this.input), type));
+ }
+
+ String result = FilterPipelineExecutor.execute(filterPipeline,
input);
+
+ if (result == null)
+ {
+ nextLiteral = null;
+ next = null;
+ return false;
+ }
+
+ nextLiteral = result;
+ next = type.fromString(result);
+
+ return true;
+ }
+ catch (MarshalException e)
+ {
+ logger.error("Failed to deserialize value with " + type, e);
+ return false;
+ }
+ finally
+ {
+ hasNext = false;
+ }
+ }
+
+ return false;
+ }
+
+ @Override
+ public boolean transformValue()
+ {
+ return !options.isCaseSensitive() || options.isNormalized() ||
options.isAscii();
+ }
+
+ @Override
+ protected void resetInternal(ByteBuffer input)
+ {
+ this.input = input;
+ this.hasNext = true;
+ }
+
+ private FilterPipeline getFilterPipeline()
+ {
+ FilterPipeline builder = new FilterPipeline(new
BasicFilters.NoOperation());
+
+ if (!options.isCaseSensitive())
+ builder = builder.add("to_lower", new BasicFilters.LowerCase());
+
+ if (options.isNormalized())
+ builder = builder.add("normalize", new BasicFilters.Normalize());
+
+ if (options.isAscii())
+ builder = builder.add("ascii", new BasicFilters.Ascii());
+
+ return builder;
+ }
+
+ @Override
+ public String toString()
+ {
+ return MoreObjects.toStringHelper(this)
+ .add("caseSensitive", options.isCaseSensitive())
+ .add("normalized", options.isNormalized())
+ .add("ascii", options.isAscii())
+ .toString();
+ }
+}
diff --git
a/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingOptions.java
b/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingOptions.java
new file mode 100644
index 0000000000..ab6485acf5
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingOptions.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import com.google.common.base.Strings;
+
+import org.apache.cassandra.exceptions.InvalidRequestException;
+
+public class NonTokenizingOptions
+{
+ public static final String NORMALIZE = "normalize";
+ public static final String CASE_SENSITIVE = "case_sensitive";
+ public static final String ASCII = "ascii";
+
+ private boolean caseSensitive;
+ private boolean normalized;
+ private boolean ascii;
+
+ boolean isCaseSensitive()
+ {
+ return caseSensitive;
+ }
+
+ void setCaseSensitive(boolean caseSensitive)
+ {
+ this.caseSensitive = caseSensitive;
+ }
+
+ boolean isNormalized()
+ {
+ return this.normalized;
+ }
+
+ void setAscii(boolean ascii)
+ {
+ this.ascii = ascii;
+ }
+
+ boolean isAscii()
+ {
+ return this.ascii;
+ }
+
+ void setNormalized(boolean normalized)
+ {
+ this.normalized = normalized;
+ }
+
+ public static class OptionsBuilder
+ {
+ private boolean caseSensitive = true;
+ private boolean normalized = false;
+ private boolean ascii = false;
+
+ OptionsBuilder() {}
+
+ OptionsBuilder caseSensitive(boolean caseSensitive)
+ {
+ this.caseSensitive = caseSensitive;
+ return this;
+ }
+
+ OptionsBuilder ascii(boolean ascii)
+ {
+ this.ascii = ascii;
+ return this;
+ }
+
+ OptionsBuilder normalized(boolean normalized)
+ {
+ this.normalized = normalized;
+ return this;
+ }
+
+ public NonTokenizingOptions build()
+ {
+ NonTokenizingOptions options = new NonTokenizingOptions();
+ options.setCaseSensitive(caseSensitive);
+ options.setNormalized(normalized);
+ options.setAscii(ascii);
+ return options;
+ }
+ }
+
+ public static NonTokenizingOptions getDefaultOptions()
+ {
+ return fromMap(new HashMap<>(1));
+ }
+
+ public static NonTokenizingOptions fromMap(Map<String, String> options)
+ {
+ OptionsBuilder builder = new OptionsBuilder();
+
+ for (Map.Entry<String, String> entry : options.entrySet())
+ {
+ switch (entry.getKey())
+ {
+ case CASE_SENSITIVE:
+ {
+ boolean boolValue = validateBoolean(entry.getValue(),
CASE_SENSITIVE);
+ builder = builder.caseSensitive(boolValue);
+ break;
+ }
+
+ case NORMALIZE:
+ {
+ boolean boolValue = validateBoolean(entry.getValue(),
NORMALIZE);
+ builder = builder.normalized(boolValue);
+ break;
+ }
+
+ case ASCII:
+ {
+ boolean boolValue = validateBoolean(entry.getValue(),
ASCII);
+ builder = builder.ascii(boolValue);
+ break;
+ }
+ }
+ }
+ return builder.build();
+ }
+
+ private static boolean validateBoolean(String value, String option)
+ {
+ if (Strings.isNullOrEmpty(value))
+ {
+ throw new InvalidRequestException("Empty value for boolean option
'" + option + '\'');
+ }
+
+ if (!value.equalsIgnoreCase(Boolean.TRUE.toString()) &&
!value.equalsIgnoreCase(Boolean.FALSE.toString()))
+ {
+ throw new InvalidRequestException("Illegal value for boolean
option '" + option + "': " + value);
+ }
+
+ return Boolean.parseBoolean(value);
+ }
+}
diff --git
a/src/java/org/apache/cassandra/index/sai/analyzer/filter/BasicFilters.java
b/src/java/org/apache/cassandra/index/sai/analyzer/filter/BasicFilters.java
new file mode 100644
index 0000000000..b70fbae9da
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/filter/BasicFilters.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer.filter;
+
+import java.text.Normalizer;
+import java.util.Locale;
+
+import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+
+public class BasicFilters
+{
+ private static final Locale DEFAULT_LOCALE = Locale.getDefault();
+
+ public static class LowerCase extends FilterPipeline.Task
+ {
+ private final Locale locale;
+
+ public LowerCase()
+ {
+ this.locale = DEFAULT_LOCALE;
+ }
+
+ @Override
+ public String process(String input)
+ {
+ return input.toLowerCase(locale);
+ }
+ }
+
+ public static class Normalize extends FilterPipeline.Task
+ {
+ public Normalize() { }
+
+ @Override
+ public String process(String input)
+ {
+ if (input == null) return null;
+ return Normalizer.isNormalized(input, Normalizer.Form.NFC) ? input
: Normalizer.normalize(input, Normalizer.Form.NFC);
+ }
+ }
+
+ public static class Ascii extends FilterPipeline.Task
+ {
+ public Ascii() { }
+
+ @Override
+ public String process(String input)
+ {
+ if (input == null) return null;
+ char[] inputChars = input.toCharArray();
+ // The output can (potentially) be 4 times the size of the input
+ char[] outputChars = new char[inputChars.length * 4];
+ int outputSize = ASCIIFoldingFilter.foldToASCII(inputChars, 0,
outputChars, 0, inputChars.length);
+ return new String(outputChars, 0, outputSize);
+ }
+ }
+
+ public static class NoOperation extends FilterPipeline.Task
+ {
+ @Override
+ public String process(String input)
+ {
+ return input;
+ }
+ }
+}
diff --git
a/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipeline.java
b/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipeline.java
new file mode 100644
index 0000000000..017168321b
--- /dev/null
+++
b/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipeline.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer.filter;
+
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * A linked list of {@link Task} objects. Used to apply a sequence of
filtering tasks
+ * to provided textual input in a guaranteed order.
+ */
+@NotThreadSafe
+public class FilterPipeline
+{
+ private final Task head;
+ private Task tail;
+
+ public FilterPipeline(Task first)
+ {
+ this(first, first);
+ }
+
+ private FilterPipeline(Task first, Task tail)
+ {
+ this.head = first;
+ this.tail = tail;
+ }
+
+ public FilterPipeline add(String name, Task task)
+ {
+ Preconditions.checkArgument(task != this.tail, "Provided last task ["
+ task.name + "] cannot be set to itself");
+
+ this.tail.next = task;
+ this.tail.name = name;
+
+ this.tail = task;
+ return this;
+ }
+
+ public Task head()
+ {
+ return this.head;
+ }
+
+ public abstract static class Task
+ {
+ public String name;
+ public Task next;
+
+ public abstract String process(String input);
+ }
+}
diff --git
a/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineExecutor.java
b/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineExecutor.java
new file mode 100644
index 0000000000..c863f1e3cd
--- /dev/null
+++
b/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineExecutor.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer.filter;
+
+/**
+ * Executes all linked {@link FilterPipeline.Task}s serially on the provided
input and returns a result
+ */
+public class FilterPipelineExecutor
+{
+ public static String execute(FilterPipeline pipeline, String initialInput)
+ {
+ FilterPipeline.Task currentTask = pipeline.head();
+ String result = initialInput;
+
+ while (true)
+ {
+ result = currentTask.process(result);
+ currentTask = currentTask.next;
+
+ if (currentTask == null)
+ return result;
+ }
+ }
+}
diff --git
a/src/java/org/apache/cassandra/index/sai/disk/v1/SSTableIndexWriter.java
b/src/java/org/apache/cassandra/index/sai/disk/v1/SSTableIndexWriter.java
index ec1b6a338f..544aae5e66 100644
--- a/src/java/org/apache/cassandra/index/sai/disk/v1/SSTableIndexWriter.java
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/SSTableIndexWriter.java
@@ -79,7 +79,7 @@ public class SSTableIndexWriter implements
PerColumnIndexWriter
{
this.indexDescriptor = indexDescriptor;
this.indexContext = indexContext;
- this.analyzer = indexContext.getIndexAnalyzerFactory().create();
+ this.analyzer = indexContext.getAnalyzerFactory().create();
this.limiter = limiter;
this.isIndexValid = isIndexValid;
this.maxTermSize = indexContext.isFrozen() ? MAX_FROZEN_TERM_SIZE :
MAX_STRING_TERM_SIZE;
diff --git
a/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java
b/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java
index e84a3f9466..0df665d930 100644
--- a/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java
+++ b/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java
@@ -77,7 +77,7 @@ public class TrieMemoryIndex
this.data = new InMemoryTrie<>(TrieMemtable.BUFFER_TYPE);
this.primaryKeysReducer = new PrimaryKeysReducer();
// The use of the analyzer is within a synchronized block so can be
considered thread-safe
- this.analyzerFactory = indexContext.getIndexAnalyzerFactory();
+ this.analyzerFactory = indexContext.getAnalyzerFactory();
this.validator = indexContext.getValidator();
this.isLiteral = TypeUtil.isLiteral(validator);
}
diff --git a/src/java/org/apache/cassandra/index/sai/plan/Expression.java
b/src/java/org/apache/cassandra/index/sai/plan/Expression.java
index a36bf315d5..d33470d515 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/Expression.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/Expression.java
@@ -93,7 +93,7 @@ public class Expression
public Expression(IndexContext indexContext)
{
this.context = indexContext;
- this.analyzerFactory = indexContext.getQueryAnalyzerFactory();
+ this.analyzerFactory = indexContext.getAnalyzerFactory();
this.validator = indexContext.getValidator();
}
diff --git a/src/java/org/apache/cassandra/index/sai/plan/Operation.java
b/src/java/org/apache/cassandra/index/sai/plan/Operation.java
index d911e09277..e9fba9760c 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/Operation.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/Operation.java
@@ -76,7 +76,7 @@ public class Operation
IndexContext indexContext = controller.getContext(e);
List<Expression> perColumn = analyzed.get(e.column());
- AbstractAnalyzer analyzer =
indexContext.getQueryAnalyzerFactory().create();
+ AbstractAnalyzer analyzer =
indexContext.getAnalyzerFactory().create();
try
{
analyzer.reset(e.getIndexValue().duplicate());
diff --git
a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
index 09acca2649..adcf36d157 100644
---
a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
+++
b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
@@ -86,7 +86,7 @@ public class StorageAttachedIndexSearcher implements
Index.Searcher
{
for (RowFilter.Expression expression :
queryController.filterOperation())
{
- AbstractAnalyzer analyzer =
queryController.getContext(expression).getIndexAnalyzerFactory().create();
+ AbstractAnalyzer analyzer =
queryController.getContext(expression).getAnalyzerFactory().create();
try
{
if (analyzer.transformValue())
diff --git
a/src/java/org/apache/cassandra/index/sai/virtual/ColumnIndexesSystemView.java
b/src/java/org/apache/cassandra/index/sai/virtual/ColumnIndexesSystemView.java
index 2a9528db62..39eaa6d7ee 100644
---
a/src/java/org/apache/cassandra/index/sai/virtual/ColumnIndexesSystemView.java
+++
b/src/java/org/apache/cassandra/index/sai/virtual/ColumnIndexesSystemView.java
@@ -104,7 +104,7 @@ public class ColumnIndexesSystemView extends
AbstractVirtualTable
.column(IS_QUERYABLE,
manager.isIndexQueryable(index))
.column(IS_BUILDING,
manager.isIndexBuilding(indexName))
.column(IS_STRING, context.isLiteral())
- .column(ANALYZER,
context.getIndexAnalyzerFactory().toString());
+ .column(ANALYZER,
context.getAnalyzerFactory().toString());
}
}
}
diff --git
a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java
b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java
index 457876ab24..9786a86ae2 100644
---
a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java
+++
b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmerFactory.java
@@ -22,16 +22,29 @@ import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
-import org.apache.cassandra.concurrent.ImmediateExecutor;
-import org.tartarus.snowball.SnowballStemmer;
-import org.tartarus.snowball.ext.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import com.github.benmanes.caffeine.cache.CacheLoader;
import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.cassandra.concurrent.ImmediateExecutor;
+import org.tartarus.snowball.SnowballStemmer;
+import org.tartarus.snowball.ext.DanishStemmer;
+import org.tartarus.snowball.ext.DutchStemmer;
+import org.tartarus.snowball.ext.EnglishStemmer;
+import org.tartarus.snowball.ext.FinnishStemmer;
+import org.tartarus.snowball.ext.FrenchStemmer;
+import org.tartarus.snowball.ext.GermanStemmer;
+import org.tartarus.snowball.ext.HungarianStemmer;
+import org.tartarus.snowball.ext.ItalianStemmer;
+import org.tartarus.snowball.ext.NorwegianStemmer;
+import org.tartarus.snowball.ext.PortugueseStemmer;
+import org.tartarus.snowball.ext.RomanianStemmer;
+import org.tartarus.snowball.ext.RussianStemmer;
+import org.tartarus.snowball.ext.SpanishStemmer;
+import org.tartarus.snowball.ext.SwedishStemmer;
+import org.tartarus.snowball.ext.TurkishStemmer;
/**
* Returns a SnowballStemmer instance appropriate for
@@ -63,21 +76,21 @@ public class StemmerFactory
static
{
SUPPORTED_LANGUAGES = new HashMap<>();
- SUPPORTED_LANGUAGES.put("de", germanStemmer.class);
- SUPPORTED_LANGUAGES.put("da", danishStemmer.class);
- SUPPORTED_LANGUAGES.put("es", spanishStemmer.class);
- SUPPORTED_LANGUAGES.put("en", englishStemmer.class);
- SUPPORTED_LANGUAGES.put("fl", finnishStemmer.class);
- SUPPORTED_LANGUAGES.put("fr", frenchStemmer.class);
- SUPPORTED_LANGUAGES.put("hu", hungarianStemmer.class);
- SUPPORTED_LANGUAGES.put("it", italianStemmer.class);
- SUPPORTED_LANGUAGES.put("nl", dutchStemmer.class);
- SUPPORTED_LANGUAGES.put("no", norwegianStemmer.class);
- SUPPORTED_LANGUAGES.put("pt", portugueseStemmer.class);
- SUPPORTED_LANGUAGES.put("ro", romanianStemmer.class);
- SUPPORTED_LANGUAGES.put("ru", russianStemmer.class);
- SUPPORTED_LANGUAGES.put("sv", swedishStemmer.class);
- SUPPORTED_LANGUAGES.put("tr", turkishStemmer.class);
+ SUPPORTED_LANGUAGES.put("de", GermanStemmer.class);
+ SUPPORTED_LANGUAGES.put("da", DanishStemmer.class);
+ SUPPORTED_LANGUAGES.put("es", SpanishStemmer.class);
+ SUPPORTED_LANGUAGES.put("en", EnglishStemmer.class);
+ SUPPORTED_LANGUAGES.put("fl", FinnishStemmer.class);
+ SUPPORTED_LANGUAGES.put("fr", FrenchStemmer.class);
+ SUPPORTED_LANGUAGES.put("hu", HungarianStemmer.class);
+ SUPPORTED_LANGUAGES.put("it", ItalianStemmer.class);
+ SUPPORTED_LANGUAGES.put("nl", DutchStemmer.class);
+ SUPPORTED_LANGUAGES.put("no", NorwegianStemmer.class);
+ SUPPORTED_LANGUAGES.put("pt", PortugueseStemmer.class);
+ SUPPORTED_LANGUAGES.put("ro", RomanianStemmer.class);
+ SUPPORTED_LANGUAGES.put("ru", RussianStemmer.class);
+ SUPPORTED_LANGUAGES.put("sv", SwedishStemmer.class);
+ SUPPORTED_LANGUAGES.put("tr", TurkishStemmer.class);
}
public static SnowballStemmer getStemmer(Locale locale)
diff --git
a/test/distributed/org/apache/cassandra/distributed/test/sai/ReplicaFilteringProtectionTest.java
b/test/distributed/org/apache/cassandra/distributed/test/sai/ReplicaFilteringProtectionTest.java
new file mode 100644
index 0000000000..95ed4c7946
--- /dev/null
+++
b/test/distributed/org/apache/cassandra/distributed/test/sai/ReplicaFilteringProtectionTest.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.SimpleQueryResult;
+import org.apache.cassandra.distributed.test.TestBaseImpl;
+
+import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL;
+import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows;
+import static org.apache.cassandra.distributed.shared.AssertUtils.row;
+
+/**
+ * Distributed test checking that a query against a case-insensitive SAI index does not
+ * return a stale row that survives on only one replica.
+ */
+public class ReplicaFilteringProtectionTest extends TestBaseImpl
+{
+    private static final int REPLICAS = 2;
+
+    /**
+     * Writes 'OLD' to every replica, overwrites it with 'new' on a single node, and then
+     * expects a query for 'old' at consistency level ALL to return no rows.
+     */
+    @Test
+    public void testRFPWithIndexTransformations() throws IOException
+    {
+        try (Cluster cluster = init(Cluster.build()
+                                           .withNodes(REPLICAS)
+                                           .withConfig(cfg -> cfg.set("hinted_handoff_enabled", false)
+                                                                 .set("commitlog_sync", "batch"))
+                                           .start()))
+        {
+            final String table = KEYSPACE + ".sai_rfp";
+            final String queryByOldValue = String.format("SELECT * FROM %s WHERE v = 'old'", table);
+
+            cluster.schemaChange(String.format("CREATE TABLE %s (k int PRIMARY KEY, v text)", table));
+            cluster.schemaChange(String.format("CREATE CUSTOM INDEX ON %s(v) USING 'StorageAttachedIndex' " +
+                                               "WITH OPTIONS = { 'case_sensitive' : false}", table));
+
+            // Write the original value through the coordinator so that every replica stores it.
+            cluster.coordinator(1).execute(String.format("INSERT INTO %s(k, v) VALUES (0, 'OLD')", table), ALL);
+
+            Object[][] rowsBeforeUpdate = cluster.coordinator(1).execute(queryByOldValue, ALL);
+            assertRows(rowsBeforeUpdate, row(0, "OLD"));
+
+            // Apply the update on node 1 only, leaving the other replica with the old value.
+            cluster.get(1).executeInternal(String.format("UPDATE %s SET v = 'new' WHERE k = 0", table));
+
+            // The stale copy on the other replica must not surface when querying by the old value.
+            SimpleQueryResult rowsAfterUpdate = cluster.coordinator(1).executeWithResult(queryByOldValue, ALL);
+            assertRows(rowsAfterUpdate.toObjectArrays());
+        }
+    }
+}
diff --git
a/test/unit/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzerTest.java
b/test/unit/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzerTest.java
new file mode 100644
index 0000000000..fb73a98eae
--- /dev/null
+++
b/test/unit/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzerTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Unit tests for {@link NonTokenizingAnalyzer} exercising the "ascii" and
+ * "case_sensitive" options on a single UTF-8 term.
+ */
+public class NonTokenizingAnalyzerTest
+{
+    /** With ASCII folding on and case sensitivity off, an accented capital folds to a plain lower-case letter. */
+    @Test
+    public void asciiAnalyzer() throws Exception
+    {
+        NonTokenizingOptions options = NonTokenizingOptions.getDefaultOptions();
+        options.setCaseSensitive(false);
+        options.setAscii(true);
+
+        assertEquals("eppinger", getAnalyzedString("Éppinger", options));
+    }
+
+    /** With ASCII folding off and case sensitivity on, the term passes through unchanged. */
+    @Test
+    public void asciiAnalyzerFalse() throws Exception
+    {
+        NonTokenizingOptions options = NonTokenizingOptions.getDefaultOptions();
+        options.setCaseSensitive(true);
+        options.setAscii(false);
+
+        assertEquals("Éppinger", getAnalyzedString("Éppinger", options));
+    }
+
+    /** Turning case sensitivity off lower-cases the term. */
+    @Test
+    public void caseInsensitiveAnalyzer() throws Exception
+    {
+        NonTokenizingOptions options = NonTokenizingOptions.getDefaultOptions();
+        options.setCaseSensitive(false);
+
+        assertEquals("nip it in the bud", getAnalyzedString("Nip it in the bud", options));
+    }
+
+    /** The default options leave the case of the term untouched. */
+    @Test
+    public void caseSensitiveAnalyzer() throws Exception
+    {
+        NonTokenizingOptions options = NonTokenizingOptions.getDefaultOptions();
+
+        assertEquals("Nip it in the bud", getAnalyzedString("Nip it in the bud", options));
+    }
+
+    /**
+     * Runs {@code input} through a {@link NonTokenizingAnalyzer} configured with {@code options}.
+     *
+     * @param input   the text to analyze
+     * @param options the analyzer options under test
+     * @return the analyzed term, or {@code null} if the analyzer produced nothing
+     */
+    private String getAnalyzedString(String input, NonTokenizingOptions options) throws Exception
+    {
+        NonTokenizingAnalyzer analyzer = new NonTokenizingAnalyzer(UTF8Type.instance, options);
+        // The analyzer is built for UTF8Type, so encode explicitly as UTF-8 rather than relying on the
+        // platform default charset, which would corrupt non-ASCII input such as 'É' on some systems.
+        analyzer.reset(ByteBuffer.wrap(input.getBytes(StandardCharsets.UTF_8)));
+        return analyzer.hasNext() ? ByteBufferUtil.string(analyzer.next) : null;
+    }
+}
diff --git
a/test/unit/org/apache/cassandra/index/sai/analyzer/filter/BasicFiltersTest.java
b/test/unit/org/apache/cassandra/index/sai/analyzer/filter/BasicFiltersTest.java
new file mode 100644
index 0000000000..01faf488c1
--- /dev/null
+++
b/test/unit/org/apache/cassandra/index/sai/analyzer/filter/BasicFiltersTest.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer.filter;
+
+import java.text.Normalizer;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Randomized unit tests comparing each {@link BasicFilters} task against the
+ * library call it is expected to mirror.
+ */
+public class BasicFiltersTest
+{
+    /** {@link BasicFilters.LowerCase} must agree with {@link String#toLowerCase()} on random text. */
+    @Test
+    public void testLowerCase()
+    {
+        BasicFilters.LowerCase lowerCase = new BasicFilters.LowerCase();
+
+        // Draw the iteration count once; evaluating it in the loop condition would re-roll the bound every pass.
+        int iterations = SAITester.getRandom().nextIntBetween(100, 1000);
+        for (int count = 0; count < iterations; count++)
+        {
+            String input = SAITester.getRandom().nextTextString(10, 50);
+            assertEquals(input.toLowerCase(), lowerCase.process(input));
+        }
+    }
+
+    /** {@link BasicFilters.Normalize} must agree with NFC normalization from {@link Normalizer} on random text. */
+    @Test
+    public void testNormalize()
+    {
+        BasicFilters.Normalize normalize = new BasicFilters.Normalize();
+
+        // Draw the iteration count once; evaluating it in the loop condition would re-roll the bound every pass.
+        int iterations = SAITester.getRandom().nextIntBetween(100, 1000);
+        for (int count = 0; count < iterations; count++)
+        {
+            String input = SAITester.getRandom().nextTextString(10, 50);
+            assertEquals(Normalizer.normalize(input, Normalizer.Form.NFC), normalize.process(input));
+        }
+    }
+
+    /** {@link BasicFilters.Ascii} must agree with {@link ASCIIFoldingFilter#foldToASCII} on random text. */
+    @Test
+    public void testAscii()
+    {
+        BasicFilters.Ascii ascii = new BasicFilters.Ascii();
+
+        // Draw the iteration count once; evaluating it in the loop condition would re-roll the bound every pass.
+        int iterations = SAITester.getRandom().nextIntBetween(100, 1000);
+        for (int count = 0; count < iterations; count++)
+        {
+            String input = SAITester.getRandom().nextTextString(100, 5000);
+
+            char[] inputChars = input.toCharArray();
+            // The output buffer is sized at four times the input length to leave room for folded expansions.
+            char[] foldedChars = new char[inputChars.length * 4];
+            int foldedLength = ASCIIFoldingFilter.foldToASCII(inputChars, 0, foldedChars, 0, inputChars.length);
+            String expected = new String(foldedChars, 0, foldedLength);
+
+            assertEquals(expected, ascii.process(input));
+        }
+    }
+}
diff --git
a/test/unit/org/apache/cassandra/index/sai/cql/StorageAttachedIndexDDLTest.java
b/test/unit/org/apache/cassandra/index/sai/cql/StorageAttachedIndexDDLTest.java
index fab116101f..2b621fcfe7 100644
---
a/test/unit/org/apache/cassandra/index/sai/cql/StorageAttachedIndexDDLTest.java
+++
b/test/unit/org/apache/cassandra/index/sai/cql/StorageAttachedIndexDDLTest.java
@@ -264,6 +264,112 @@ public class StorageAttachedIndexDDLTest extends SAITester
assertEquals(1, saiCreationCounter.get());
}
+    /** An index created without options only matches terms with identical casing. */
+    @Test
+    public void shouldBeCaseSensitiveByDefault()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex'");
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Camel')");
+
+        // Same casing as the stored value: one hit.
+        int sameCaseHits = execute("SELECT id FROM %s WHERE val = 'Camel'").size();
+        assertEquals(1, sameCaseHits);
+
+        // Different casing: no hit.
+        int lowerCaseHits = execute("SELECT id FROM %s WHERE val = 'camel'").size();
+        assertEquals(0, lowerCaseHits);
+    }
+
+    /** Explicitly setting 'case_sensitive' to true only matches terms with identical casing. */
+    @Test
+    public void shouldEnableCaseSensitiveSearch()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'case_sensitive' : true }");
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Camel')");
+
+        // Same casing as the stored value: one hit.
+        int sameCaseHits = execute("SELECT id FROM %s WHERE val = 'Camel'").size();
+        assertEquals(1, sameCaseHits);
+
+        // Different casing: no hit.
+        int lowerCaseHits = execute("SELECT id FROM %s WHERE val = 'camel'").size();
+        assertEquals(0, lowerCaseHits);
+    }
+
+    /** Setting 'case_sensitive' to false lets a lower-case term match a mixed-case value. */
+    @Test
+    public void shouldEnableCaseInsensitiveSearch()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'case_sensitive' : false }");
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Camel')");
+
+        int lowerCaseHits = execute("SELECT id FROM %s WHERE val = 'camel'").size();
+        assertEquals(1, lowerCaseHits);
+    }
+
+    /** An index created without options does not apply Unicode normalization. */
+    @Test
+    public void shouldBeNonNormalizedByDefault()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex'");
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Cam\u00E1l')");
+
+        // The precomposed form that was stored matches itself.
+        int precomposedHits = execute("SELECT id FROM %s WHERE val = 'Cam\u00E1l'").size();
+        assertEquals(1, precomposedHits);
+
+        // Both \u00E1 and \u0061\u0301 are visible as the character á, but without NFC normalization, they won't match.
+        int decomposedHits = execute("SELECT id FROM %s WHERE val = 'Cam\u0061\u0301l'").size();
+        assertEquals(0, decomposedHits);
+    }
+
+    /** Explicitly setting 'normalize' to false leaves differently-composed forms unequal. */
+    @Test
+    public void shouldEnableNonNormalizedSearch()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'normalize' : false }");
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Cam\u00E1l')");
+
+        // The precomposed form that was stored matches itself.
+        int precomposedHits = execute("SELECT id FROM %s WHERE val = 'Cam\u00E1l'").size();
+        assertEquals(1, precomposedHits);
+
+        // Both \u00E1 and \u0061\u0301 are visible as the character á, but without NFC normalization, they won't match.
+        int decomposedHits = execute("SELECT id FROM %s WHERE val = 'Cam\u0061\u0301l'").size();
+        assertEquals(0, decomposedHits);
+    }
+
+    /** Setting 'normalize' to true lets a decomposed term match a precomposed value. */
+    @Test
+    public void shouldEnableNormalizedSearch()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'normalize' : true }");
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Cam\u00E1l')");
+
+        int decomposedHits = execute("SELECT id FROM %s WHERE val = 'Cam\u0061\u0301l'").size();
+        assertEquals(1, decomposedHits);
+    }
+
+    /** Normalization and case-insensitivity combine: a lower-case decomposed term matches the stored value. */
+    @Test
+    public void shouldEnableNormalizedCaseInsensitiveSearch()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'normalize' : true, 'case_sensitive' : false}");
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Cam\u00E1l')");
+
+        int lowerCaseDecomposedHits = execute("SELECT id FROM %s WHERE val = 'cam\u0061\u0301l'").size();
+        assertEquals(1, lowerCaseDecomposedHits);
+    }
+
+    /** With 'ascii' on and 'case_sensitive' off, a plain lower-case term matches an accented capitalized value. */
+    @Test
+    public void shouldEnableAsciiSearch()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'ascii' : true, 'case_sensitive' : false}");
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Éppinger')");
+
+        int foldedHits = execute("SELECT id FROM %s WHERE val = 'eppinger'").size();
+        assertEquals(1, foldedHits);
+    }
+
@Test
public void shouldCreateIndexOnReversedType() throws Throwable
{
diff --git
a/test/unit/org/apache/cassandra/index/sai/virtual/IndexesSystemViewTest.java
b/test/unit/org/apache/cassandra/index/sai/virtual/IndexesSystemViewTest.java
index a8f5bf9a58..19d4cbfdf1 100644
---
a/test/unit/org/apache/cassandra/index/sai/virtual/IndexesSystemViewTest.java
+++
b/test/unit/org/apache/cassandra/index/sai/virtual/IndexesSystemViewTest.java
@@ -129,6 +129,6 @@ public class IndexesSystemViewTest extends SAITester
isQueryable,
isBuilding,
isString,
- context.getIndexAnalyzerFactory().toString());
+ context.getAnalyzerFactory().toString());
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]