This is an automated email from the ASF dual-hosted git repository. jackietien pushed a commit to branch iotdb in repository https://gitbox.apache.org/repos/asf/tsfile.git
commit c3437a4193da8bb528d257f7c42d0c701a32b804 Author: Lin Xintao <[email protected]> AuthorDate: Fri Sep 20 10:23:03 2024 +0800 Added an optimized matching framework for standard SQL's LIKE --- .../codegen/templates/FilterOperatorsTemplate.ftl | 175 +++++++++++++- .../java/org/apache/tsfile/common/regexp/DFA.java | 101 ++++++++ .../tsfile/common/regexp/DenseDfaMatcher.java | 215 +++++++++++++++++ .../apache/tsfile/common/regexp/FjsMatcher.java | 223 ++++++++++++++++++ .../apache/tsfile/common/regexp/LikeMatcher.java | 261 +++++++++++++++++++++ .../apache/tsfile/common/regexp/LikePattern.java | 98 ++++++++ .../org/apache/tsfile/common/regexp/Matcher.java | 24 ++ .../java/org/apache/tsfile/common/regexp/NFA.java | 184 +++++++++++++++ .../apache/tsfile/common/regexp/NfaMatcher.java | 176 ++++++++++++++ .../apache/tsfile/common/regexp/pattern/Any.java | 44 ++++ .../tsfile/common/regexp/pattern/Literal.java | 37 +++ .../tsfile/common/regexp/pattern/Pattern.java | 22 ++ .../tsfile/common/regexp/pattern/ZeroOrMore.java | 27 +++ .../apache/tsfile/read/filter/basic/Filter.java | 2 + .../tsfile/read/filter/basic/OperatorType.java | 6 +- .../tsfile/read/filter/factory/ValueFilterApi.java | 68 ++++-- .../org/apache/tsfile/utils/FilterDeserialize.java | 48 ++++ .../java/org/apache/tsfile/utils/RegexUtils.java | 92 -------- .../tsfile/read/filter/BinaryOperatorsTest.java | 28 ++- .../tsfile/read/filter/BooleanOperatorsTest.java | 27 ++- .../tsfile/read/filter/FilterSerializeTest.java | 114 ++++++--- .../tsfile/read/filter/NumericalOperatorsTest.java | 26 +- .../filter/PredicateRemoveNotRewriterTest.java | 40 +++- 23 files changed, 1869 insertions(+), 169 deletions(-) diff --git a/java/tsfile/src/main/codegen/templates/FilterOperatorsTemplate.ftl b/java/tsfile/src/main/codegen/templates/FilterOperatorsTemplate.ftl index b6f7a4c0..ee1ae842 100644 --- a/java/tsfile/src/main/codegen/templates/FilterOperatorsTemplate.ftl +++ b/java/tsfile/src/main/codegen/templates/FilterOperatorsTemplate.ftl @@ -27,6 +27,7 @@ package org.apache.tsfile.read.filter.operator; import static org.apache.tsfile.read.filter.factory.ValueFilterApi.CANNOT_PUSH_DOWN_MSG; import org.apache.tsfile.common.conf.TSFileDescriptor; +import org.apache.tsfile.common.regexp.LikePattern; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.exception.NotImplementedException; import org.apache.tsfile.file.metadata.IMetadata; @@ -50,7 +51,9 @@ import java.util.Set; import java.util.regex.Pattern; import java.util.stream.Collectors; -/* +import static org.apache.tsfile.common.regexp.LikePattern.getEscapeCharacter; + + /* * This class is generated using freemarker and the ${.template_name} template. */ public final class ${className} { @@ -1147,9 +1150,9 @@ public final class ${className} { protected ValueColumnPatternMatchFilter(ByteBuffer buffer) { super(buffer); this.pattern = - Pattern.compile( - Objects.requireNonNull( - ReadWriteIOUtils.readString(buffer), "pattern cannot be null")); + Pattern.compile( + Objects.requireNonNull( + ReadWriteIOUtils.readString(buffer), "pattern cannot be null")); } @Override @@ -1181,7 +1184,7 @@ public final class ${className} { @Override public String toString() { return String.format( - OPERATOR_TO_STRING_FORMAT, measurementIndex, getOperatorType().getSymbol(), pattern); + OPERATOR_TO_STRING_FORMAT, measurementIndex, getOperatorType().getSymbol(), pattern); } } @@ -1202,11 +1205,11 @@ public final class ${className} { @Override public boolean valueSatisfy(${filter.dataType} value) { - <#if filter.dataType == "Binary"> + <#if filter.dataType == "Binary"> return pattern.matcher(new MatcherInput(value.toString(), new AccessCount())).find(); - <#else> + <#else> return pattern.matcher(new MatcherInput(String.valueOf(value), new AccessCount())).find(); - </#if> + </#if> } @Override @@ -1247,11 +1250,11 @@ public final class ${className} { @Override public boolean valueSatisfy(${filter.dataType} value) { - <#if filter.dataType == "Binary"> + <#if filter.dataType == "Binary"> return !pattern.matcher(new MatcherInput(value.toString(), new AccessCount())).find(); - <#else> + <#else> return !pattern.matcher(new MatcherInput(String.valueOf(value), new AccessCount())).find(); - </#if> + </#if> } @Override @@ -1275,6 +1278,156 @@ public final class ${className} { } } + // base class for ValueLike, ValueNotLike + abstract static class ValueColumnPatternLikeMatchFilter extends ${filterName} { + + protected final LikePattern pattern; + + protected ValueColumnPatternLikeMatchFilter(int measurementIndex, LikePattern pattern) { + super(measurementIndex); + this.pattern = Objects.requireNonNull(pattern, "pattern cannot be null"); + } + + protected ValueColumnPatternLikeMatchFilter(ByteBuffer buffer) { + super(buffer); + this.pattern = + LikePattern.compile( + ReadWriteIOUtils.readString(buffer), + ReadWriteIOUtils.readBool(buffer) + ? getEscapeCharacter(Optional.of(ReadWriteIOUtils.readString(buffer))) + : Optional.empty()); + } + + @Override + public void serialize(DataOutputStream outputStream) throws IOException { + super.serialize(outputStream); + ReadWriteIOUtils.write(pattern.getPattern(), outputStream); + if(pattern.getEscape().isPresent()){ + ReadWriteIOUtils.write(true, outputStream); + ReadWriteIOUtils.write(pattern.getEscape().get().toString(), outputStream); + } + else{ + ReadWriteIOUtils.write(false, outputStream); + } + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + if (!super.equals(o)) { + return false; + } + ValueColumnPatternLikeMatchFilter that = (ValueColumnPatternLikeMatchFilter) o; + return pattern.equals(that.pattern); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), pattern); + } + + @Override + public String toString() { + return String.format( + OPERATOR_TO_STRING_FORMAT, measurementIndex, getOperatorType().getSymbol(), pattern); + } + } + + public static final class ValueLike extends ValueColumnPatternLikeMatchFilter { + + public ValueLike(int measurementIndex, LikePattern pattern) { + super(measurementIndex, pattern); + } + + public ValueLike(ByteBuffer buffer) { + super(buffer); + } + + @Override + public boolean valueSatisfy(Object value){ + return pattern.getMatcher().match(value.toString().getBytes()); + } + + @Override + public boolean valueSatisfy(${filter.dataType} value) { + <#if filter.dataType == "Binary"> + return pattern.getMatcher().match(value.toString().getBytes()); + <#else> + return pattern.getMatcher().match(String.valueOf(value).getBytes()); + </#if> + } + + @Override + protected boolean canSkip(Statistics<? extends Serializable> statistics) { + return false; + } + + @Override + protected boolean allSatisfy(Statistics<? extends Serializable> statistics) { + return false; + } + + @Override + public Filter reverse() { + return new ValueNotLike(measurementIndex, pattern); + } + + @Override + public OperatorType getOperatorType() { + return OperatorType.VALUE_LIKE; + } + } + + public static final class ValueNotLike extends ValueColumnPatternLikeMatchFilter { + + public ValueNotLike(int measurementIndex, LikePattern pattern) { + super(measurementIndex, pattern); + } + + public ValueNotLike(ByteBuffer buffer) { + super(buffer); + } + + @Override + public boolean valueSatisfy(Object value){ + return !pattern.getMatcher().match(value.toString().getBytes()); + } + + @Override + public boolean valueSatisfy(${filter.dataType} value) { + <#if filter.dataType == "Binary"> + return !pattern.getMatcher().match(value.toString().getBytes()); + <#else> + return !pattern.getMatcher().match(String.valueOf(value).getBytes()); + </#if> + } + + @Override + protected boolean canSkip(Statistics<? extends Serializable> statistics) { + return false; + } + + @Override + protected boolean allSatisfy(Statistics<? extends Serializable> statistics) { + return false; + } + + @Override + public Filter reverse() { + return new ValueLike(measurementIndex, pattern); + } + + @Override + public OperatorType getOperatorType() { + return OperatorType.VALUE_NOT_LIKE; + } + } + private static class AccessCount { private int count; private final int accessThreshold = diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/DFA.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/DFA.java new file mode 100644 index 00000000..31021a0c --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/DFA.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +public class DFA { + private final int start; + private final ArrayList<Integer> acceptStates; + private final List<List<Transition>> transitions; + + public DFA(int start, ArrayList<Integer> acceptStates, List<List<Transition>> transitions) { + this.start = start; + this.acceptStates = Objects.requireNonNull(acceptStates, "acceptStates is null"); + this.transitions = Collections.unmodifiableList(new ArrayList<>(transitions)); + } + + public List<List<Transition>> getTransitions() { + return transitions; + } + + public ArrayList<Integer> getAcceptStates() { + return acceptStates; + } + + public int getStart() { + return start; + } + + public static class Transition { + private final int value; + private final int target; + + public Transition(int value, int target) { + this.value = value; + this.target = target; + } + + public int getTarget() { + return target; + } + + public int getValue() { + return value; + } + + @Override + public String toString() { + return String.format("-[%s]-> %s", value, target); + } + } + + public static class Builder { + private int nextId; + private int start; + private final ArrayList<Integer> acceptStates = new ArrayList<>(); + private final List<List<Transition>> transitions = new ArrayList<>(); + + public int addState(boolean accept) { + int state = nextId++; + transitions.add(new ArrayList<>()); + if (accept) { + acceptStates.add(state); + } + return state; + } + + public int addStartState(boolean accept) { + start = addState(accept); + return start; + } + + public void addTransition(int from, int value, int to) { + transitions.get(from).add(new Transition(value, to)); + } + + public DFA build() { + return new DFA(start, acceptStates, transitions); + } + } +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/DenseDfaMatcher.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/DenseDfaMatcher.java new file mode 100644 index 00000000..c020ec4f --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/DenseDfaMatcher.java @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp; + +import org.apache.tsfile.common.regexp.pattern.Any; +import org.apache.tsfile.common.regexp.pattern.Literal; +import org.apache.tsfile.common.regexp.pattern.Pattern; +import org.apache.tsfile.common.regexp.pattern.ZeroOrMore; + +import java.util.Arrays; +import java.util.List; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; +import static org.apache.tsfile.utils.Preconditions.checkArgument; + +public class DenseDfaMatcher implements Matcher { + public static final int FAIL_STATE = -1; + + private List<Pattern> pattern; + private int start; + private int end; + private boolean exact; + + private volatile DenseDfa matcher; + + public DenseDfaMatcher(List<Pattern> pattern, int start, int end, boolean exact) { + this.pattern = requireNonNull(pattern, "pattern is null"); + this.start = start; + this.end = end; + this.exact = exact; + } + + @Override + public boolean match(byte[] input, int offset, int length) { + DenseDfa matcher = this.matcher; + if (matcher == null) { + matcher = DenseDfa.newInstance(pattern, start, end); + this.matcher = matcher; + } + + if (exact) { + return matcher.exactMatch(input, offset, length); + } + + return matcher.prefixMatch(input, offset, length); + } + + private static class DenseDfa { + // The DFA is encoded as a sequence of transitions for each possible byte value for each state. + // I.e., 256 transitions per state. + // The content of the transitions array is the base offset into + // the next state to follow. I.e., the desired state * 256 + private final int[] transitions; + + // The starting state + private final int start; + + // For each state, whether it's an accepting state + private final boolean[] accept; + + public static DenseDfa newInstance(List<Pattern> pattern, int start, int end) { + DFA dfa = makeNfa(pattern, start, end).toDfa(); + + int[] transitions = new int[dfa.getTransitions().size() * 256]; + Arrays.fill(transitions, FAIL_STATE); + + for (int state = 0; state < dfa.getTransitions().size(); state++) { + for (DFA.Transition transition : dfa.getTransitions().get(state)) { + transitions[state * 256 + transition.getValue()] = transition.getTarget() * 256; + } + } + boolean[] accept = new boolean[dfa.getTransitions().size()]; + for (int state : dfa.getAcceptStates()) { + accept[state] = true; + } + + return new DenseDfa(transitions, dfa.getStart(), accept); + } + + private DenseDfa(int[] transitions, int start, boolean[] accept) { + this.transitions = transitions; + this.start = start; + this.accept = accept; + } + + /** + * Returns a positive match when the final state after all input has been consumed is an + * accepting state + */ + public boolean exactMatch(byte[] input, int offset, int length) { + int state = start << 8; + for (int i = offset; i < offset + length; i++) { + byte inputByte = input[i]; + state = transitions[state | (inputByte & 0xFF)]; + + if (state == FAIL_STATE) { + return false; + } + } + + return accept[state >>> 8]; + } + + /** + * Returns a positive match as soon as the DFA reaches an accepting state, regardless of whether + * the whole input has been consumed + */ + public boolean prefixMatch(byte[] input, int offset, int length) { + int state = start << 8; + for (int i = offset; i < offset + length; i++) { + byte inputByte = input[i]; + state = transitions[state | (inputByte & 0xFF)]; + + if (state == FAIL_STATE) { + return false; + } + + if (accept[state >>> 8]) { + return true; + } + } + + return accept[state >>> 8]; + } + + private static NFA makeNfa(List<Pattern> pattern, int start, int end) { + checkArgument(!pattern.isEmpty(), "pattern is empty"); + + NFA.Builder builder = new NFA.Builder(); + + int state = builder.addStartState(); + + for (int e = start; e <= end; e++) { + Pattern item = pattern.get(e); + if (item instanceof Literal) { + Literal literal = (Literal) item; + for (byte current : literal.getValue().getBytes(UTF_8)) { + state = matchByte(builder, state, current); + } + } else if (item instanceof Any) { + Any any = (Any) item; + for (int i = 0; i < any.getLength(); i++) { + int next = builder.addState(); + matchSingleUtf8(builder, state, next); + state = next; + } + } else if (item instanceof ZeroOrMore) { + matchSingleUtf8(builder, state, state); + } else { + throw new UnsupportedOperationException("Not supported: " + item.getClass().getName()); + } + } + + builder.setAccept(state); + + return builder.build(); + } + + private static int matchByte(NFA.Builder builder, int state, byte value) { + int next = builder.addState(); + builder.addTransition(state, new NFA.Value(value), next); + return next; + } + + private static void matchSingleUtf8(NFA.Builder builder, int from, int to) { + /* + Implements a state machine to recognize UTF-8 characters. + + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + O ───────────► O ───────────► O ───────────► O ───────────► O + │ ▲ ▲ ▲ + ├─────────────────────────────┘ │ │ + │ 1110xxxx │ │ + │ │ │ + ├────────────────────────────────────────────┘ │ + │ 110xxxxx │ + │ │ + └───────────────────────────────────────────────────────────┘ + 0xxxxxxx + */ + + builder.addTransition(from, new NFA.Prefix(0, 1), to); + + int state1 = builder.addState(); + int state2 = builder.addState(); + int state3 = builder.addState(); + + builder.addTransition(from, new NFA.Prefix(0b11110, 5), state1); + builder.addTransition(from, new NFA.Prefix(0b1110, 4), state2); + builder.addTransition(from, new NFA.Prefix(0b110, 3), state3); + + builder.addTransition(state1, new NFA.Prefix(0b10, 2), state2); + builder.addTransition(state2, new NFA.Prefix(0b10, 2), state3); + builder.addTransition(state3, new NFA.Prefix(0b10, 2), to); + } + } +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/FjsMatcher.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/FjsMatcher.java new file mode 100644 index 00000000..0ef73177 --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/FjsMatcher.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp; + +import org.apache.tsfile.common.regexp.pattern.Any; +import org.apache.tsfile.common.regexp.pattern.Literal; +import org.apache.tsfile.common.regexp.pattern.Pattern; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import static java.util.Objects.requireNonNull; +import static org.apache.tsfile.utils.Preconditions.checkArgument; + +public class FjsMatcher implements Matcher { + private final List<Pattern> pattern; + private final int start; + private final int end; + private final boolean exact; + + private volatile Fjs matcher; + + public FjsMatcher(List<Pattern> pattern, int start, int end, boolean exact) { + this.pattern = requireNonNull(pattern, "pattern is null"); + this.start = start; + this.end = end; + this.exact = exact; + } + + @Override + public boolean match(byte[] input, int offset, int length) { + Fjs matcher = this.matcher; + if (matcher == null) { + matcher = new Fjs(pattern, start, end, exact); + this.matcher = matcher; + } + + return matcher.match(input, offset, length); + } + + private static class Fjs { + private final boolean exact; + private final List<byte[]> patterns = new ArrayList<>(); + private final List<int[]> bmsShifts = new ArrayList<>(); + private final List<int[]> kmpShifts = new ArrayList<>(); + + public Fjs(List<Pattern> pattern, int start, int end, boolean exact) { + this.exact = exact; + + for (int i = start; i <= end; i++) { + Pattern element = pattern.get(i); + + if (element instanceof Literal) { + Literal literal = (Literal) element; + checkArgument( + i == 0 || !(pattern.get(i - 1) instanceof Literal), + "Multiple consecutive literals found"); + byte[] bytes = literal.getValue().getBytes(StandardCharsets.UTF_8); + patterns.add(bytes); + bmsShifts.add(computeBmsShifts(bytes)); + kmpShifts.add(computeKmpShifts(bytes)); + } else if (element instanceof Any) { + throw new IllegalArgumentException("'any' pattern not supported"); + } else if (element == null) { + // Handle null case if necessary + } else { + // Handle default case if necessary + } + } + } + + private static int[] computeKmpShifts(byte[] pattern) { + int[] result = new int[pattern.length + 1]; + result[0] = -1; + + int j = -1; + for (int i = 1; i < result.length; i++) { + while (j >= 0 && pattern[i - 1] != pattern[j]) { + j = result[j]; + } + j++; + result[i] = j; + } + + return result; + } + + private static int[] computeBmsShifts(byte[] pattern) { + int[] result = new int[256]; + + for (int i = 0; i < pattern.length; i++) { + result[pattern[i] & 0xFF] = i + 1; + } + + return result; + } + + private static int find( + byte[] input, + final int offset, + final int length, + byte[] pattern, + int[] bmsShifts, + int[] kmpShifts) { + if (pattern.length > length || pattern.length == 0) { + return -1; + } + + final int inputLimit = offset + length; + + int i = offset; + while (true) { + // Attempt to match the last position of the pattern + // As long as it doesn't match, skip ahead based on the Boyer-Moore-Sunday heuristic + int matchEnd = i + pattern.length - 1; + while (matchEnd < inputLimit - 1 && input[matchEnd] != pattern[pattern.length - 1]) { + int shift = pattern.length + 1 - bmsShifts[input[matchEnd + 1] & 0xFF]; + matchEnd += shift; + } + + if (matchEnd == inputLimit - 1 && match(input, inputLimit - pattern.length, pattern)) { + return inputLimit - pattern.length; + } else if (matchEnd >= inputLimit - 1) { + return -1; + } + + // At this point, we know the last position of the pattern matches with some + // position in the input text given by "matchEnd" + // Use KMP to match the first length-1 characters of the pattern + + i = matchEnd - (pattern.length - 1); + + int j = findLongestMatch(input, i, pattern, 0, pattern.length - 1); + + if (j == pattern.length - 1) { + return i; + } + + i += j; + j = kmpShifts[j]; + + // Continue to match the whole pattern using KMP + while (j >= 0) { + int size = + findLongestMatch(input, i, pattern, j, Math.min(inputLimit - i, pattern.length - j)); + i += size; + j += size; + + if (j == pattern.length) { + return i - j; + } + + j = kmpShifts[j]; + } + + i++; + } + } + + private static int findLongestMatch( + byte[] input, int inputOffset, byte[] pattern, int patternOffset, int length) { + for (int i = 0; i < length; i++) { + if (input[inputOffset + i] != pattern[patternOffset + i]) { + return i; + } + } + + return length; + } + + private static boolean match(byte[] input, int offset, byte[] pattern) { + for (int i = 0; i < pattern.length; i++) { + if (input[offset + i] != pattern[i]) { + return false; + } + } + + return true; + } + + public boolean match(byte[] input, int offset, int length) { + int start = offset; + int remaining = length; + + for (int i = 0; i < patterns.size(); i++) { + if (remaining == 0) { + return false; + } + + byte[] term = patterns.get(i); + + int position = find(input, start, remaining, term, bmsShifts.get(i), kmpShifts.get(i)); + if (position == -1) { + return false; + } + + position += term.length; + remaining -= position - start; + start = position; + } + + return !exact || remaining == 0; + } + } +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/LikeMatcher.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/LikeMatcher.java new file mode 100644 index 00000000..49347c3d --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/LikeMatcher.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp; + +import org.apache.tsfile.common.regexp.pattern.Any; +import org.apache.tsfile.common.regexp.pattern.Literal; +import org.apache.tsfile.common.regexp.pattern.Pattern; +import org.apache.tsfile.common.regexp.pattern.ZeroOrMore; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.OptionalInt; + +import static java.nio.charset.StandardCharsets.UTF_8; + +public class LikeMatcher { + private final int minSize; + private final OptionalInt maxSize; + private final byte[] prefix; + private final byte[] suffix; + private final Optional<Matcher> matcher; + + private LikeMatcher( + int minSize, OptionalInt maxSize, byte[] prefix, byte[] suffix, Optional<Matcher> matcher) { + this.minSize = minSize; + this.maxSize = maxSize; + this.prefix = prefix; + this.suffix = suffix; + this.matcher = matcher; + } + + public static LikeMatcher compile(String pattern) { + return compile(pattern, Optional.empty(), true); + } + + public static LikeMatcher compile(String pattern, Optional<Character> escape) { + return compile(pattern, escape, true); + } + + public static LikeMatcher compile(String pattern, Optional<Character> escape, boolean optimize) { + List<Pattern> parsed = parse(pattern, escape); + + // Calculate minimum and maximum size for candidate strings + // This is used for short-circuiting the match if the size of + // the input is outside those bounds + int minSize = 0; + int maxSize = 0; + boolean unbounded = false; + for (Pattern expression : parsed) { + if (expression instanceof Literal) { + Literal literal = (Literal) expression; + int length = literal.getValue().getBytes(UTF_8).length; + minSize += length; + maxSize += length; + } else if (expression instanceof ZeroOrMore) { + unbounded = true; + } else if (expression instanceof Any) { + Any any = (Any) expression; + int length = any.getLength(); + minSize += length; + maxSize += length * 4; // at most 4 bytes for a single UTF-8 codepoint + } + } + + // Calculate exact match prefix and suffix + // If the pattern starts and ends with a literal, we can perform a quick + // exact match to short-circuit DFA evaluation + byte[] prefix = new byte[0]; + byte[] suffix = new byte[0]; + + int patternStart = 0; + int patternEnd = parsed.size() - 1; + if (parsed.size() > 0 && parsed.get(0) instanceof Literal) { + Literal literal = (Literal) parsed.get(0); + prefix = literal.getValue().getBytes(UTF_8); + patternStart++; + } + + if (parsed.size() > 1 && parsed.get(parsed.size() - 1) instanceof Literal) { + Literal literal = (Literal) parsed.get(parsed.size() - 1); + suffix = literal.getValue().getBytes(UTF_8); + patternEnd--; + } + + // If the pattern (after excluding constant prefix/suffixes) ends with an unbounded match (i.e., + // %) + // we can perform a non-exact match and end as soon as the DFA reaches an accept state -- there + // is no need to consume the remaining input + // This section determines whether the pattern is a candidate for non-exact match. + boolean exact = true; // whether to match to the end of the input + if (patternStart <= patternEnd && parsed.get(patternEnd) instanceof ZeroOrMore) { + // guaranteed to be Any or ZeroOrMore because any Literal would've been turned into a suffix + // above + exact = false; + patternEnd--; + } + + Optional<Matcher> matcher = Optional.empty(); + if (patternStart <= patternEnd) { + boolean hasAny = false; + for (int i = patternStart; i <= patternEnd; i++) { + Pattern item = parsed.get(i); + if (item instanceof Any) { + hasAny = true; + break; + } + } + + if (hasAny) { + if (optimize) { + matcher = Optional.of(new DenseDfaMatcher(parsed, patternStart, patternEnd, exact)); + } else { + matcher = Optional.of(new NfaMatcher(parsed, patternStart, patternEnd, exact)); + } + } else { + matcher = Optional.of(new FjsMatcher(parsed, patternStart, patternEnd, exact)); + } + } + + return new LikeMatcher( + minSize, + unbounded ? OptionalInt.empty() : OptionalInt.of(maxSize), + prefix, + suffix, + matcher); + } + + public boolean match(byte[] input) { + return match(input, 0, input.length); + } + + public boolean match(byte[] input, int offset, int length) { + if (length < minSize) { + return false; + } + + if (maxSize.isPresent() && length > maxSize.getAsInt()) { + return false; + } + + if (!startsWith(prefix, input, offset)) { + return false; + } + + if (!startsWith(suffix, input, offset + length - suffix.length)) { + return false; + } + + if (matcher.isPresent()) { + return matcher + .get() + .match(input, offset + prefix.length, length - suffix.length - prefix.length); + } + + return true; + } + + private boolean startsWith(byte[] pattern, byte[] input, int offset) { + for (int i = 0; i < pattern.length; i++) { + if (pattern[i] != input[offset + i]) { + return false; + } + } + + return true; + } + + static List<Pattern> parse(String pattern, Optional<Character> escape) { + List<Pattern> result = new ArrayList<>(); + + StringBuilder literal = new StringBuilder(); + int anyCount = 0; + boolean anyUnbounded = false; + boolean inEscape = false; + for (int i = 0; i < pattern.length(); i++) { + char character = pattern.charAt(i); + + if (inEscape) { + if (character != '%' && character != '_' && character != escape.get()) { + throw new IllegalArgumentException( + "Escape character must be followed by '%', '_' or the escape character itself"); + } + + literal.append(character); + inEscape = false; + } else if (escape.isPresent() && character == escape.get()) { + inEscape = true; + + if (anyCount != 0) { + result.add(new Any(anyCount)); + anyCount = 0; + } + + if (anyUnbounded) { + result.add(new ZeroOrMore()); + anyUnbounded = false; + } + } else if (character == '%' || character == '_') { + if (literal.length() != 0) { + result.add(new Literal(literal.toString())); + literal.setLength(0); + } + + if (character == '%') { + anyUnbounded = true; + } else { + anyCount++; + } + } else { + if (anyCount != 0) { + result.add(new Any(anyCount)); + anyCount = 0; + } + + if (anyUnbounded) { + result.add(new ZeroOrMore()); + anyUnbounded = false; + } + + literal.append(character); + } + } + + if (inEscape) { + throw new IllegalArgumentException( + "Escape character must be followed by '%', '_' or the escape character itself"); + } + + if (literal.length() != 0) { + result.add(new Literal(literal.toString())); + } else { + if (anyCount != 0) { + result.add(new Any(anyCount)); + } + + if (anyUnbounded) { + result.add(new ZeroOrMore()); + } + } + + return result; + } +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/LikePattern.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/LikePattern.java new file mode 100644 index 00000000..b7633779 --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/LikePattern.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp; + +import java.util.Objects; +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +public class LikePattern { + private final String pattern; + private final Optional<Character> escape; + private final LikeMatcher matcher; + + public static LikePattern compile(String pattern, Optional<Character> escape) { + return new LikePattern(pattern, escape, LikeMatcher.compile(pattern, escape)); + } + + public static LikePattern compile(String pattern, Optional<Character> escape, boolean optimize) { + return new LikePattern(pattern, escape, LikeMatcher.compile(pattern, escape, optimize)); + } + + private LikePattern(String pattern, Optional<Character> escape, LikeMatcher matcher) { + this.pattern = requireNonNull(pattern, "pattern is null"); + this.escape = requireNonNull(escape, "escape is null"); + this.matcher = requireNonNull(matcher, "likeMatcher is null"); + } + + public String getPattern() { + return pattern; + } + + public Optional<Character> getEscape() { + return escape; + } + + public LikeMatcher getMatcher() { + return matcher; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + LikePattern that = (LikePattern) o; + return Objects.equals(pattern, that.pattern) && Objects.equals(escape, that.escape); + } + + @Override + public int hashCode() { + return Objects.hash(pattern, escape); + } + + @Override + public String toString() { + // 既要有pattern,又考虑escape + return "LikePattern{" + + "pattern='" + + pattern + + '\'' + + (escape.map(character -> ", escape=" + character).orElse("")) + + '}'; + } + + public static Optional<Character> getEscapeCharacter(Optional<String> escape) { + if (escape.isPresent()) { + String escapeString = escape.get(); + if (escapeString.length() == 1) { + return Optional.of(escapeString.charAt(0)); + } else { + throw new IllegalArgumentException("Escape string must be a single character"); + } + } else { + return Optional.empty(); + } + } +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/Matcher.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/Matcher.java new file mode 100644 index 00000000..49536933 --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/Matcher.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp; + +public interface Matcher { + boolean match(byte[] input, int offset, int length); +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/NFA.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/NFA.java new file mode 100644 index 00000000..60d0b1e3 --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/NFA.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp; + +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; + +import static java.util.Objects.requireNonNull; + +public class NFA { + private final int start; + private final int accept; + private final List<List<Transition>> transitions; + + private NFA(int start, int accept, List<List<Transition>> transitions) { + this.start = start; + this.accept = accept; + this.transitions = requireNonNull(transitions, "transitions is null"); + } + + public DFA toDfa() { + Map<Set<Integer>, Integer> activeStates = new HashMap<>(); + + DFA.Builder builder = new DFA.Builder(); + + Set<Integer> initial = new HashSet<>(); + initial.add(start); + Queue<Set<Integer>> queue = new ArrayDeque<>(); + queue.add(initial); + + int dfaStartState = builder.addStartState(initial.contains(accept)); + activeStates.put(initial, dfaStartState); + + Set<Set<Integer>> visited = new HashSet<>(); + while (!queue.isEmpty()) { + Set<Integer> current = queue.poll(); + + if (!visited.add(current)) { + continue; + } + + // For each possible byte value... + for (int byteValue = 0; byteValue < 256; byteValue++) { + Set<Integer> next = new HashSet<>(); + for (int nfaState : current) { + for (Transition transition : transitions(nfaState)) { + Condition condition = transition.getCondition(); + int target = transition.getTarget(); + + if (condition instanceof Value) { + Value valueTransition = (Value) condition; + if (valueTransition.getValue() == (byte) byteValue) { + next.add(target); + } + } else if (condition instanceof Prefix) { + Prefix prefixTransition = (Prefix) condition; + if (byteValue >>> (8 - prefixTransition.getBits()) == prefixTransition.getPrefix()) { + next.add(target); + } + } + } + } + + if (!next.isEmpty()) { + int from = activeStates.get(current); + int to = + activeStates.computeIfAbsent( + next, nfaStates -> builder.addState(nfaStates.contains(accept))); + builder.addTransition(from, byteValue, to); + + queue.add(next); + } + } + } + + return builder.build(); + } + + private List<Transition> transitions(int state) { + return transitions.get(state); + } + + public static class Builder { + private int nextId; + private int start; + private int accept; + private final List<List<Transition>> transitions = new ArrayList<>(); + + public int addState() { + transitions.add(new ArrayList<>()); + return nextId++; + } + + public int addStartState() { + start = addState(); + return start; + } + + public void setAccept(int state) { + accept = state; + } + + public void addTransition(int from, Condition condition, int to) { + transitions.get(from).add(new Transition(to, condition)); + } + + public NFA build() { + return new NFA(start, accept, transitions); + } + } + + static class Transition { + private final int target; + private final Condition condition; + + public Transition(int target, Condition condition) { + this.target = target; + this.condition = condition; + } + + public int getTarget() { + return target; + } + + public Condition getCondition() { + return condition; + } + } + + interface Condition {} + + static class Value implements Condition { + private final byte value; + + public Value(byte value) { + this.value = value; + } + + public byte getValue() { + return value; + } + } + + static class Prefix implements Condition { + private final int prefix; + private final int bits; + + public Prefix(int prefix, int bits) { + this.prefix = prefix; + this.bits = bits; + } + + public int getPrefix() { + return prefix; + } + + public int getBits() { + return bits; + } + } +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/NfaMatcher.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/NfaMatcher.java new file mode 100644 index 00000000..c6d6c477 --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/NfaMatcher.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp; + +import org.apache.tsfile.common.regexp.pattern.Any; +import org.apache.tsfile.common.regexp.pattern.Literal; +import org.apache.tsfile.common.regexp.pattern.Pattern; +import org.apache.tsfile.common.regexp.pattern.ZeroOrMore; + +import java.util.Arrays; +import java.util.List; + +public class NfaMatcher implements Matcher { + private static final int ANY = -1; + private static final int NONE = -2; + private static final int INVALID_CODEPOINT = -1; + + private boolean exact; + + private boolean[] loopback; + private int[] match; + private int acceptState; + private int stateCount; + + public NfaMatcher(List<Pattern> pattern, int start, int end, boolean exact) { + this.exact = exact; + + stateCount = calculateStateCount(pattern, start, end); + + loopback = new boolean[stateCount]; + match = new int[stateCount]; + Arrays.fill(match, NONE); + acceptState = stateCount - 1; + + int state = 0; + for (int j = start; j <= end; j++) { + Pattern element = pattern.get(j); + if (element instanceof Literal) { + Literal literal = (Literal) element; + for (int i = 0; i < literal.getValue().length(); i++) { + match[state++] = literal.getValue().charAt(i); + } + } else if (element instanceof Any) { + Any any = (Any) element; + for (int i = 0; i < any.getLength(); i++) { + match[state++] = ANY; + } + } else if (element instanceof ZeroOrMore) { + loopback[state] = true; + } + } + } + + public NfaMatcher() {} + + private static int calculateStateCount(List<Pattern> pattern, int start, int end) { + int states = 1; + for (int i = start; i <= end; i++) { + Pattern element = pattern.get(i); + if (element instanceof Literal) { + Literal literal = (Literal) element; + states += literal.getValue().length(); + } else if (element instanceof Any) { + Any any = (Any) element; + states += any.getLength(); + } + } + return states; + } + + @Override + public boolean match(byte[] input, int offset, int length) { + boolean[] seen = new boolean[stateCount + 1]; + int[] currentStates = new int[stateCount]; + int[] nextStates = new int[stateCount]; + int currentStatesIndex = 0; + int nextStatesIndex; + + currentStates[currentStatesIndex++] = 0; + + int limit = offset + length; + int current = offset; + boolean accept = false; + while (current < limit) { + int codepoint = INVALID_CODEPOINT; + + // decode the next UTF-8 codepoint + int header = input[current] & 0xFF; + if (header < 0x80) { + // normal ASCII + // 0xxx_xxxx + codepoint = header; + current++; + } else if ((header & 0b1110_0000) == 0b1100_0000) { + // 110x_xxxx 10xx_xxxx + if (current + 1 < limit) { + codepoint = ((header & 0b0001_1111) << 6) | (input[current + 1] & 0b0011_1111); + current += 2; + } + } else if ((header & 0b1111_0000) == 0b1110_0000) { + // 1110_xxxx 10xx_xxxx 10xx_xxxx + if (current + 2 < limit) { + codepoint = + ((header & 0b0000_1111) << 12) + | ((input[current + 1] & 0b0011_1111) << 6) + | (input[current + 2] & 0b0011_1111); + current += 3; + } + } else if ((header & 0b1111_1000) == 0b1111_0000) { + // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx + if (current + 3 < limit) { + codepoint = + ((header & 0b0000_0111) << 18) + | ((input[current + 1] & 0b0011_1111) << 12) + | ((input[current + 2] & 0b0011_1111) << 6) + | (input[current + 3] & 0b0011_1111); + current += 4; + } + } + + if (codepoint == INVALID_CODEPOINT) { + return false; + } + + accept = false; + nextStatesIndex = 0; + Arrays.fill(seen, false); + for (int i = 0; i < currentStatesIndex; i++) { + int state = currentStates[i]; + if (!seen[state] && loopback[state]) { + nextStates[nextStatesIndex++] = state; + accept |= state == acceptState; + seen[state] = true; + } + int next = state + 1; + if (!seen[next] && (match[state] == ANY || match[state] == codepoint)) { + nextStates[nextStatesIndex++] = next; + accept |= next == acceptState; + seen[next] = true; + } + } + + if (nextStatesIndex == 0) { + return false; + } + + if (!exact && accept) { + return true; + } + + int[] tmp = currentStates; + currentStates = nextStates; + nextStates = tmp; + currentStatesIndex = nextStatesIndex; + } + + return accept; + } +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/Any.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/Any.java new file mode 100644 index 00000000..8c1e650a --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/Any.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp.pattern; + +public class Any implements Pattern { + private final int length; + + public Any(int length) { + if (length <= 0) { + throw new IllegalArgumentException("Length must be > 0"); + } + this.length = length; + } + + public int getLength() { + return length; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < length; i++) { + sb.append("_"); + } + return sb.toString(); + } +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/Literal.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/Literal.java new file mode 100644 index 00000000..8ea9cf05 --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/Literal.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp.pattern; + +public class Literal implements Pattern { + private final String value; + + public Literal(String value) { + this.value = value; + } + + public String getValue() { + return value; + } + + @Override + public String toString() { + return value; + } +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/Pattern.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/Pattern.java new file mode 100644 index 00000000..959e5a71 --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/Pattern.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp.pattern; + +public interface Pattern {} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/ZeroOrMore.java b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/ZeroOrMore.java new file mode 100644 index 00000000..ac0a0443 --- /dev/null +++ b/java/tsfile/src/main/java/org/apache/tsfile/common/regexp/pattern/ZeroOrMore.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.tsfile.common.regexp.pattern; + +public class ZeroOrMore implements Pattern { + @Override + public String toString() { + return "%"; + } +} diff --git a/java/tsfile/src/main/java/org/apache/tsfile/read/filter/basic/Filter.java b/java/tsfile/src/main/java/org/apache/tsfile/read/filter/basic/Filter.java index 46afc5bc..cc48210d 100755 --- a/java/tsfile/src/main/java/org/apache/tsfile/read/filter/basic/Filter.java +++ b/java/tsfile/src/main/java/org/apache/tsfile/read/filter/basic/Filter.java @@ -230,6 +230,8 @@ public abstract class Filter { case VALUE_NOT_IN: case VALUE_REGEXP: case VALUE_NOT_REGEXP: + case VALUE_LIKE: + case VALUE_NOT_LIKE: case VALUE_BETWEEN_AND: case VALUE_NOT_BETWEEN_AND: return FilterDeserialize.deserializeValueFilter(type, buffer); diff --git a/java/tsfile/src/main/java/org/apache/tsfile/read/filter/basic/OperatorType.java b/java/tsfile/src/main/java/org/apache/tsfile/read/filter/basic/OperatorType.java index a4689696..db8be985 100644 --- a/java/tsfile/src/main/java/org/apache/tsfile/read/filter/basic/OperatorType.java +++ b/java/tsfile/src/main/java/org/apache/tsfile/read/filter/basic/OperatorType.java @@ -51,10 +51,14 @@ public enum OperatorType { TIME_NOT_IN("NOT IN"), VALUE_NOT_IN("NOT IN"), - // pattern match + // regexp pattern match VALUE_REGEXP("MATCH"), VALUE_NOT_REGEXP("NOT MATCH"), + // like pattern match + VALUE_LIKE("MATCH"), + VALUE_NOT_LIKE("NOT MATCH"), + // group by GROUP_BY_TIME("GROUP BY TIME"), GROUP_BY_MONTH("GROUP BY MONTH"), diff --git a/java/tsfile/src/main/java/org/apache/tsfile/read/filter/factory/ValueFilterApi.java b/java/tsfile/src/main/java/org/apache/tsfile/read/filter/factory/ValueFilterApi.java index e99e2ba2..53c4b74c 100644 --- a/java/tsfile/src/main/java/org/apache/tsfile/read/filter/factory/ValueFilterApi.java +++ b/java/tsfile/src/main/java/org/apache/tsfile/read/filter/factory/ValueFilterApi.java @@ -19,6 +19,7 @@ package org.apache.tsfile.read.filter.factory; +import org.apache.tsfile.common.regexp.LikePattern; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.read.filter.basic.Filter; import org.apache.tsfile.read.filter.operator.BinaryFilterOperators; @@ -31,7 +32,6 @@ import org.apache.tsfile.read.filter.operator.StringFilterOperators; import org.apache.tsfile.read.filter.operator.ValueIsNotNullOperator; import org.apache.tsfile.read.filter.operator.ValueIsNullOperator; import org.apache.tsfile.utils.Binary; -import org.apache.tsfile.utils.RegexUtils; import java.util.Objects; import java.util.Set; @@ -282,24 +282,54 @@ public class ValueFilterApi { } } - public static Filter like(int measurementIndex, String regexp, TSDataType type) { - return regexp(measurementIndex, RegexUtils.compileRegex(regexp), type); - } - - public static Filter like(int measurementIndex, Pattern pattern, TSDataType type) { - return regexp(measurementIndex, pattern, type); - } - - public static Filter notLike(int measurementIndex, String regexp, TSDataType type) { - return notRegexp(measurementIndex, RegexUtils.compileRegex(regexp), type); - } - - public static Filter notLike(int measurementIndex, Pattern pattern, TSDataType type) { - return notRegexp(measurementIndex, pattern, type); + public static Filter like(int measurementIndex, LikePattern pattern, TSDataType type) { + Objects.requireNonNull(pattern, CONSTANT_CANNOT_BE_NULL_MSG); + switch (type) { + case BOOLEAN: + return new BooleanFilterOperators.ValueLike(measurementIndex, pattern); + case INT32: + case DATE: + return new IntegerFilterOperators.ValueLike(measurementIndex, pattern); + case INT64: + case TIMESTAMP: + return new LongFilterOperators.ValueLike(measurementIndex, pattern); + case DOUBLE: + return new DoubleFilterOperators.ValueLike(measurementIndex, pattern); + case FLOAT: + return new FloatFilterOperators.ValueLike(measurementIndex, pattern); + case TEXT: + case BLOB: + return new BinaryFilterOperators.ValueLike(measurementIndex, pattern); + case STRING: + return new StringFilterOperators.ValueLike(measurementIndex, pattern); + default: + throw new UnsupportedOperationException("Unsupported data type: " + type); + } } - public static Filter regexp(int measurementIndex, String regexp, TSDataType type) { - return regexp(measurementIndex, RegexUtils.compileRegex(regexp), type); + public static Filter notLike(int measurementIndex, LikePattern pattern, TSDataType type) { + Objects.requireNonNull(pattern, CONSTANT_CANNOT_BE_NULL_MSG); + switch (type) { + case BOOLEAN: + return new BooleanFilterOperators.ValueNotLike(measurementIndex, pattern); + case INT32: + case DATE: + return new IntegerFilterOperators.ValueNotLike(measurementIndex, pattern); + case INT64: + case TIMESTAMP: + return new LongFilterOperators.ValueNotLike(measurementIndex, pattern); + case DOUBLE: + return new DoubleFilterOperators.ValueNotLike(measurementIndex, pattern); + case FLOAT: + return new FloatFilterOperators.ValueNotLike(measurementIndex, pattern); + case TEXT: + case BLOB: + return new BinaryFilterOperators.ValueNotLike(measurementIndex, pattern); + case STRING: + return new StringFilterOperators.ValueNotLike(measurementIndex, pattern); + default: + throw new UnsupportedOperationException("Unsupported data type: " + type); + } } public static Filter regexp(int measurementIndex, Pattern pattern, TSDataType type) { @@ -327,10 +357,6 @@ public class ValueFilterApi { } } - public static Filter notRegexp(int measurementIndex, String regexp, TSDataType type) { - return notRegexp(measurementIndex, RegexUtils.compileRegex(regexp), type); - } - public static Filter notRegexp(int measurementIndex, Pattern pattern, TSDataType type) { Objects.requireNonNull(pattern, CONSTANT_CANNOT_BE_NULL_MSG); switch (type) { diff --git a/java/tsfile/src/main/java/org/apache/tsfile/utils/FilterDeserialize.java b/java/tsfile/src/main/java/org/apache/tsfile/utils/FilterDeserialize.java index 6b48d96b..ec7c7fa4 100644 --- a/java/tsfile/src/main/java/org/apache/tsfile/utils/FilterDeserialize.java +++ b/java/tsfile/src/main/java/org/apache/tsfile/utils/FilterDeserialize.java @@ -60,6 +60,10 @@ public class FilterDeserialize { return FilterDeserialize.deserializeValueRegexpFilter(classSerializeId, buffer); case VALUE_NOT_REGEXP: return FilterDeserialize.deserializeValueNotRegexpFilter(classSerializeId, buffer); + case VALUE_LIKE: + return FilterDeserialize.deserializeValueLikeFilter(classSerializeId, buffer); + case VALUE_NOT_LIKE: + return FilterDeserialize.deserializeValueNotLikeFilter(classSerializeId, buffer); case VALUE_BETWEEN_AND: return FilterDeserialize.deserializeValueBetweenAndFilter(classSerializeId, buffer); case VALUE_NOT_BETWEEN_AND: @@ -334,6 +338,50 @@ public class FilterDeserialize { } } + public static Filter deserializeValueLikeFilter( + ClassSerializeId classSerializeId, ByteBuffer buffer) { + switch (classSerializeId) { + case BOOLEAN: + return new BooleanFilterOperators.ValueLike(buffer); + case INTEGER: + return new IntegerFilterOperators.ValueLike(buffer); + case LONG: + return new LongFilterOperators.ValueLike(buffer); + case FLOAT: + return new FloatFilterOperators.ValueLike(buffer); + case DOUBLE: + return new DoubleFilterOperators.ValueLike(buffer); + case BINARY: + return new BinaryFilterOperators.ValueLike(buffer); + case STRING: + return new StringFilterOperators.ValueLike(buffer); + default: + throw new UnsupportedOperationException(UNSUPPORTED_DATATYPE_MESSAGE + classSerializeId); + } + } + + public static Filter deserializeValueNotLikeFilter( + ClassSerializeId classSerializeId, ByteBuffer buffer) { + switch (classSerializeId) { + case BOOLEAN: + return new BooleanFilterOperators.ValueNotLike(buffer); + case INTEGER: + return new IntegerFilterOperators.ValueNotLike(buffer); + case LONG: + return new LongFilterOperators.ValueNotLike(buffer); + case FLOAT: + return new FloatFilterOperators.ValueNotLike(buffer); + case DOUBLE: + return new DoubleFilterOperators.ValueNotLike(buffer); + case BINARY: + return new BinaryFilterOperators.ValueNotLike(buffer); + case STRING: + return new StringFilterOperators.ValueNotLike(buffer); + default: + throw new UnsupportedOperationException(UNSUPPORTED_DATATYPE_MESSAGE + classSerializeId); + } + } + public static Filter deserializeValueIsNullFilter(ByteBuffer buffer) { return new ValueIsNullOperator(buffer); } diff --git a/java/tsfile/src/main/java/org/apache/tsfile/utils/RegexUtils.java b/java/tsfile/src/main/java/org/apache/tsfile/utils/RegexUtils.java deleted file mode 100644 index 75c62341..00000000 --- a/java/tsfile/src/main/java/org/apache/tsfile/utils/RegexUtils.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.tsfile.utils; - -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -public class RegexUtils { - - private RegexUtils() { - // util class - } - - /** - * The main idea of this part comes from - * https://codereview.stackexchange.com/questions/36861/convert-sql-like-to-regex/36864 - */ - public static String parseLikePatternToRegex(String likePattern) { - String unescapeValue = unescapeString(likePattern); - String specialRegexStr = ".^$*+?{}[]|()"; - StringBuilder patternStrBuild = new StringBuilder(); - patternStrBuild.append("^"); - for (int i = 0; i < unescapeValue.length(); i++) { - String ch = String.valueOf(unescapeValue.charAt(i)); - if (specialRegexStr.contains(ch)) { - ch = "\\" + unescapeValue.charAt(i); - } - if (i == 0 - || !"\\".equals(String.valueOf(unescapeValue.charAt(i - 1))) - || i >= 2 - && "\\\\" - .equals( - patternStrBuild.substring( - patternStrBuild.length() - 2, patternStrBuild.length()))) { - String replaceStr = ch.replace("%", ".*?").replace("_", "."); - patternStrBuild.append(replaceStr); - } else { - patternStrBuild.append(ch); - } - } - patternStrBuild.append("$"); - return patternStrBuild.toString(); - } - - // This Method is for un-escaping strings except '\' before special string '%', '_', '\', because - // we need to use '\' to judge whether to replace this to regexp string - private static String unescapeString(String value) { - StringBuilder stringBuilder = new StringBuilder(); - int curIndex = 0; - while (curIndex < value.length()) { - String ch = String.valueOf(value.charAt(curIndex)); - if ("\\".equals(ch) && curIndex < value.length() - 1) { - String nextChar = String.valueOf(value.charAt(curIndex + 1)); - if ("%".equals(nextChar) || "_".equals(nextChar) || "\\".equals(nextChar)) { - stringBuilder.append(ch); - if ("\\".equals(nextChar)) { - curIndex++; - } - } - } else { - stringBuilder.append(ch); - } - curIndex++; - } - return stringBuilder.toString(); - } - - public static Pattern compileRegex(String regex) { - try { - return Pattern.compile(regex); - } catch (PatternSyntaxException e) { - throw new PatternSyntaxException("Illegal regex expression: ", regex, e.getIndex()); - } - } -} diff --git a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/BinaryOperatorsTest.java b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/BinaryOperatorsTest.java index c4b3828c..9b53e809 100644 --- a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/BinaryOperatorsTest.java +++ b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/BinaryOperatorsTest.java @@ -14,6 +14,7 @@ package org.apache.tsfile.read.filter; +import org.apache.tsfile.common.regexp.LikePattern; import org.apache.tsfile.read.filter.basic.Filter; import org.apache.tsfile.read.filter.factory.ValueFilterApi; import org.apache.tsfile.utils.Binary; @@ -23,6 +24,8 @@ import org.junit.Test; import java.util.Arrays; import java.util.HashSet; +import java.util.Optional; +import java.util.regex.Pattern; import static org.apache.tsfile.common.conf.TSFileConfig.STRING_CHARSET; import static org.apache.tsfile.enums.TSDataType.TEXT; @@ -129,14 +132,35 @@ public class BinaryOperatorsTest { @Test public void testRegexp() { - Filter regexp = ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "a.*", TEXT); + Filter regexp = ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("a.*"), TEXT); Assert.assertTrue(regexp.satisfyBinary(DEFAULT_TIMESTAMP, new Binary("abc", STRING_CHARSET))); Assert.assertFalse(regexp.satisfyBinary(DEFAULT_TIMESTAMP, new Binary("bcd", STRING_CHARSET))); } @Test public void testNotRegexp() { - Filter notRegexp = ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "a.*", TEXT); + Filter notRegexp = + ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("a.*"), TEXT); + Assert.assertTrue( + notRegexp.satisfyBinary(DEFAULT_TIMESTAMP, new Binary("bcd", STRING_CHARSET))); + Assert.assertFalse( + notRegexp.satisfyBinary(DEFAULT_TIMESTAMP, new Binary("abc", STRING_CHARSET))); + } + + @Test + public void testLike() { + Filter regexp = + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, LikePattern.compile("a%", Optional.empty()), TEXT); + Assert.assertTrue(regexp.satisfyBinary(DEFAULT_TIMESTAMP, new Binary("abc", STRING_CHARSET))); + Assert.assertFalse(regexp.satisfyBinary(DEFAULT_TIMESTAMP, new Binary("bcd", STRING_CHARSET))); + } + + @Test + public void testNotLike() { + Filter notRegexp = + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, LikePattern.compile("a%", Optional.empty()), TEXT); Assert.assertTrue( notRegexp.satisfyBinary(DEFAULT_TIMESTAMP, new Binary("bcd", STRING_CHARSET))); Assert.assertFalse( diff --git a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/BooleanOperatorsTest.java b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/BooleanOperatorsTest.java index df9aa28e..bd12e34c 100644 --- a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/BooleanOperatorsTest.java +++ b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/BooleanOperatorsTest.java @@ -14,6 +14,7 @@ package org.apache.tsfile.read.filter; +import org.apache.tsfile.common.regexp.LikePattern; import org.apache.tsfile.read.filter.basic.Filter; import org.apache.tsfile.read.filter.factory.ValueFilterApi; @@ -22,6 +23,8 @@ import org.junit.Test; import java.util.Arrays; import java.util.HashSet; +import java.util.Optional; +import java.util.regex.Pattern; import static org.apache.tsfile.enums.TSDataType.BOOLEAN; import static org.apache.tsfile.read.filter.FilterTestUtil.DEFAULT_TIMESTAMP; @@ -123,14 +126,34 @@ public class BooleanOperatorsTest { @Test public void testRegexp() { - Filter regexp = ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "t.*", BOOLEAN); + Filter regexp = + ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("t.*"), BOOLEAN); Assert.assertTrue(regexp.satisfyBoolean(DEFAULT_TIMESTAMP, true)); Assert.assertFalse(regexp.satisfyBoolean(DEFAULT_TIMESTAMP, false)); } @Test public void testNotRegexp() { - Filter notRegexp = ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "t.*", BOOLEAN); + Filter notRegexp = + ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("t.*"), BOOLEAN); + Assert.assertTrue(notRegexp.satisfyBoolean(DEFAULT_TIMESTAMP, false)); + Assert.assertFalse(notRegexp.satisfyBoolean(DEFAULT_TIMESTAMP, true)); + } + + @Test + public void testLike() { + Filter regexp = + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, LikePattern.compile("t%", Optional.empty()), BOOLEAN); + Assert.assertTrue(regexp.satisfyBoolean(DEFAULT_TIMESTAMP, true)); + Assert.assertFalse(regexp.satisfyBoolean(DEFAULT_TIMESTAMP, false)); + } + + @Test + public void testNotLike() { + Filter notRegexp = + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, LikePattern.compile("t%", Optional.empty()), BOOLEAN); Assert.assertTrue(notRegexp.satisfyBoolean(DEFAULT_TIMESTAMP, false)); Assert.assertFalse(notRegexp.satisfyBoolean(DEFAULT_TIMESTAMP, true)); } diff --git a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/FilterSerializeTest.java b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/FilterSerializeTest.java index 00a9b565..bf2aa134 100644 --- a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/FilterSerializeTest.java +++ b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/FilterSerializeTest.java @@ -18,6 +18,7 @@ */ package org.apache.tsfile.read.filter; +import org.apache.tsfile.common.regexp.LikePattern; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.read.filter.basic.Filter; import org.apache.tsfile.read.filter.factory.FilterFactory; @@ -34,8 +35,10 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Arrays; import java.util.HashSet; +import java.util.Optional; import java.util.TimeZone; import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; import static org.apache.tsfile.common.conf.TSFileConfig.STRING_CHARSET; import static org.apache.tsfile.read.filter.factory.ValueFilterApi.DEFAULT_MEASUREMENT_INDEX; @@ -139,10 +142,18 @@ public class FilterSerializeTest { ValueFilterApi.notEq(DEFAULT_MEASUREMENT_INDEX, true, TSDataType.BOOLEAN), ValueFilterApi.between(DEFAULT_MEASUREMENT_INDEX, false, true, TSDataType.BOOLEAN), ValueFilterApi.notBetween(DEFAULT_MEASUREMENT_INDEX, false, true, TSDataType.BOOLEAN), - ValueFilterApi.like(DEFAULT_MEASUREMENT_INDEX, "true", TSDataType.BOOLEAN), - ValueFilterApi.notLike(DEFAULT_MEASUREMENT_INDEX, "true", TSDataType.BOOLEAN), - ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "true", TSDataType.BOOLEAN), - ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "true", TSDataType.BOOLEAN), + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("true", Optional.empty()), + TSDataType.BOOLEAN), + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("true", Optional.empty()), + TSDataType.BOOLEAN), + ValueFilterApi.regexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("true"), TSDataType.BOOLEAN), + ValueFilterApi.notRegexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("true"), TSDataType.BOOLEAN), ValueFilterApi.in( DEFAULT_MEASUREMENT_INDEX, new HashSet<>(Arrays.asList(true, false)), @@ -169,10 +180,18 @@ public class FilterSerializeTest { ValueFilterApi.notEq(DEFAULT_MEASUREMENT_INDEX, 100, TSDataType.INT32), ValueFilterApi.between(DEFAULT_MEASUREMENT_INDEX, 100, 200, TSDataType.INT32), ValueFilterApi.notBetween(DEFAULT_MEASUREMENT_INDEX, 100, 200, TSDataType.INT32), - ValueFilterApi.like(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.INT32), - ValueFilterApi.notLike(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.INT32), - ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.INT32), - ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.INT32), + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("1%", Optional.empty()), + TSDataType.INT32), + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("1%", Optional.empty()), + TSDataType.INT32), + ValueFilterApi.regexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("1.*"), TSDataType.INT32), + ValueFilterApi.notRegexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("1.*"), TSDataType.INT32), ValueFilterApi.in( DEFAULT_MEASUREMENT_INDEX, new HashSet<>(Arrays.asList(100, 200)), TSDataType.INT32), ValueFilterApi.notIn( @@ -195,10 +214,18 @@ public class FilterSerializeTest { ValueFilterApi.notEq(DEFAULT_MEASUREMENT_INDEX, 100L, TSDataType.INT64), ValueFilterApi.between(DEFAULT_MEASUREMENT_INDEX, 100L, 200L, TSDataType.INT64), ValueFilterApi.notBetween(DEFAULT_MEASUREMENT_INDEX, 100L, 200L, TSDataType.INT64), - ValueFilterApi.like(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.INT64), - ValueFilterApi.notLike(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.INT64), - ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.INT64), - ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.INT64), + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("1%", Optional.empty()), + TSDataType.INT64), + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("1%", Optional.empty()), + TSDataType.INT64), + ValueFilterApi.regexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("1.*"), TSDataType.INT64), + ValueFilterApi.notRegexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("1.*"), TSDataType.INT64), ValueFilterApi.in( DEFAULT_MEASUREMENT_INDEX, new HashSet<>(Arrays.asList(100L, 200L)), @@ -225,10 +252,18 @@ public class FilterSerializeTest { ValueFilterApi.notEq(DEFAULT_MEASUREMENT_INDEX, 100.5f, TSDataType.FLOAT), ValueFilterApi.between(DEFAULT_MEASUREMENT_INDEX, 100.5f, 200.5f, TSDataType.FLOAT), ValueFilterApi.notBetween(DEFAULT_MEASUREMENT_INDEX, 100.5f, 200.5f, TSDataType.FLOAT), - ValueFilterApi.like(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.FLOAT), - ValueFilterApi.notLike(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.FLOAT), - ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.FLOAT), - ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.FLOAT), + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("1%", Optional.empty()), + TSDataType.FLOAT), + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("1%", Optional.empty()), + TSDataType.FLOAT), + ValueFilterApi.regexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("1.*"), TSDataType.FLOAT), + ValueFilterApi.notRegexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("1.*"), TSDataType.FLOAT), ValueFilterApi.in( DEFAULT_MEASUREMENT_INDEX, new HashSet<>(Arrays.asList(100.5f, 200.5f)), @@ -255,10 +290,18 @@ public class FilterSerializeTest { ValueFilterApi.notEq(DEFAULT_MEASUREMENT_INDEX, 100.5d, TSDataType.DOUBLE), ValueFilterApi.between(DEFAULT_MEASUREMENT_INDEX, 100.5d, 200.5d, TSDataType.DOUBLE), ValueFilterApi.notBetween(DEFAULT_MEASUREMENT_INDEX, 100.5d, 200.5d, TSDataType.DOUBLE), - ValueFilterApi.like(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.DOUBLE), - ValueFilterApi.notLike(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.DOUBLE), - ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.DOUBLE), - ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "1.*", TSDataType.DOUBLE), + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("1%", Optional.empty()), + TSDataType.DOUBLE), + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("1%", Optional.empty()), + TSDataType.DOUBLE), + ValueFilterApi.regexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("1.*"), TSDataType.DOUBLE), + ValueFilterApi.notRegexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("1.*"), TSDataType.DOUBLE), ValueFilterApi.in( DEFAULT_MEASUREMENT_INDEX, new HashSet<>(Arrays.asList(100.5d, 200.5d)), @@ -299,10 +342,17 @@ public class FilterSerializeTest { new Binary("test", STRING_CHARSET), new Binary("string", STRING_CHARSET), TSDataType.TEXT), - ValueFilterApi.like(DEFAULT_MEASUREMENT_INDEX, "t.*", TSDataType.TEXT), - ValueFilterApi.notLike(DEFAULT_MEASUREMENT_INDEX, "t.*", TSDataType.TEXT), - ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "t.*", TSDataType.TEXT), - ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "t.*", TSDataType.TEXT), + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("t%", Optional.empty()), + TSDataType.TEXT), + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("t%", Optional.empty()), + TSDataType.TEXT), + ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("t.*"), TSDataType.TEXT), + ValueFilterApi.notRegexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("t.*"), TSDataType.TEXT), ValueFilterApi.in( DEFAULT_MEASUREMENT_INDEX, new HashSet<>( @@ -347,10 +397,18 @@ public class FilterSerializeTest { new Binary("test", STRING_CHARSET), new Binary("string", STRING_CHARSET), TSDataType.STRING), - ValueFilterApi.like(DEFAULT_MEASUREMENT_INDEX, "t.*", TSDataType.STRING), - ValueFilterApi.notLike(DEFAULT_MEASUREMENT_INDEX, "t.*", TSDataType.STRING), - ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "t.*", TSDataType.STRING), - ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "t.*", TSDataType.STRING), + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("t%", Optional.empty()), + TSDataType.STRING), + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("t%", Optional.empty()), + TSDataType.STRING), + ValueFilterApi.regexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("t.*"), TSDataType.STRING), + ValueFilterApi.notRegexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("t.*"), TSDataType.STRING), ValueFilterApi.in( DEFAULT_MEASUREMENT_INDEX, new HashSet<>( diff --git a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/NumericalOperatorsTest.java b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/NumericalOperatorsTest.java index deddb978..13a8369a 100644 --- a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/NumericalOperatorsTest.java +++ b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/NumericalOperatorsTest.java @@ -14,6 +14,7 @@ package org.apache.tsfile.read.filter; +import org.apache.tsfile.common.regexp.LikePattern; import org.apache.tsfile.read.filter.basic.Filter; import org.apache.tsfile.read.filter.factory.ValueFilterApi; @@ -22,6 +23,8 @@ import org.junit.Test; import java.util.Arrays; import java.util.HashSet; +import java.util.Optional; +import java.util.regex.Pattern; import static org.apache.tsfile.enums.TSDataType.INT32; import static org.apache.tsfile.read.filter.FilterTestUtil.DEFAULT_TIMESTAMP; @@ -113,14 +116,33 @@ public class NumericalOperatorsTest { @Test public void testRegexp() { - Filter regexp = ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "1.*", INT32); + Filter regexp = ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("1.*"), INT32); Assert.assertTrue(regexp.satisfyInteger(DEFAULT_TIMESTAMP, 10)); Assert.assertFalse(regexp.satisfyInteger(DEFAULT_TIMESTAMP, 20)); } @Test public void testNotRegexp() { - Filter notRegexp = ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "1.*", INT32); + Filter notRegexp = + ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("1.*"), INT32); + Assert.assertTrue(notRegexp.satisfyInteger(DEFAULT_TIMESTAMP, 20)); + Assert.assertFalse(notRegexp.satisfyInteger(DEFAULT_TIMESTAMP, 10)); + } + + @Test + public void testLike() { + Filter regexp = + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, LikePattern.compile("1%", Optional.empty()), INT32); + Assert.assertTrue(regexp.satisfyInteger(DEFAULT_TIMESTAMP, 10)); + Assert.assertFalse(regexp.satisfyInteger(DEFAULT_TIMESTAMP, 20)); + } + + @Test + public void testNotLike() { + Filter notRegexp = + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, LikePattern.compile("1%", Optional.empty()), INT32); Assert.assertTrue(notRegexp.satisfyInteger(DEFAULT_TIMESTAMP, 20)); Assert.assertFalse(notRegexp.satisfyInteger(DEFAULT_TIMESTAMP, 10)); } diff --git a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/PredicateRemoveNotRewriterTest.java b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/PredicateRemoveNotRewriterTest.java index 249b450f..62656d5c 100644 --- a/java/tsfile/src/test/java/org/apache/tsfile/read/filter/PredicateRemoveNotRewriterTest.java +++ b/java/tsfile/src/test/java/org/apache/tsfile/read/filter/PredicateRemoveNotRewriterTest.java @@ -20,6 +20,7 @@ package org.apache.tsfile.read.filter; import org.apache.tsfile.common.conf.TSFileConfig; +import org.apache.tsfile.common.regexp.LikePattern; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.read.filter.factory.FilterFactory; import org.apache.tsfile.read.filter.factory.TimeFilterApi; @@ -31,6 +32,8 @@ import org.junit.Test; import java.util.Arrays; import java.util.HashSet; +import java.util.Optional; +import java.util.regex.Pattern; import static org.apache.tsfile.read.filter.factory.ValueFilterApi.DEFAULT_MEASUREMENT_INDEX; @@ -45,17 +48,33 @@ public class PredicateRemoveNotRewriterTest { Assert.assertEquals(TimeFilterApi.eq(1), TimeFilterApi.notEq(1).reverse()); Assert.assertEquals(TimeFilterApi.notEq(1), TimeFilterApi.eq(1).reverse()); Assert.assertEquals( - ValueFilterApi.like(DEFAULT_MEASUREMENT_INDEX, "s*", TSDataType.TEXT), - ValueFilterApi.notLike(DEFAULT_MEASUREMENT_INDEX, "s*", TSDataType.TEXT).reverse()); + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("s%", Optional.empty()), + TSDataType.TEXT), + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("s%", Optional.empty()), + TSDataType.TEXT) + .reverse()); Assert.assertEquals( - ValueFilterApi.notLike(DEFAULT_MEASUREMENT_INDEX, "s*", TSDataType.TEXT), - ValueFilterApi.like(DEFAULT_MEASUREMENT_INDEX, "s*", TSDataType.TEXT).reverse()); + ValueFilterApi.notLike( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("s%", Optional.empty()), + TSDataType.TEXT), + ValueFilterApi.like( + DEFAULT_MEASUREMENT_INDEX, + LikePattern.compile("s%", Optional.empty()), + TSDataType.TEXT) + .reverse()); Assert.assertEquals( - ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "s*", TSDataType.TEXT), - ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "s*", TSDataType.TEXT).reverse()); + ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("s*"), TSDataType.TEXT), + ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("s*"), TSDataType.TEXT) + .reverse()); Assert.assertEquals( - ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, "s*", TSDataType.TEXT), - ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, "s*", TSDataType.TEXT).reverse()); + ValueFilterApi.notRegexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("s*"), TSDataType.TEXT), + ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("s*"), TSDataType.TEXT) + .reverse()); Assert.assertEquals(TimeFilterApi.between(1, 100), TimeFilterApi.notBetween(1, 100).reverse()); Assert.assertEquals(TimeFilterApi.notBetween(1, 100), TimeFilterApi.between(1, 100).reverse()); // TODO: add StringFilter test @@ -122,10 +141,11 @@ public class PredicateRemoveNotRewriterTest { TimeFilterApi.ltEq(1), PredicateRemoveNotRewriter.rewrite(FilterFactory.not(TimeFilterApi.gt(1)))); Assert.assertEquals( - ValueFilterApi.like(DEFAULT_MEASUREMENT_INDEX, "s*", TSDataType.TEXT), + ValueFilterApi.regexp(DEFAULT_MEASUREMENT_INDEX, Pattern.compile("s*"), TSDataType.TEXT), PredicateRemoveNotRewriter.rewrite( FilterFactory.not( - ValueFilterApi.notLike(DEFAULT_MEASUREMENT_INDEX, "s*", TSDataType.TEXT)))); + ValueFilterApi.notRegexp( + DEFAULT_MEASUREMENT_INDEX, Pattern.compile("s*"), TSDataType.TEXT)))); Assert.assertEquals( FilterFactory.or(TimeFilterApi.gt(1), TimeFilterApi.ltEq(1)), PredicateRemoveNotRewriter.rewrite(
