This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch chunking in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 4e1d317dcef8ba89de53267961ed2f286da1cd6f Author: Gerben <[email protected]> AuthorDate: Wed Sep 16 15:03:00 2020 +0200 Support matching across chunks --- packages/dom/src/text-quote/match.ts | 83 +++++++++++++++++++++++------- packages/dom/test/text-quote/match.test.ts | 8 +-- 2 files changed, 69 insertions(+), 22 deletions(-) diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts index 6b7fd93..c3fb43d 100644 --- a/packages/dom/src/text-quote/match.ts +++ b/packages/dom/src/text-quote/match.ts @@ -54,36 +54,83 @@ interface AbstractRange<TChunk> { export function abstractTextQuoteSelectorMatcher( selector: TextQuoteSelector, ): <TChunk extends Chunk>(textChunks: AsyncIterable<TChunk>) => AsyncGenerator<AbstractRange<TChunk>, void, void> { - return async function* matchAll(textChunks) { + return async function* matchAll<TChunk extends Chunk>(textChunks: AsyncIterable<TChunk>) { const exact = selector.exact; const prefix = selector.prefix || ''; const suffix = selector.suffix || ''; const searchPattern = prefix + exact + suffix; + let partialMatches: Array<{ + startChunk: TChunk; + startIndex: number; + charactersMatched: number; + }> = []; + for await (const chunk of textChunks) { const chunkValue = chunk.toString(); - // Find the pattern in the chunk (possibly multiple times) - // TODO allow pattern to be spread across chunks - let fromIndex = 0; - while (fromIndex <= chunkValue.length) { - const patternStartIndex = chunkValue.indexOf(searchPattern, fromIndex); - if (patternStartIndex === -1) break; + // Continue checking any partial matches from the previous chunk(s). + const remainingPartialMatches: typeof partialMatches = []; + for (const { startChunk, startIndex, charactersMatched } of partialMatches) { + if (searchPattern.length - charactersMatched > chunkValue.length) { + if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) { + // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks. + remainingPartialMatches.push({ + startChunk, + startIndex, + charactersMatched: charactersMatched + chunkValue.length, + }); + } + } + else if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) { + yield { + startChunk, + startIndex, + endChunk: chunk, + endIndex: searchPattern.length - charactersMatched, + }; + } + } + partialMatches = remainingPartialMatches; + + // Try find the whole pattern in the chunk (possibly multiple times). + if (searchPattern.length <= chunkValue.length) { + let fromIndex = 0; + while (fromIndex <= chunkValue.length) { + const patternStartIndex = chunkValue.indexOf(searchPattern, fromIndex); + if (patternStartIndex === -1) break; - // Correct for the prefix and suffix lengths. - const matchStartIndex = patternStartIndex + prefix.length; - const matchEndIndex = matchStartIndex + exact.length; + // Correct for the prefix and suffix lengths. + const matchStartIndex = patternStartIndex + prefix.length; + const matchEndIndex = matchStartIndex + exact.length; - yield { - startChunk: chunk, - startIndex: matchStartIndex, - endChunk: chunk, - endIndex: matchEndIndex, - }; + yield { + startChunk: chunk, + startIndex: matchStartIndex, + endChunk: chunk, + endIndex: matchEndIndex, + }; + + // Advance the search forward to detect multiple occurrences within the same chunk. + fromIndex = matchStartIndex + 1; + } + } - // Advance the search forward to detect multiple occurrences within the same chunk. - fromIndex = matchStartIndex + 1; + // Check if this chunk ends with a partial match (or even multiple partial matches). + let newPartialMatches: number[] = []; + const searchStartPoint = Math.max(chunkValue.length - searchPattern.length + 1, 0); + for (let i = searchStartPoint; i < chunkValue.length; i++) { + const character = chunkValue[i]; + newPartialMatches = newPartialMatches.filter( + partialMatchStartIndex => (character === searchPattern[i - partialMatchStartIndex]) + ); + if (character === searchPattern[0]) newPartialMatches.push(i); } + newPartialMatches.forEach(partialMatchStartIndex => partialMatches.push({ + startChunk: chunk, + startIndex: partialMatchStartIndex, + charactersMatched: chunkValue.length - partialMatchStartIndex, + })); } }; } diff --git a/packages/dom/test/text-quote/match.test.ts b/packages/dom/test/text-quote/match.test.ts index 7d52e15..fe8d36b 100644 --- a/packages/dom/test/text-quote/match.test.ts +++ b/packages/dom/test/text-quote/match.test.ts @@ -60,8 +60,8 @@ describe('createTextQuoteSelectorMatcher', () => { { startContainerXPath: '//b/text()[13]', startOffset: 0, - endContainerXPath: '//b/text()[21]', - endOffset: 0, + endContainerXPath: '//b/text()[20]', + endOffset: 1, }, ]); }); @@ -89,8 +89,8 @@ describe('createTextQuoteSelectorMatcher', () => { { startContainerXPath: '//b/text()[4]', // "dolor" startOffset: 0, - endContainerXPath: '//b/text()[8]', // "et yada yada" - endOffset: 0, + endContainerXPath: '//b/text()[6]', // " am" + endOffset: 3, }, ]); });
