This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch import-dom-seek in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 0b3a9f6965bc4d3f2afbf666c0d44413a67d8663 Author: Gerben <[email protected]> AuthorDate: Thu Nov 5 16:22:45 2020 +0100 Make text quote search chunk by chunk This allows a document to be provided piecemeal. The behaviour changes slightly for matches that end at the end of a node/chunk: we no longer put the match at the start of the next node, in order to avoid needlessly asking the document for new chunks. --- packages/dom/src/text-quote/match.ts | 133 +++++++++++++++++++++------- packages/dom/test/text-quote/match-cases.ts | 8 +- packages/dom/test/text-quote/match.test.ts | 8 +- 3 files changed, 108 insertions(+), 41 deletions(-) diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts index 0b3d3af..d18fbf8 100644 --- a/packages/dom/src/text-quote/match.ts +++ b/packages/dom/src/text-quote/match.ts @@ -19,57 +19,124 @@ */ import type { Matcher, TextQuoteSelector } from '@annotator/selector'; -import { ownerDocument } from '../owner-document'; -import { DomSeeker } from '../seek'; +import { TextNodeChunker, Chunk, Chunker } from '../chunker'; + +export interface ChunkRange<TChunk extends Chunk<any>> { + startChunk: TChunk; + startIndex: number; + endChunk: TChunk; + endIndex: number; +} export function createTextQuoteSelectorMatcher( selector: TextQuoteSelector, ): Matcher<Range, Range> { + const abstractMatcher = abstractTextQuoteSelectorMatcher(selector); return async function* matchAll(scope) { - const document = ownerDocument(scope); - const scopeText = scope.toString(); + const textChunks = new TextNodeChunker(scope); + + for await (const abstractMatch of abstractMatcher(textChunks)) { + const match = document.createRange(); + // The `+…startOffset` parts are only relevant for the first chunk, as it + // might start within a text node. + match.setStart(abstractMatch.startChunk.node, + abstractMatch.startIndex + abstractMatch.startChunk.startOffset); + match.setEnd(abstractMatch.endChunk.node, + abstractMatch.endIndex + abstractMatch.endChunk.startOffset); + yield match; + } + } +} +type AbstractMatcher<TChunk extends Chunk<string>> = + Matcher<Chunker<TChunk>, ChunkRange<TChunk>> + +export function abstractTextQuoteSelectorMatcher( + selector: TextQuoteSelector, +): AbstractMatcher<any> { + return async function* matchAll<TChunk extends Chunk<string>>(textChunks: Chunker<TChunk>) { const exact = selector.exact; const prefix = selector.prefix || ''; const suffix = selector.suffix || ''; const searchPattern = prefix + exact + suffix; - let seeker: DomSeeker; - try { - seeker = new DomSeeker(scope); - } catch (error) { - // If the scope does not contain text nodes, we can stop. (if it contains - // only empty text nodes we continue: it would still match an empty quote) - if (error instanceof RangeError) return; - else throw error; - } + let partialMatches: Array<{ + startChunk: TChunk; + startIndex: number; + charactersMatched: number; + }> = []; - let fromIndex = 0; - while (fromIndex <= scopeText.length) { - // Find the quote with its prefix and suffix in the string. - const patternStartIndex = scopeText.indexOf(searchPattern, fromIndex); - if (patternStartIndex === -1) return; + let chunk: TChunk | null; + while (chunk = textChunks.currentChunk) { + const chunkValue = chunk.data; - // Correct for the prefix and suffix lengths. - const matchStartIndex = patternStartIndex + prefix.length; - const matchEndIndex = matchStartIndex + exact.length; + // Continue checking any partial matches from the previous chunk(s). + const remainingPartialMatches: typeof partialMatches = []; + for (const { startChunk, startIndex, charactersMatched } of partialMatches) { + if (searchPattern.length - charactersMatched > chunkValue.length) { + if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) { + // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks. + remainingPartialMatches.push({ + startChunk, + startIndex, + charactersMatched: charactersMatched + chunkValue.length, + }); + } + } + else if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) { + yield { + startChunk, + startIndex, + endChunk: chunk, + endIndex: searchPattern.length - charactersMatched, + }; + } + } + partialMatches = remainingPartialMatches; - // Create a range to represent this exact quote in the dom. - const match = document.createRange(); + // Try find the whole pattern in the chunk (possibly multiple times). + if (searchPattern.length <= chunkValue.length) { + let fromIndex = 0; + while (fromIndex <= chunkValue.length) { + const patternStartIndex = chunkValue.indexOf(searchPattern, fromIndex); + if (patternStartIndex === -1) break; - // Seek to the start of the match, make the range start there. - seeker.seekTo(matchStartIndex); - match.setStart(seeker.referenceNode, seeker.offsetInReferenceNode); + // Correct for the prefix and suffix lengths. + const matchStartIndex = patternStartIndex + prefix.length; + const matchEndIndex = matchStartIndex + exact.length; - // Seek to the end of the match, make the range end there. - seeker.seekTo(matchEndIndex); - match.setEnd(seeker.referenceNode, seeker.offsetInReferenceNode); + yield { + startChunk: chunk, + startIndex: matchStartIndex, + endChunk: chunk, + endIndex: matchEndIndex, + }; - // Yield the match. - yield match; + // Advance the search forward to detect multiple occurrences within the same chunk. + fromIndex = matchStartIndex + 1; + } + } + + // Check if this chunk ends with a partial match (or even multiple partial matches). + let newPartialMatches: number[] = []; + const searchStartPoint = Math.max(chunkValue.length - searchPattern.length + 1, 0); + for (let i = searchStartPoint; i < chunkValue.length; i++) { + const character = chunkValue[i]; + newPartialMatches = newPartialMatches.filter( + partialMatchStartIndex => (character === searchPattern[i - partialMatchStartIndex]) + ); + if (character === searchPattern[0]) newPartialMatches.push(i); + } + for (const partialMatchStartIndex of newPartialMatches) { + partialMatches.push({ + startChunk: chunk, + startIndex: partialMatchStartIndex, + charactersMatched: chunkValue.length - partialMatchStartIndex, + }); + } - // Advance the search forward to detect multiple occurrences. - fromIndex = matchStartIndex + 1; + if (textChunks.nextChunk() === null) + break; } }; } diff --git a/packages/dom/test/text-quote/match-cases.ts b/packages/dom/test/text-quote/match-cases.ts index 0a4bb0e..dc4d96c 100644 --- a/packages/dom/test/text-quote/match-cases.ts +++ b/packages/dom/test/text-quote/match-cases.ts @@ -98,8 +98,8 @@ export const testCases: { { startContainerXPath: '//i/text()', startOffset: 0, - endContainerXPath: '//b/text()[2]', - endOffset: 0, + endContainerXPath: '//i/text()', + endOffset: 11, }, ], }, @@ -114,8 +114,8 @@ export const testCases: { { startContainerXPath: '//title/text()', startOffset: 4, - endContainerXPath: '//b/text()[1]', - endOffset: 0, + endContainerXPath: '//title/text()', + endOffset: 9, }, ], }, diff --git a/packages/dom/test/text-quote/match.test.ts b/packages/dom/test/text-quote/match.test.ts index 276bb5d..6baf2e3 100644 --- a/packages/dom/test/text-quote/match.test.ts +++ b/packages/dom/test/text-quote/match.test.ts @@ -59,8 +59,8 @@ describe('createTextQuoteSelectorMatcher', () => { { startContainerXPath: '//b/text()[13]', startOffset: 0, - endContainerXPath: '//b/text()[21]', - endOffset: 0, + endContainerXPath: '//b/text()[20]', + endOffset: 1, }, ]); }); @@ -88,8 +88,8 @@ describe('createTextQuoteSelectorMatcher', () => { { startContainerXPath: '//b/text()[4]', // "dolor" startOffset: 0, - endContainerXPath: '//b/text()[8]', // "et yada yada" - endOffset: 0, + endContainerXPath: '//b/text()[6]', // " am" + endOffset: 3, }, ]); });
