This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch import-dom-seek in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 657daab660a91ce7057136e5417326b7ae42f239 Author: Gerben <[email protected]> AuthorDate: Thu Nov 12 12:14:24 2020 +0100 Make describeTextQuote work, mostly. Tests pass, still some issues with empty quotes at chunk edges --- packages/dom/src/chunker.ts | 4 +- packages/dom/src/normalize-range.ts | 4 ++ packages/dom/src/seek.ts | 19 ++++--- packages/dom/src/text-quote/describe.ts | 84 +++++++++++++++++++++++------- packages/dom/src/text-quote/match.ts | 90 ++++++++++++++++++++++----------- 5 files changed, 145 insertions(+), 56 deletions(-) diff --git a/packages/dom/src/chunker.ts b/packages/dom/src/chunker.ts index 7209d7a..2becebf 100644 --- a/packages/dom/src/chunker.ts +++ b/packages/dom/src/chunker.ts @@ -104,11 +104,13 @@ export class TextNodeChunker implements Chunker<PartialTextNode> { rangeToChunkRange(range: Range): ChunkRange<PartialTextNode> { const textRange = normalizeRange(range); + // FIXME: normalizeRange can mess up: a collapsed range at the very end of + // the chunker’s scope might move to the next text node outside the scope. const startChunk = this.nodeToChunk(textRange.startContainer); const startIndex = textRange.startOffset - startChunk.startOffset; const endChunk = this.nodeToChunk(textRange.endContainer); - const endIndex = textRange.endOffset - endChunk.endOffset; + const endIndex = textRange.endOffset - endChunk.startOffset; return { startChunk, startIndex, endChunk, endIndex }; } diff --git a/packages/dom/src/normalize-range.ts b/packages/dom/src/normalize-range.ts index 8616bce..0fece41 100644 --- a/packages/dom/src/normalize-range.ts +++ b/packages/dom/src/normalize-range.ts @@ -48,6 +48,10 @@ export interface TextRange extends Range { // if there are equivalent text selections it takes the narrowest option (i.e. // it prefers the start not to be at the end of a text node, and vice versa). // +// If there is no text between the start and end, they thus collapse onto one a +// single position; and if there are multiple equivalent positions, it takes the +// first one. +// // Note that if the given range does not contain non-empty text nodes, it will // end up pointing at a text node outside of it (after it if possible, else // before). If the document does not contain any text nodes, an error is thrown. diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts index 1c821a9..36914a7 100644 --- a/packages/dom/src/seek.ts +++ b/packages/dom/src/seek.ts @@ -111,15 +111,14 @@ export class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> if (this.position <= target) { while (this.position <= target) { // could be `while (true)`? - if (!roundUp && target < this.currentChunkPosition + this.currentChunk.data.length) { - // The target is before the end of the current chunk. + if ( + this.currentChunkPosition + this.currentChunk.data.length <= target + || (roundUp && this.offsetInChunk !== 0) + ) { + // The target is beyond the current chunk. // (we use < not ≤: if the target is *at* the end of the chunk, possibly // because the current chunk is empty, we prefer to take the next chunk) - const newOffset = target - this.currentChunkPosition; - if (read) result += this.currentChunk.data.substring(this.offsetInChunk, newOffset); - this.offsetInChunk = newOffset; - break; - } else { + // Move to the start of the next chunk, while counting the characters of the current one. if (read) result += this.currentChunk.data.substring(this.offsetInChunk); const chunkLength = this.currentChunk.data.length; @@ -140,6 +139,12 @@ export class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> else throw new RangeError(E_END); } + } else { + // The target is within the current chunk. + const newOffset = target - this.currentChunkPosition; + if (read) result += this.currentChunk.data.substring(this.offsetInChunk, newOffset); + this.offsetInChunk = newOffset; + break; } } } else { // Similar to the if-block, but moving backward in the text. diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts index cbad0c3..1a7941d 100644 --- a/packages/dom/src/text-quote/describe.ts +++ b/packages/dom/src/text-quote/describe.ts @@ -26,17 +26,18 @@ import { TextSeeker, NonEmptyChunker } from '../seek'; export async function describeTextQuote( range: Range, - scope?: Range, + maybeScope?: Range, ): Promise<TextQuoteSelector> { // Default to search in the whole document. - if (scope === undefined) { + let scope: Range; + if (maybeScope !== undefined) { + scope = maybeScope; + } else { const document = ownerDocument(range); scope = document.createRange(); scope.selectNodeContents(document); } - const chunker = new TextNodeChunker(scope); - // Take the part of the range that falls within the scope. range = range.cloneRange(); if (range.compareBoundaryPoints(Range.START_TO_START, scope) === -1) @@ -44,17 +45,21 @@ export async function describeTextQuote( if (range.compareBoundaryPoints(Range.END_TO_END, scope) === 1) range.setEnd(scope.endContainer, scope.endOffset); + const chunker = new TextNodeChunker(scope); + return await abstractDescribeTextQuote( chunker.rangeToChunkRange(range), - chunker, + () => new TextNodeChunker(scope), ); } async function abstractDescribeTextQuote<TChunk extends Chunk<string>>( target: ChunkRange<TChunk>, - scope: Chunker<TChunk>, + scope: () => Chunker<TChunk>, ): Promise<TextQuoteSelector> { - const seeker = new TextSeeker(scope as NonEmptyChunker<TChunk>); + const seeker = new TextSeeker(scope() as NonEmptyChunker<TChunk>); + + // Read the target’s exact text. seeker.seekToChunk(target.startChunk, target.startIndex); const exact = seeker.readToChunk(target.endChunk, target.endIndex); @@ -70,7 +75,8 @@ async function abstractDescribeTextQuote<TChunk extends Chunk<string>>( prefix, suffix, } - const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope); + + const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope()); let nextMatch = await matches.next(); if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) { @@ -81,17 +87,19 @@ async function abstractDescribeTextQuote<TChunk extends Chunk<string>>( // If there are no more unintended matches, our selector is unambiguous! if (nextMatch.done) return tentativeSelector; - // TODO either reset chunker to start from the beginning, or rewind the chunker by previous match’s length. - // chunker.seekTo(0) or chunker.seek(-prefix) + // A subsequent search could safely skip the part we already processed, + // we’d need the matcher to start at the seeker’s position, instead of + // searching in the whole current chunk. + // seeker.seekBy(-prefix.length + 1); // We’ll have to add more prefix/suffix to disqualify this unintended match. const unintendedMatch = nextMatch.value; - const seeker1 = new TextSeeker(scope as NonEmptyChunker<TChunk>); - const seeker2 = new TextSeeker(scope as NonEmptyChunker<TChunk>); // TODO must clone scope. + const seeker1 = new TextSeeker(scope() as NonEmptyChunker<TChunk>); + const seeker2 = new TextSeeker(scope() as NonEmptyChunker<TChunk>); // Count how many characters we’d need as a prefix to disqualify this match. - seeker1.seekToChunk(target.startChunk, target.startIndex); - seeker2.seekToChunk(unintendedMatch.startChunk, unintendedMatch.startIndex); + seeker1.seekToChunk(target.startChunk, target.startIndex - prefix.length); + seeker2.seekToChunk(unintendedMatch.startChunk, unintendedMatch.startIndex - prefix.length); let sufficientPrefix: string | undefined = prefix; while (true) { let previousCharacter: string; @@ -102,15 +110,53 @@ async function abstractDescribeTextQuote<TChunk extends Chunk<string>>( break; } sufficientPrefix = previousCharacter + sufficientPrefix; - if (previousCharacter !== seeker2.read(-1)) break; + + // Break if the newly added character makes the prefix unambiguous. + try { + const unintendedMatchPreviousCharacter = seeker2.read(-1); + if (previousCharacter !== unintendedMatchPreviousCharacter) break; + } catch (err) { + if (err instanceof RangeError) + break; + else + throw err; + } + } + + // Count how many characters we’d need as a suffix to disqualify this match. + seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length); + seeker2.seekToChunk(unintendedMatch.endChunk, unintendedMatch.endIndex + suffix.length); + let sufficientSuffix: string | undefined = suffix; + while (true) { + let nextCharacter: string; + try { + nextCharacter = seeker1.read(1); + } catch (err) { + sufficientSuffix = undefined; // End of text reached. + break; + } + sufficientSuffix += nextCharacter; + + // Break if the newly added character makes the suffix unambiguous. + try { + const unintendedMatchNextCharacter = seeker2.read(1); + if (nextCharacter !== unintendedMatchNextCharacter) break; + } catch (err) { + if (err instanceof RangeError) + break; + else + throw err; + } } // Use either the prefix or suffix, whichever is shortest. - if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || sufficientPrefix.length <= sufficientSuffix.length)) - prefix = sufficientPrefix; // chunker.seek(prefix.length - sufficientPrefix.length) - else if (sufficientSuffix !== undefined) + if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || sufficientPrefix.length <= sufficientSuffix.length)) { + prefix = sufficientPrefix; + // seeker.seekBy(sufficientPrefix.length - prefix.length) // Would be required if we’d skip the processed part. + } else if (sufficientSuffix !== undefined) { suffix = sufficientSuffix; - else + } else { throw new Error('Target cannot be disambiguated; how could that have happened‽'); + } } } diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts index 112e054..73cd1d7 100644 --- a/packages/dom/src/text-quote/match.ts +++ b/packages/dom/src/text-quote/match.ts @@ -53,41 +53,62 @@ export function abstractTextQuoteSelectorMatcher( const suffix = selector.suffix || ''; const searchPattern = prefix + exact + suffix; - let partialMatches: Array<{ - startChunk: TChunk; - startIndex: number; + // The code below runs a loop with three steps: + // 1. Continue checking any partial matches from the previous chunk(s). + // 2. Try find the whole pattern in the chunk (possibly multiple times). + // 3. Check if this chunk ends with a partial match (or even multiple partial matches). + + interface PartialMatch { + startChunk?: TChunk; + startIndex?: number; + endChunk?: TChunk; + endIndex?: number; charactersMatched: number; - }> = []; + } + let partialMatches: PartialMatch[] = []; let chunk: TChunk | null; while (chunk = textChunks.currentChunk) { const chunkValue = chunk.data; - // Continue checking any partial matches from the previous chunk(s). + // 1. Continue checking any partial matches from the previous chunk(s). const remainingPartialMatches: typeof partialMatches = []; - for (const { startChunk, startIndex, charactersMatched } of partialMatches) { - if (searchPattern.length - charactersMatched > chunkValue.length) { - if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) { - // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks. - remainingPartialMatches.push({ - startChunk, - startIndex, - charactersMatched: charactersMatched + chunkValue.length, - }); + for (const partialMatch of partialMatches) { + const charactersMatched = partialMatch.charactersMatched; + + // If the current chunk contains the start and/or end of the match, record these. + if (partialMatch.endChunk === undefined) { + const charactersUntilMatchEnd = prefix.length + exact.length - charactersMatched; + if (charactersUntilMatchEnd <= chunkValue.length) { + partialMatch.endChunk = chunk; + partialMatch.endIndex = charactersUntilMatchEnd; } } - else if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) { - yield { - startChunk, - startIndex, - endChunk: chunk, - endIndex: searchPattern.length - charactersMatched, - }; + if (partialMatch.startChunk === undefined) { + const charactersUntilMatchStart = prefix.length - charactersMatched; + if ( + charactersUntilMatchStart < chunkValue.length + || partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk. + ) { + partialMatch.startChunk = chunk; + partialMatch.startIndex = charactersUntilMatchStart; + } + } + + const charactersUntilSuffixEnd = searchPattern.length - charactersMatched; + if (charactersUntilSuffixEnd <= chunkValue.length) { + if (chunkValue.startsWith(searchPattern.substring(charactersMatched))) { + yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now. + } + } else if (chunkValue === searchPattern.substring(charactersMatched, charactersMatched + chunkValue.length)) { + // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks. + partialMatch.charactersMatched += chunkValue.length; + remainingPartialMatches.push(partialMatch); } } partialMatches = remainingPartialMatches; - // Try find the whole pattern in the chunk (possibly multiple times). + // 2. Try find the whole pattern in the chunk (possibly multiple times). if (searchPattern.length <= chunkValue.length) { let fromIndex = 0; while (fromIndex <= chunkValue.length) { @@ -106,11 +127,11 @@ export function abstractTextQuoteSelectorMatcher( }; // Advance the search forward to detect multiple occurrences within the same chunk. - fromIndex = matchStartIndex + 1; + fromIndex = patternStartIndex + 1; } } - // Check if this chunk ends with a partial match (or even multiple partial matches). + // 3. Check if this chunk ends with a partial match (or even multiple partial matches). let newPartialMatches: number[] = []; const searchStartPoint = Math.max(chunkValue.length - searchPattern.length + 1, 0); for (let i = searchStartPoint; i < chunkValue.length; i++) { @@ -121,11 +142,22 @@ export function abstractTextQuoteSelectorMatcher( if (character === searchPattern[0]) newPartialMatches.push(i); } for (const partialMatchStartIndex of newPartialMatches) { - partialMatches.push({ - startChunk: chunk, - startIndex: partialMatchStartIndex, - charactersMatched: chunkValue.length - partialMatchStartIndex, - }); + const charactersMatched = chunkValue.length - partialMatchStartIndex; + const partialMatch: PartialMatch = { + charactersMatched, + }; + if (charactersMatched >= prefix.length + exact.length) { + partialMatch.endChunk = chunk; + partialMatch.endIndex = partialMatchStartIndex + prefix.length + exact.length; + } + if ( + charactersMatched > prefix.length + || partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk. + ) { + partialMatch.startChunk = chunk; + partialMatch.startIndex = partialMatchStartIndex + prefix.length; + } + partialMatches.push(partialMatch); } if (textChunks.nextChunk() === null)
