This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch import-dom-seek in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 615e8362a8f44ce314840d214e3a9dcb51151bd8 Author: Gerben <[email protected]> AuthorDate: Fri Nov 6 22:20:22 2020 +0100 WIP make describe quote work --- packages/dom/src/seek.ts | 50 +++++++++- packages/dom/src/text-quote/describe.ts | 163 ++++++++++++++------------------ 2 files changed, 116 insertions(+), 97 deletions(-) diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts index c055291..0fc3a8f 100644 --- a/packages/dom/src/seek.ts +++ b/packages/dom/src/seek.ts @@ -18,11 +18,11 @@ * under the License. */ -import { Chunk, TextNodeChunker, PartialTextNode } from "./chunker"; +import { Chunk, Chunker, TextNodeChunker, PartialTextNode, chunkEquals } from "./chunker"; const E_END = 'Iterator exhausted before seek ended.'; -interface NonEmptyChunker<TChunk extends Chunk<any>> { +export interface NonEmptyChunker<TChunk extends Chunk<any>> { readonly currentChunk: TChunk; nextChunk(): TChunk | null; previousChunk(): TChunk | null; @@ -77,6 +77,28 @@ export class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> this._readOrSeekTo(false, target); } + seekToChunk(target: TChunk, offset: number = 0) { + this._readOrSeekToChunk(false, target, offset); + } + + readToChunk(target: TChunk, offset: number = 0): string { + return this._readOrSeekToChunk(true, target, offset); + } + + private _readOrSeekToChunk(read: true, target: TChunk, offset?: number): string + private _readOrSeekToChunk(read: false, target: TChunk, offset?: number): void + private _readOrSeekToChunk(read: boolean, target: TChunk, offset: number = 0): string { + // XXX We have no way of knowing whether a chunk will follow or precedes the current chunk; we assume it follows. + let result = ''; + // This will throw a RangeError if we reach the end without encountering the target chunk. + while (!chunkEquals(this.currentChunk, target)) { + if (read) result += this.read(1, true); + } + if (offset > this.offsetInChunk) + result += this.read(offset - this.offsetInChunk); + return result; + } + private _readOrSeekTo(read: true, target: number, roundUp?: boolean): string private _readOrSeekTo(read: false, target: number, roundUp?: boolean): void private _readOrSeekTo(read: boolean, target: number, roundUp: boolean = false): string | void { @@ -142,10 +164,11 @@ export class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> } } - export class DomSeeker extends TextSeeker<PartialTextNode> implements BoundaryPointer<Text> { - constructor(scope: Range) { - const chunker = new TextNodeChunker(scope); + constructor(chunkerOrScope: Chunker<PartialTextNode> | Range) { + const chunker = isTextNodeChunker(chunkerOrScope) + ? chunkerOrScope + : new TextNodeChunker(chunkerOrScope); if (chunker.currentChunk === null) throw new RangeError('Range does not contain any Text nodes.'); super(chunker as NonEmptyChunker<PartialTextNode>); @@ -158,4 +181,21 @@ export class DomSeeker extends TextSeeker<PartialTextNode> implements BoundaryPo get offsetInReferenceNode() { return this.offsetInChunk + this.currentChunk.startOffset; } + + seekToBoundaryPoint(node: Node, offset: number) { + const document = (node.ownerDocument ?? node as Document); + const target = document.createRange(); + target.setStart(node, offset); + // target.setEnd(node, offset); // (implied by setting the start) + + // Seek step by step until we are at, or crossed, the target point. + const reverse = !!(node.compareDocumentPosition(this.referenceNode) & Node.DOCUMENT_POSITION_PRECEDING); + while (target.comparePoint(this.referenceNode, this.offsetInReferenceNode) === (reverse ? 1 : -1)) { + this.seekBy(reverse ? -1 : 1); + } + } +} + +function isTextNodeChunker(obj: any): obj is Chunker<PartialTextNode> { + return ('currentChunk' in obj && 'nextChunk' in obj && 'previousChunk' in obj); } diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts index e5846bc..8ccf47e 100644 --- a/packages/dom/src/text-quote/describe.ts +++ b/packages/dom/src/text-quote/describe.ts @@ -20,7 +20,9 @@ import type { TextQuoteSelector } from '@annotator/selector'; import { ownerDocument } from '../owner-document'; -import { Seeker } from '../seek'; +import { Chunk, Chunker, ChunkRange, PartialTextNode, TextNodeChunker, chunkRangeEquals } from '../chunker'; +import { abstractTextQuoteSelectorMatcher } from '.'; +import { DomSeeker, TextSeeker, NonEmptyChunker } from '../seek'; export async function describeTextQuote( range: Range, @@ -32,120 +34,97 @@ export async function describeTextQuote( scope = document.createRange(); scope.selectNodeContents(document); } - range = range.cloneRange(); + + const chunker = new TextNodeChunker(scope); // Take the part of the range that falls within the scope. + range = range.cloneRange(); if (range.compareBoundaryPoints(Range.START_TO_START, scope) === -1) range.setStart(scope.startContainer, scope.startOffset); if (range.compareBoundaryPoints(Range.END_TO_END, scope) === 1) range.setEnd(scope.endContainer, scope.endOffset); - return { - type: 'TextQuoteSelector', - exact: range.toString(), - ...calculateContextForDisambiguation(range, scope), - }; + return await abstractDescribeTextQuote( + convertRangeToChunkRange(chunker, range), + chunker, + ); } -function calculateContextForDisambiguation( - range: Range, - scope: Range, -): { prefix: string; suffix: string } { - const exactText = range.toString(); - const scopeText = scope.toString(); - const targetStartIndex = getRangeTextPosition(range, scope); - const targetEndIndex = targetStartIndex + exactText.length; +async function abstractDescribeTextQuote<TChunk extends Chunk<string>>( + target: ChunkRange<TChunk>, + scope: Chunker<TChunk>, +): Promise<TextQuoteSelector> { + const seeker = new TextSeeker(scope as NonEmptyChunker<TChunk>); + seeker.seekToChunk(target.startChunk, target.startIndex); + const exact = seeker.readToChunk(target.endChunk, target.endIndex); // Starting with an empty prefix and suffix, we search for matches. At each unintended match // we encounter, we extend the prefix or suffix just enough to ensure it will no longer match. let prefix = ''; let suffix = ''; - let fromIndex = 0; - while (fromIndex <= scopeText.length) { - const searchPattern = prefix + exactText + suffix; - const patternMatchIndex = scopeText.indexOf(searchPattern, fromIndex); - if (patternMatchIndex === -1) break; - fromIndex = patternMatchIndex + 1; - const matchStartIndex = patternMatchIndex + prefix.length; - const matchEndIndex = matchStartIndex + exactText.length; + while (true) { + const tentativeSelector: TextQuoteSelector = { + type: 'TextQuoteSelector', + exact, + prefix, + suffix, + } + const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope); + let nextMatch = await matches.next(); + + if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) { + // This match is the intended one, ignore it. + nextMatch = await matches.next(); + } + + // If there are no more unintended matches, our selector is unambiguous! + if (nextMatch.done) return tentativeSelector; + + // TODO either reset chunker to start from the beginning, or rewind the chunker by previous match’s length. + // chunker.seekTo(0) or chunker.seek(-prefix) - // Skip the found match if it is the actual target. - if (matchStartIndex === targetStartIndex) continue; + // We’ll have to add more prefix/suffix to disqualify this unintended match. + const unintendedMatch = nextMatch.value; + const seeker1 = new TextSeeker(scope as NonEmptyChunker<TChunk>); + const seeker2 = new TextSeeker(scope as NonEmptyChunker<TChunk>); // TODO must clone scope. // Count how many characters we’d need as a prefix to disqualify this match. - let sufficientPrefixLength = prefix.length + 1; - const firstChar = (offset: number) => - scopeText[offset - sufficientPrefixLength]; - while ( - firstChar(targetStartIndex) && - firstChar(targetStartIndex) === firstChar(matchStartIndex) - ) - sufficientPrefixLength++; - if (!firstChar(targetStartIndex)) - // We reached the start of scopeText; prefix won’t work. - sufficientPrefixLength = Infinity; - - // Count how many characters we’d need as a suffix to disqualify this match. - let sufficientSuffixLength = suffix.length + 1; - const lastChar = (offset: number) => - scopeText[offset + sufficientSuffixLength - 1]; - while ( - lastChar(targetEndIndex) && - lastChar(targetEndIndex) === lastChar(matchEndIndex) - ) - sufficientSuffixLength++; - if (!lastChar(targetEndIndex)) - // We reached the end of scopeText; suffix won’t work. - sufficientSuffixLength = Infinity; + seeker1.seekToChunk(target.startChunk, target.startIndex); + seeker2.seekToChunk(unintendedMatch.startChunk, unintendedMatch.startIndex); + let sufficientPrefix: string | undefined = prefix; + while (true) { + let previousCharacter: string; + try { + previousCharacter = seeker1.read(-1); + } catch (err) { + sufficientPrefix = undefined; // Start of text reached. + break; + } + sufficientPrefix = previousCharacter + sufficientPrefix; + if (previousCharacter !== seeker2.read(-1)) break; + } // Use either the prefix or suffix, whichever is shortest. - if (sufficientPrefixLength <= sufficientSuffixLength) { - // Compensate our search position for the increase in prefix length. - fromIndex -= sufficientPrefixLength - prefix.length; - prefix = scopeText.substring( - targetStartIndex - sufficientPrefixLength, - targetStartIndex, - ); - } else { - suffix = scopeText.substring( - targetEndIndex, - targetEndIndex + sufficientSuffixLength, - ); - } + if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || sufficientPrefix.length <= sufficientSuffix.length)) + prefix = sufficientPrefix; // chunker.seek(prefix.length - sufficientPrefix.length) + else if (sufficientSuffix !== undefined) + suffix = sufficientSuffix; + else + throw new Error('Target cannot be disambiguated; how could that have happened‽'); } - - return { prefix, suffix }; } -// Get the index of the first character of range within the text of scope. -function getRangeTextPosition(range: Range, scope: Range): number { - const seeker = new Seeker(scope); - const scopeOffset = isTextNode(scope.startContainer) ? scope.startOffset : 0; - if (isTextNode(range.startContainer)) - return seeker.seek(range.startContainer) + range.startOffset - scopeOffset; - else return seeker.seek(firstTextNodeInRange(range)) - scopeOffset; -} +function convertRangeToChunkRange(chunker: Chunker<PartialTextNode>, range: Range): ChunkRange<PartialTextNode> { + const domSeeker = new DomSeeker(chunker); -function firstTextNodeInRange(range: Range): Text { - // Find the first text node inside the range. - const iter = ownerDocument(range).createNodeIterator( - range.commonAncestorContainer, - NodeFilter.SHOW_TEXT, - { - acceptNode(node: Text) { - // Only reveal nodes within the range; and skip any empty text nodes. - return range.intersectsNode(node) && node.length > 0 - ? NodeFilter.FILTER_ACCEPT - : NodeFilter.FILTER_REJECT; - }, - }, - ); - const node = iter.nextNode() as Text | null; - if (node === null) throw new Error('Range contains no text nodes'); - return node; -} + domSeeker.seekToBoundaryPoint(range.startContainer, range.startOffset); + const startChunk = domSeeker.currentChunk; + const startIndex = domSeeker.offsetInChunk; + + domSeeker.seekToBoundaryPoint(range.endContainer, range.endOffset); + const endChunk = domSeeker.currentChunk; + const endIndex = domSeeker.offsetInChunk; -function isTextNode(node: Node): node is Text { - return node.nodeType === Node.TEXT_NODE; + return { startChunk, startIndex, endChunk, endIndex }; }
