This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch import-dom-seek in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 15e8ffd485cf0c15c864c2f63540f9963c8e48e4 Author: Gerben <[email protected]> AuthorDate: Fri Nov 6 14:18:45 2020 +0100 WIP Make describing a text quote work too --- packages/dom/src/chunker.ts | 1 + packages/dom/src/seek.ts | 26 +++- packages/dom/src/text-quote/describe.ts | 214 ++++++++++++++++++-------------- 3 files changed, 145 insertions(+), 96 deletions(-) diff --git a/packages/dom/src/chunker.ts b/packages/dom/src/chunker.ts index 82a72e3..0f3b746 100644 --- a/packages/dom/src/chunker.ts +++ b/packages/dom/src/chunker.ts @@ -25,6 +25,7 @@ import { ownerDocument } from "./owner-document"; // data structure it came from (e.g. a DOM node). export interface Chunk<TData extends any> { readonly data: TData; + equals?(otherChunk: this): boolean; } // A Chunker lets one walk through the chunks of a document. diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts index f09e583..9ceade3 100644 --- a/packages/dom/src/seek.ts +++ b/packages/dom/src/seek.ts @@ -18,7 +18,7 @@ * under the License. */ -import { Chunk, TextNodeChunker, PartialTextNode } from "./chunker"; +import { Chunk, Chunker, TextNodeChunker, PartialTextNode } from "./chunker"; const E_END = 'Iterator exhausted before seek ended.'; @@ -142,10 +142,11 @@ class TextSeeker<TChunk extends Chunk<string>> implements Seeker<string> { } } - export class DomSeeker extends TextSeeker<PartialTextNode> implements BoundaryPointer<Text> { - constructor(scope: Range) { - const chunker = new TextNodeChunker(scope); + constructor(chunkerOrScope: Chunker<PartialTextNode> | Range) { + const chunker = isTextNodeChunker(chunkerOrScope) + ? chunkerOrScope + : new TextNodeChunker(chunkerOrScope); if (chunker.currentChunk === null) throw new RangeError('Range does not contain any Text nodes.'); super(chunker as NonEmptyChunker<PartialTextNode>); @@ -158,4 +159,21 @@ export class DomSeeker extends TextSeeker<PartialTextNode> implements BoundaryPo get offsetInReferenceNode() { return this.offsetInChunk + this.currentChunk.startOffset; } + + seekToBoundaryPoint(node: Node, offset: number) { + const document = (node.ownerDocument ?? node as Document); + const target = document.createRange(); + target.setStart(node, offset); + // target.setEnd(node, offset); // (implied by setting the start) + + // Seek step by step until we are at, or crossed, the target point. + const reverse = !!(node.compareDocumentPosition(this.referenceNode) & Node.DOCUMENT_POSITION_PRECEDING); + while (target.comparePoint(this.referenceNode, this.offsetInReferenceNode) === (reverse ? 1 : -1)) { + this.seekBy(reverse ? -1 : 1); + } + } +} + +function isTextNodeChunker(obj: any): obj is Chunker<PartialTextNode> { + return ('currentChunk' in obj && 'nextChunk' in obj && 'previousChunk' in obj); } diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts index e5846bc..787d60d 100644 --- a/packages/dom/src/text-quote/describe.ts +++ b/packages/dom/src/text-quote/describe.ts @@ -20,7 +20,9 @@ import type { TextQuoteSelector } from '@annotator/selector'; import { ownerDocument } from '../owner-document'; -import { Seeker } from '../seek'; +import { Chunk, Chunker, PartialTextNode, TextNodeChunker } from '../chunker'; +import { ChunkRange, abstractTextQuoteSelectorMatcher } from '.'; +import { DomSeeker } from '../seek'; export async function describeTextQuote( range: Range, @@ -32,120 +34,148 @@ export async function describeTextQuote( scope = document.createRange(); scope.selectNodeContents(document); } - range = range.cloneRange(); + + const chunker = new TextNodeChunker(scope); // Take the part of the range that falls within the scope. + range = range.cloneRange(); if (range.compareBoundaryPoints(Range.START_TO_START, scope) === -1) range.setStart(scope.startContainer, scope.startOffset); if (range.compareBoundaryPoints(Range.END_TO_END, scope) === 1) range.setEnd(scope.endContainer, scope.endOffset); - return { - type: 'TextQuoteSelector', - exact: range.toString(), - ...calculateContextForDisambiguation(range, scope), - }; + return await abstractDescribeTextQuote( + convertRangeToChunkRange(chunker, range), + chunker, + ); } -function calculateContextForDisambiguation( - range: Range, - scope: Range, -): { prefix: string; suffix: string } { - const exactText = range.toString(); - const scopeText = scope.toString(); - const targetStartIndex = getRangeTextPosition(range, scope); - const targetEndIndex = targetStartIndex + exactText.length; +async function abstractDescribeTextQuote( + target: ChunkRange<Chunk<string>>, + scope: Chunker<Chunk<string>>, +): Promise<TextQuoteSelector> { + const exact = readChunkRange(scope, target); // Starting with an empty prefix and suffix, we search for matches. At each unintended match // we encounter, we extend the prefix or suffix just enough to ensure it will no longer match. let prefix = ''; let suffix = ''; - let fromIndex = 0; - while (fromIndex <= scopeText.length) { - const searchPattern = prefix + exactText + suffix; - const patternMatchIndex = scopeText.indexOf(searchPattern, fromIndex); - if (patternMatchIndex === -1) break; - fromIndex = patternMatchIndex + 1; - - const matchStartIndex = patternMatchIndex + prefix.length; - const matchEndIndex = matchStartIndex + exactText.length; - - // Skip the found match if it is the actual target. - if (matchStartIndex === targetStartIndex) continue; - - // Count how many characters we’d need as a prefix to disqualify this match. - let sufficientPrefixLength = prefix.length + 1; - const firstChar = (offset: number) => - scopeText[offset - sufficientPrefixLength]; - while ( - firstChar(targetStartIndex) && - firstChar(targetStartIndex) === firstChar(matchStartIndex) - ) - sufficientPrefixLength++; - if (!firstChar(targetStartIndex)) - // We reached the start of scopeText; prefix won’t work. - sufficientPrefixLength = Infinity; - - // Count how many characters we’d need as a suffix to disqualify this match. - let sufficientSuffixLength = suffix.length + 1; - const lastChar = (offset: number) => - scopeText[offset + sufficientSuffixLength - 1]; - while ( - lastChar(targetEndIndex) && - lastChar(targetEndIndex) === lastChar(matchEndIndex) - ) - sufficientSuffixLength++; - if (!lastChar(targetEndIndex)) - // We reached the end of scopeText; suffix won’t work. - sufficientSuffixLength = Infinity; - // Use either the prefix or suffix, whichever is shortest. - if (sufficientPrefixLength <= sufficientSuffixLength) { - // Compensate our search position for the increase in prefix length. - fromIndex -= sufficientPrefixLength - prefix.length; - prefix = scopeText.substring( - targetStartIndex - sufficientPrefixLength, - targetStartIndex, - ); - } else { - suffix = scopeText.substring( - targetEndIndex, - targetEndIndex + sufficientSuffixLength, - ); + while (true) { + const tentativeSelector: TextQuoteSelector = { + type: 'TextQuoteSelector', + exact, + prefix, + suffix, + } + const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope); + let nextMatch = await matches.next(); + + if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) { + // This match is the intended one, ignore it. + nextMatch = await matches.next(); } + + // If there are no more unintended matches, our selector is unambiguous! + if (nextMatch.done) return tentativeSelector; + + // We’ll have to add more prefix/suffix to disqualify this unintended match. + const match = nextMatch.value; + const sufficientPrefix = charactersNeededToBeUnique(scope, match, target, true); + const sufficientSuffix = charactersNeededToBeUnique(scope, match, target, false); + + // Use either the prefix or suffix, whichever is shortest. + if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || sufficientPrefix.length <= sufficientSuffix.length)) + prefix = sufficientPrefix; + else if (sufficientSuffix !== undefined) + suffix = sufficientSuffix; + else + throw new Error('Target cannot be disambiguated; how could that have happened‽'); } +} + +function charactersNeededToBeUnique( + chunker: Chunker<Chunk<string>>, + match: ChunkRange<any>, + target: ChunkRange<any>, + reverse: boolean, +): string | undefined { + // TODO. How? + + // // Count how many characters we’d need as a prefix to disqualify this match. + // let sufficientPrefixLength = prefix.length + 1; + // const firstChar = (offset: number) => + // scopeText[offset - sufficientPrefixLength]; + // while ( + // firstChar(targetStartIndex) && + // firstChar(targetStartIndex) === firstChar(matchStartIndex) + // ) + // sufficientPrefixLength++; + // if (!firstChar(targetStartIndex)) + // // We reached the start of scopeText; prefix won’t work. + // sufficientPrefixLength = Infinity; + + // // Count how many characters we’d need as a suffix to disqualify this match. + // let sufficientSuffixLength = suffix.length + 1; + // const lastChar = (offset: number) => + // scopeText[offset + sufficientSuffixLength - 1]; + // while ( + // lastChar(targetEndIndex) && + // lastChar(targetEndIndex) === lastChar(matchEndIndex) + // ) + // sufficientSuffixLength++; + // if (!lastChar(targetEndIndex)) + // // We reached the end of scopeText; suffix won’t work. + // sufficientSuffixLength = Infinity; - return { prefix, suffix }; } -// Get the index of the first character of range within the text of scope. -function getRangeTextPosition(range: Range, scope: Range): number { - const seeker = new Seeker(scope); - const scopeOffset = isTextNode(scope.startContainer) ? scope.startOffset : 0; - if (isTextNode(range.startContainer)) - return seeker.seek(range.startContainer) + range.startOffset - scopeOffset; - else return seeker.seek(firstTextNodeInRange(range)) - scopeOffset; +function readChunkRange<TChunk extends Chunk<string>>( + chunker: Chunker<TChunk>, + { + startChunk, + startIndex, + endChunk, + endIndex, + }: ChunkRange<TChunk> +): string { + if (startChunk === endChunk) + return startChunk.data.substring(startIndex, endIndex); + + let text = startChunk.data.substring(startIndex); + // TODO use chunker; or implement Seeker.readToChunk or similiar? + let curChunk = startChunk; + while (curChunk && curChunk !== endChunk) { + curChunk = chunker.nextChunk(); + text += curChunk.data; + } + text += endChunk.data.substring(0, endIndex); + return text; } -function firstTextNodeInRange(range: Range): Text { - // Find the first text node inside the range. - const iter = ownerDocument(range).createNodeIterator( - range.commonAncestorContainer, - NodeFilter.SHOW_TEXT, - { - acceptNode(node: Text) { - // Only reveal nodes within the range; and skip any empty text nodes. - return range.intersectsNode(node) && node.length > 0 - ? NodeFilter.FILTER_ACCEPT - : NodeFilter.FILTER_REJECT; - }, - }, +function chunkRangeEquals(range1: ChunkRange<any>, range2: ChunkRange<any>) { + return ( + chunkEquals(range1.startChunk, range2.startChunk) + && chunkEquals(range1.endChunk, range2.endChunk) + && range1.startIndex === range2.startIndex + && range1.endIndex === range2.endIndex ); - const node = iter.nextNode() as Text | null; - if (node === null) throw new Error('Range contains no text nodes'); - return node; } -function isTextNode(node: Node): node is Text { - return node.nodeType === Node.TEXT_NODE; +function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean { + return chunk1.equals ? chunk1.equals(chunk2) : chunk1 === chunk2; +} + +function convertRangeToChunkRange(chunker: Chunker<PartialTextNode>, range: Range): ChunkRange<PartialTextNode> { + const domSeeker = new DomSeeker(chunker); + + domSeeker.seekToBoundaryPoint(range.startContainer, range.startOffset); + const startChunk = domSeeker.currentChunk; + const startIndex = domSeeker.offsetInChunk; + + domSeeker.seekToBoundaryPoint(range.endContainer, range.endOffset); + const endChunk = domSeeker.currentChunk; + const endIndex = domSeeker.offsetInChunk; + + return { startChunk, startIndex, endChunk, endIndex }; }
