This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch chunking in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 0477dbea20238275423033fc521dff3952e5a124 Author: Gerben <[email protected]> AuthorDate: Wed Sep 16 19:19:20 2020 +0200 WIP rewrite describeTextQuote --- packages/dom/src/text-quote/describe.ts | 217 +++++++++++++------------------- 1 file changed, 90 insertions(+), 127 deletions(-) diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts index dca21c8..fab2ca3 100644 --- a/packages/dom/src/text-quote/describe.ts +++ b/packages/dom/src/text-quote/describe.ts @@ -18,10 +18,11 @@ * under the License. */ -import seek from 'dom-seek'; import type { TextQuoteSelector } from '@annotator/selector'; import { ownerDocument } from '../owner-document'; +import { ChunkRange, Chunk, rangeToTextChunks } from '../text-iterator'; +import { abstractTextQuoteSelectorMatcher } from './match'; export async function describeTextQuote( range: Range, @@ -41,151 +42,113 @@ export async function describeTextQuote( if (range.compareBoundaryPoints(Range.END_TO_END, scope) === 1) range.setEnd(scope.endContainer, scope.endOffset); + return await abstractDescribeTextQuote( + { + startChunk: range.startContainer, + startIndex: range.startOffset, + endChunk: range.endContainer, + endIndex: range.endOffset, + }, + rangeToTextChunks(scope), + ); +} + +async function abstractDescribeTextQuote( + target: ChunkRange<any>, + scope: AsyncIterable<Chunk>, +): Promise<TextQuoteSelector> { + const exact = readChunks(target); + const { prefix, suffix } = await calculateContextForDisambiguation(target, scope); return { type: 'TextQuoteSelector', - exact: range.toString(), - ...calculateContextForDisambiguation(range, scope), + exact, + prefix, + suffix, }; } -function calculateContextForDisambiguation( - range: Range, - scope: Range, -): { prefix?: string; suffix?: string } { - const exactText = range.toString(); - const scopeText = scope.toString(); - const targetStartIndex = getRangeTextPosition(range, scope); - const targetEndIndex = targetStartIndex + exactText.length; - - // Find all matches of the text in the scope. - const stringMatches: number[] = []; - let fromIndex = 0; - while (fromIndex <= scopeText.length) { - const matchIndex = scopeText.indexOf(exactText, fromIndex); - if (matchIndex === -1) break; - stringMatches.push(matchIndex); - fromIndex = matchIndex + 1; - } - - // Count for each undesired match the required prefix and suffix lengths, such that either of them - // would have invalidated the match. - const affixLengthPairs: Array<[number, number]> = []; - for (const matchStartIndex of stringMatches) { - const matchEndIndex = matchStartIndex + exactText.length; - - // Skip the found match if it is the actual target. - if (matchStartIndex === targetStartIndex) continue; +async function calculateContextForDisambiguation( + target: ChunkRange<any>, + scope: AsyncIterable<Chunk>, +): Promise<{ prefix: string; suffix: string }> { + const exact = target.toString(); + let prefix = ''; + let suffix = ''; + + while (true) { + const tentativeSelector: TextQuoteSelector = { + type: 'TextQuoteSelector', + exact, + prefix, + suffix, + } + const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope); + const nextMatch = await matches.next(); + if (nextMatch.done) break; + const match = nextMatch.value; + + if ( + match.startChunk === target.startChunk + && match.endChunk === target.endChunk + && match.startIndex === target.startIndex + && match.endIndex === target.endIndex + ) { + // This match is the intended one, ignore it. + continue; + } - // Count how many characters before & after them the false match and target have in common. - const sufficientPrefixLength = charactersNeededToBeUnique( - scopeText.substring(0, targetStartIndex), - scopeText.substring(0, matchStartIndex), + const sufficientPrefix = charactersNeededToBeUnique( + match, + target, true, ); - const sufficientSuffixLength = charactersNeededToBeUnique( - scopeText.substring(targetEndIndex), - scopeText.substring(matchEndIndex), + const sufficientSuffix = charactersNeededToBeUnique( + match, + target, false, ); - affixLengthPairs.push([sufficientPrefixLength, sufficientSuffixLength]); + + // Use either the prefix or suffix, whichever is shortest. + if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || sufficientPrefix.length <= sufficientSuffix.length)) + prefix = sufficientPrefix; + else if (sufficientSuffix !== undefined) + suffix = sufficientSuffix; + else + throw new Error('Target cannot be disambiguated; how could that have happened‽'); } - // Find the prefix and suffix that would invalidate all mismatches, using the minimal characters - // for prefix and suffix combined. - const [prefixLength, suffixLength] = minimalSolution(affixLengthPairs); - const prefix = scopeText.substring( - targetStartIndex - prefixLength, - targetStartIndex, - ); - const suffix = scopeText.substring( - targetEndIndex, - targetEndIndex + suffixLength, - ); return { prefix, suffix }; } function charactersNeededToBeUnique( - target: string, - impostor: string, - reverse = false, -) { - // Count how many characters the two strings have in common. - let overlap = 0; - const charAt = (s: string, i: number) => - reverse ? s[s.length - 1 - i] : s[overlap]; - while ( - overlap < target.length && - charAt(target, overlap) === charAt(impostor, overlap) - ) - overlap++; - if (overlap === target.length) return Infinity; - // (no substring of target can make it distinguishable from its impostor) - else return overlap + 1; + match: ChunkRange<any>, + target: ChunkRange<any>, + reverse: boolean, +): string | undefined { + // TODO. How? } -function minimalSolution( - requirements: Array<[number, number]>, -): [number, number] { - // Ensure we try solutions with an empty prefix or suffix. - requirements.push([0, 0]); - - // Build all the pairs and order them by their sums. - const pairs = requirements.flatMap((l) => - requirements.map<[number, number]>((r) => [l[0], r[1]]), - ); - pairs.sort((a, b) => a[0] + a[1] - (b[0] + b[1])); - - // Find the first pair that satisfies every requirement. - for (const pair of pairs) { - const [p0, p1] = pair; - if (requirements.every(([r0, r1]) => r0 <= p0 || r1 <= p1)) { - return pair; - } +function readChunks( + { + startChunk, + startIndex, + endChunk, + endIndex, + }: ChunkRange<Chunk> +): string { + if (startChunk === endChunk) + return startChunk.toString().substring(startIndex, endIndex); + + let text = startChunk.toString().substring(startIndex); + let curChunk = startChunk; + while (curChunk && curChunk !== endChunk) { + curChunk = nextChunk(curChunk); + text += curChunk.toString(); } - - // Return the largest pairing (unreachable). - return pairs[pairs.length - 1]; -} - -// Get the index of the first character of range within the text of scope. -function getRangeTextPosition(range: Range, scope: Range): number { - const iter = ownerDocument(scope).createNodeIterator( - scope.commonAncestorContainer, - NodeFilter.SHOW_TEXT, - { - acceptNode(node: Text) { - // Only reveal nodes within the range - return scope.intersectsNode(node) - ? NodeFilter.FILTER_ACCEPT - : NodeFilter.FILTER_REJECT; - }, - }, - ); - const scopeOffset = isTextNode(scope.startContainer) ? scope.startOffset : 0; - if (isTextNode(range.startContainer)) - return seek(iter, range.startContainer) + range.startOffset - scopeOffset; - else return seek(iter, firstTextNodeInRange(range)) - scopeOffset; -} - -function firstTextNodeInRange(range: Range): Text { - // Find the first text node inside the range. - const iter = ownerDocument(range).createNodeIterator( - range.commonAncestorContainer, - NodeFilter.SHOW_TEXT, - { - acceptNode(node: Text) { - // Only reveal nodes within the range; and skip any empty text nodes. - return range.intersectsNode(node) && node.length > 0 - ? NodeFilter.FILTER_ACCEPT - : NodeFilter.FILTER_REJECT; - }, - }, - ); - const node = iter.nextNode() as Text | null; - if (node === null) throw new Error('Range contains no text nodes'); - return node; + text += endChunk.toString().substring(0, endIndex); + return text; } -function isTextNode(node: Node): node is Text { - return node.nodeType === Node.TEXT_NODE; +function nextChunk(chunk: Chunk): Chunk { + // TODO. How? }
