This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch chunking in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 18c9eabffd175364385c19902b11f07a630d375a Author: Gerben <[email protected]> AuthorDate: Tue Sep 15 19:55:35 2020 +0200 WIP Create chunk abstraction for text quote matching --- packages/dom/src/text-iterator.ts | 79 ++++++++++++++++++++++ packages/dom/src/text-quote/match.ts | 101 ++++++++++++++-------------- packages/dom/test/text-quote/match-cases.ts | 8 +-- 3 files changed, 134 insertions(+), 54 deletions(-) diff --git a/packages/dom/src/text-iterator.ts b/packages/dom/src/text-iterator.ts new file mode 100644 index 0000000..dfc1384 --- /dev/null +++ b/packages/dom/src/text-iterator.ts @@ -0,0 +1,79 @@ +/** + * @license + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { ownerDocument } from "./owner-document"; + +export interface TextRange extends Range { + // We guarantee that to always have Text nodes as start and end containers. + readonly startContainer: Text; + readonly endContainer: Text; + cloneRange(): TextRange; + + // Allow only Text nodes to be passed to these methods. + insertNode(node: Text): void; + selectNodeContents(node: Text): void; + setEnd(node: Text, offset: number): void; + setStart(node: Text, offset: number): void; + + // Do not allow these methods to be used at all. + selectNode(node: never): void; + setEndAfter(node: never): void; + setEndBefore(node: never): void; + setStartAfter(node: never): void; + setStartBefore(node: never): void; + surroundContents(newParent: never): void; +} + +export interface Chunk { + toString(): string; +} + +// Yields ranges whose start and end nodes are both the *same* Text node. +export async function* chunkRange(scope: Range): AsyncIterable<TextRange> { + const document = ownerDocument(scope); + + const iter = document.createNodeIterator( + scope.commonAncestorContainer, + NodeFilter.SHOW_TEXT, + { + acceptNode(node: Text) { + // Only reveal nodes within the range; and skip any empty text nodes. + return scope.intersectsNode(node) && node.length > 0 + ? NodeFilter.FILTER_ACCEPT + : NodeFilter.FILTER_REJECT; + }, + }, + ); + + let node: Text | null; + while (node = iter.nextNode() as (Text | null)) { + const range = document.createRange() as TextRange; + range.selectNodeContents(node); + + if (node === scope.startContainer) { + range.setStart(node, scope.startOffset); + } + if (node === scope.endContainer) { + range.setEnd(node, scope.endOffset); + } + + yield range; + } +} diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts index a4a216f..6b7fd93 100644 --- a/packages/dom/src/text-quote/match.ts +++ b/packages/dom/src/text-quote/match.ts @@ -18,71 +18,72 @@ * under the License. */ -import type { Matcher, TextQuoteSelector } from '@annotator/selector'; -import seek from 'dom-seek'; +import type { TextQuoteSelector } from '@annotator/selector'; -import { ownerDocument } from '../owner-document'; +import { chunkRange, Chunk, TextRange } from '../text-iterator'; export function createTextQuoteSelectorMatcher( selector: TextQuoteSelector, -): Matcher<Range, Range> { +): (scope: Range) => AsyncGenerator<TextRange, void, void> { + const abstractMatcher = abstractTextQuoteSelectorMatcher(selector); return async function* matchAll(scope) { - const document = ownerDocument(scope); - const scopeText = scope.toString(); + // Turn the scope into a stream of ranges, each wrapping exactly one text node. We wrap it in + // a range such that the first and last text node can be partially included. Could be changed + // to e.g. be an object { node: Text, startOffset, endOffset }. + const textChunks = chunkRange(scope); + for await (const abstractMatch of abstractMatcher(textChunks)) { + const match = document.createRange() as TextRange; + // The `+…startOffset` part is only relevant for the first chunk, whose text node might be partially in scope. + match.setStart(abstractMatch.startChunk.startContainer, + abstractMatch.startIndex + abstractMatch.startChunk.startOffset); + match.setEnd(abstractMatch.endChunk.startContainer, // (note that startContainer equals endContainer) + abstractMatch.endIndex + abstractMatch.endChunk.startOffset); + yield match; + } + } +} + +interface AbstractRange<TChunk> { + startChunk: TChunk; + startIndex: number; + endChunk: TChunk; + endIndex: number; +} + +export function abstractTextQuoteSelectorMatcher( + selector: TextQuoteSelector, +): <TChunk extends Chunk>(textChunks: AsyncIterable<TChunk>) => AsyncGenerator<AbstractRange<TChunk>, void, void> { + return async function* matchAll(textChunks) { const exact = selector.exact; const prefix = selector.prefix || ''; const suffix = selector.suffix || ''; const searchPattern = prefix + exact + suffix; - const iter = document.createNodeIterator( - scope.commonAncestorContainer, - NodeFilter.SHOW_TEXT, - { - acceptNode(node: Text) { - // Only reveal nodes within the range; and skip any empty text nodes. - return scope.intersectsNode(node) && node.length > 0 - ? NodeFilter.FILTER_ACCEPT - : NodeFilter.FILTER_REJECT; - }, - }, - ); + for await (const chunk of textChunks) { + const chunkValue = chunk.toString(); - // The index of the first character of iter.referenceNode inside the text. - let referenceNodeIndex = isTextNode(scope.startContainer) - ? -scope.startOffset - : 0; + // Find the pattern in the chunk (possibly multiple times) + // TODO allow pattern to be spread across chunks + let fromIndex = 0; + while (fromIndex <= chunkValue.length) { + const patternStartIndex = chunkValue.indexOf(searchPattern, fromIndex); + if (patternStartIndex === -1) break; - let fromIndex = 0; - while (fromIndex <= scopeText.length) { - // Find the quote with its prefix and suffix in the string. - const patternStartIndex = scopeText.indexOf(searchPattern, fromIndex); - if (patternStartIndex === -1) return; + // Correct for the prefix and suffix lengths. + const matchStartIndex = patternStartIndex + prefix.length; + const matchEndIndex = matchStartIndex + exact.length; - // Correct for the prefix and suffix lengths. - const matchStartIndex = patternStartIndex + prefix.length; - const matchEndIndex = matchStartIndex + exact.length; + yield { + startChunk: chunk, + startIndex: matchStartIndex, + endChunk: chunk, + endIndex: matchEndIndex, + }; - // Create a range to represent this exact quote in the dom. - const match = document.createRange(); - - // Seek to the start of the match, make the range start there. - referenceNodeIndex += seek(iter, matchStartIndex - referenceNodeIndex); - match.setStart(iter.referenceNode, matchStartIndex - referenceNodeIndex); - - // Seek to the end of the match, make the range end there. - referenceNodeIndex += seek(iter, matchEndIndex - referenceNodeIndex); - match.setEnd(iter.referenceNode, matchEndIndex - referenceNodeIndex); - - // Yield the match. - yield match; - - // Advance the search forward to detect multiple occurrences. - fromIndex = matchStartIndex + 1; + // Advance the search forward to detect multiple occurrences within the same chunk. + fromIndex = matchStartIndex + 1; + } } }; } - -function isTextNode(node: Node): node is Text { - return node.nodeType === Node.TEXT_NODE; -} diff --git a/packages/dom/test/text-quote/match-cases.ts b/packages/dom/test/text-quote/match-cases.ts index 099802c..d4c2acd 100644 --- a/packages/dom/test/text-quote/match-cases.ts +++ b/packages/dom/test/text-quote/match-cases.ts @@ -99,8 +99,8 @@ export const testCases: { { startContainerXPath: '//i/text()', startOffset: 0, - endContainerXPath: '//b/text()[2]', - endOffset: 0, + endContainerXPath: '//i/text()', + endOffset: 11, }, ], }, @@ -115,8 +115,8 @@ export const testCases: { { startContainerXPath: '//title/text()', startOffset: 4, - endContainerXPath: '//b/text()[1]', - endOffset: 0, + endContainerXPath: '//title/text()', + endOffset: 9, }, ], },
