This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch import-dom-seek in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 5dd510521a1108d6d598602154485d5d1c03f023 Author: Gerben <[email protected]> AuthorDate: Fri Nov 20 16:40:19 2020 +0100 Move abstract code into @annotator/selector --- packages/dom/src/chunker.ts | 51 +------ packages/dom/src/text-position/describe.ts | 22 +-- packages/dom/src/text-position/match.ts | 28 +--- packages/dom/src/text-quote/describe.ts | 115 +------------- packages/dom/src/text-quote/match.ts | 148 +----------------- packages/selector/src/index.ts | 1 + packages/selector/src/text/chunker.ts | 69 +++++++++ .../src => selector/src/text}/code-point-seeker.ts | 0 .../src/text/describe-text-position.ts} | 34 +---- packages/selector/src/text/describe-text-quote.ts | 134 ++++++++++++++++ packages/selector/src/text/index.ts | 5 + .../src/text/match-text-position.ts} | 27 +--- packages/selector/src/text/match-text-quote.ts | 168 +++++++++++++++++++++ packages/{dom/src => selector/src/text}/seek.ts | 0 14 files changed, 392 insertions(+), 410 deletions(-) diff --git a/packages/dom/src/chunker.ts b/packages/dom/src/chunker.ts index 8c56924..bc5c2c1 100644 --- a/packages/dom/src/chunker.ts +++ b/packages/dom/src/chunker.ts @@ -18,59 +18,10 @@ * under the License. */ +import type { Chunk, Chunker, ChunkRange } from '@annotator/selector'; import { normalizeRange } from './normalize-range'; import { ownerDocument } from './owner-document'; -// A Chunk represents a fragment (typically a string) of some document. -// Subclasses can add further attributes to map the chunk to its position in the -// data structure it came from (e.g. a DOM node). -export interface Chunk<TData> { - readonly data: TData; - equals?(otherChunk: this): boolean; -} - -export interface ChunkRange<TChunk extends Chunk<any>> { - startChunk: TChunk; - startIndex: number; - endChunk: TChunk; - endIndex: number; -} - -export function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean { - return chunk1.equals ? chunk1.equals(chunk2) : chunk1 === chunk2; -} - -export function chunkRangeEquals( - range1: ChunkRange<any>, - range2: ChunkRange<any>, -): boolean { - return ( - chunkEquals(range1.startChunk, range2.startChunk) && - chunkEquals(range1.endChunk, range2.endChunk) && - range1.startIndex === range2.startIndex && - range1.endIndex === range2.endIndex - ); -} - -// A Chunker lets one walk through the chunks of a document. -// It is inspired by, and similar to, the DOM’s NodeIterator. (but unlike -// NodeIterator, it has no concept of being ‘before’ or ‘after’ a chunk) -export interface Chunker<TChunk extends Chunk<any>> { - // The chunk currently being pointed at. - readonly currentChunk: TChunk; - - // Move currentChunk to the chunk following it, and return that chunk. - // If there are no chunks following it, keep currentChunk unchanged and return null. - nextChunk(): TChunk | null; - - // Move currentChunk to the chunk preceding it, and return that chunk. - // If there are no preceding chunks, keep currentChunk unchanged and return null. - previousChunk(): TChunk | null; - - // Test if a given chunk is before the current chunk. - precedesCurrentChunk(chunk: TChunk): boolean; -} - export interface PartialTextNode extends Chunk<string> { readonly node: Text; readonly startOffset: number; diff --git a/packages/dom/src/text-position/describe.ts b/packages/dom/src/text-position/describe.ts index 5f7f9a3..992a633 100644 --- a/packages/dom/src/text-position/describe.ts +++ b/packages/dom/src/text-position/describe.ts @@ -19,11 +19,9 @@ */ import type { TextPositionSelector } from '@annotator/selector'; -import type { Chunk, Chunker, ChunkRange } from '../chunker'; +import { describeTextPosition as abstractDescribeTextPosition } from '@annotator/selector'; import { TextNodeChunker } from '../chunker'; -import { CodePointSeeker } from '../code-point-seeker'; import { ownerDocument } from '../owner-document'; -import { TextSeeker } from '../seek'; export async function describeTextPosition( range: Range, @@ -48,21 +46,3 @@ export async function describeTextPosition( textChunks, ); } - -async function abstractDescribeTextPosition<TChunk extends Chunk<string>>( - target: ChunkRange<TChunk>, - scope: Chunker<TChunk>, -): Promise<TextPositionSelector> { - const codeUnitSeeker = new TextSeeker(scope); - const codePointSeeker = new CodePointSeeker(codeUnitSeeker); - - codePointSeeker.seekToChunk(target.startChunk, target.startIndex); - const start = codePointSeeker.position; - codePointSeeker.seekToChunk(target.endChunk, target.endIndex); - const end = codePointSeeker.position; - return { - type: 'TextPositionSelector', - start, - end, - }; -} diff --git a/packages/dom/src/text-position/match.ts b/packages/dom/src/text-position/match.ts index becd957..3dccede 100644 --- a/packages/dom/src/text-position/match.ts +++ b/packages/dom/src/text-position/match.ts @@ -19,10 +19,8 @@ */ import type { Matcher, TextPositionSelector } from '@annotator/selector'; -import type { Chunk, ChunkRange, Chunker } from '../chunker'; +import { textPositionSelectorMatcher as abstractTextPositionSelectorMatcher } from '@annotator/selector'; import { TextNodeChunker } from '../chunker'; -import { CodePointSeeker } from '../code-point-seeker'; -import { TextSeeker } from '../seek'; export function createTextPositionSelectorMatcher( selector: TextPositionSelector, @@ -39,27 +37,3 @@ export function createTextPositionSelectorMatcher( } }; } - -export function abstractTextPositionSelectorMatcher( - selector: TextPositionSelector, -): <TChunk extends Chunk<any>>( - scope: Chunker<TChunk>, -) => AsyncGenerator<ChunkRange<TChunk>, void, void> { - const { start, end } = selector; - - return async function* matchAll<TChunk extends Chunk<string>>( - textChunks: Chunker<TChunk>, - ) { - const codeUnitSeeker = new TextSeeker(textChunks); - const codePointSeeker = new CodePointSeeker(codeUnitSeeker); - - codePointSeeker.seekTo(start); - const startChunk = codeUnitSeeker.currentChunk; - const startIndex = codeUnitSeeker.offsetInChunk; - codePointSeeker.seekTo(end); - const endChunk = codeUnitSeeker.currentChunk; - const endIndex = codeUnitSeeker.offsetInChunk; - - yield { startChunk, startIndex, endChunk, endIndex }; - }; -} diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts index 3dfa45e..3ca3f0b 100644 --- a/packages/dom/src/text-quote/describe.ts +++ b/packages/dom/src/text-quote/describe.ts @@ -19,12 +19,9 @@ */ import type { TextQuoteSelector } from '@annotator/selector'; -import type { Chunk, Chunker, ChunkRange } from '../chunker'; -import { TextNodeChunker, chunkRangeEquals } from '../chunker'; +import { describeTextQuote as abstractDescribeTextQuote } from '@annotator/selector'; +import { TextNodeChunker } from '../chunker'; import { ownerDocument } from '../owner-document'; -import type { Seeker } from '../seek'; -import { TextSeeker } from '../seek'; -import { abstractTextQuoteSelectorMatcher } from '.'; export async function describeTextQuote( range: Range, @@ -47,111 +44,3 @@ export async function describeTextQuote( () => new TextNodeChunker(scope), ); } - -async function abstractDescribeTextQuote<TChunk extends Chunk<string>>( - target: ChunkRange<TChunk>, - scope: () => Chunker<TChunk>, -): Promise<TextQuoteSelector> { - const seeker = new TextSeeker(scope()); - - // Read the target’s exact text. - seeker.seekToChunk(target.startChunk, target.startIndex); - const exact = seeker.readToChunk(target.endChunk, target.endIndex); - - // Starting with an empty prefix and suffix, we search for matches. At each unintended match - // we encounter, we extend the prefix or suffix just enough to ensure it will no longer match. - let prefix = ''; - let suffix = ''; - - while (true) { - const tentativeSelector: TextQuoteSelector = { - type: 'TextQuoteSelector', - exact, - prefix, - suffix, - }; - - const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)( - scope(), - ); - let nextMatch = await matches.next(); - - // If this match is the intended one, no need to act. - // XXX This test is fragile: nextMatch and target are assumed to be normalised. - if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) { - nextMatch = await matches.next(); - } - - // If there are no more unintended matches, our selector is unambiguous! - if (nextMatch.done) return tentativeSelector; - - // Possible optimisation: A subsequent search could safely skip the part we - // already processed, instead of starting from the beginning again. But we’d - // need the matcher to start at the seeker’s position, instead of searching - // in the whole current chunk. Then we could just seek back to just after - // the start of the prefix: seeker.seekBy(-prefix.length + 1); (don’t forget - // to also correct for any changes in the prefix we will make below) - - // We’ll have to add more prefix/suffix to disqualify this unintended match. - const unintendedMatch = nextMatch.value; - const seeker1 = new TextSeeker(scope()); - const seeker2 = new TextSeeker(scope()); - - // Count how many characters we’d need as a prefix to disqualify this match. - seeker1.seekToChunk(target.startChunk, target.startIndex - prefix.length); - seeker2.seekToChunk( - unintendedMatch.startChunk, - unintendedMatch.startIndex - prefix.length, - ); - const extraPrefix = readUntilDifferent(seeker1, seeker2, true); - - // Count how many characters we’d need as a suffix to disqualify this match. - seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length); - seeker2.seekToChunk( - unintendedMatch.endChunk, - unintendedMatch.endIndex + suffix.length, - ); - const extraSuffix = readUntilDifferent(seeker1, seeker2, false); - - // Use either the prefix or suffix, whichever is shortest. - if ( - extraPrefix !== undefined && - (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length) - ) { - prefix = extraPrefix + prefix; - } else if (extraSuffix !== undefined) { - suffix = suffix + extraSuffix; - } else { - throw new Error( - 'Target cannot be disambiguated; how could that have happened‽', - ); - } - } -} - -function readUntilDifferent( - seeker1: Seeker, - seeker2: Seeker, - reverse: boolean, -): string | undefined { - let result = ''; - while (true) { - let nextCharacter: string; - try { - nextCharacter = seeker1.read(reverse ? -1 : 1); - } catch (err) { - return undefined; // Start/end of text reached: cannot expand result. - } - result = reverse ? nextCharacter + result : result + nextCharacter; - - // Check if the newly added character makes the result differ from the second seeker. - let comparisonCharacter: string | undefined; - try { - comparisonCharacter = seeker2.read(reverse ? -1 : 1); - } catch (err) { - // A RangeError would merely mean seeker2 is exhausted. - if (!(err instanceof RangeError)) throw err; - } - if (nextCharacter !== comparisonCharacter) return result; - } -} diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts index 9e09990..0c16b8a 100644 --- a/packages/dom/src/text-quote/match.ts +++ b/packages/dom/src/text-quote/match.ts @@ -19,7 +19,7 @@ */ import type { Matcher, TextQuoteSelector } from '@annotator/selector'; -import type { Chunk, Chunker, ChunkRange } from '../chunker'; +import { textQuoteSelectorMatcher as abstractTextQuoteSelectorMatcher } from '@annotator/selector'; import { TextNodeChunker, EmptyScopeError } from '../chunker'; export function createTextQuoteSelectorMatcher( @@ -42,149 +42,3 @@ export function createTextQuoteSelectorMatcher( } }; } - -export function abstractTextQuoteSelectorMatcher( - selector: TextQuoteSelector, -): <TChunk extends Chunk<any>>( - scope: Chunker<TChunk>, -) => AsyncGenerator<ChunkRange<TChunk>, void, void> { - return async function* matchAll<TChunk extends Chunk<string>>( - textChunks: Chunker<TChunk>, - ) { - const exact = selector.exact; - const prefix = selector.prefix || ''; - const suffix = selector.suffix || ''; - const searchPattern = prefix + exact + suffix; - - // The code below runs a loop with three steps: - // 1. Continue checking any partial matches from the previous chunk(s). - // 2. Try find the whole pattern in the chunk (possibly multiple times). - // 3. Check if this chunk ends with a partial match (or even multiple partial matches). - - interface PartialMatch { - startChunk?: TChunk; - startIndex?: number; - endChunk?: TChunk; - endIndex?: number; - charactersMatched: number; - } - let partialMatches: PartialMatch[] = []; - - let isFirstChunk = true; - do { - const chunk = textChunks.currentChunk; - const chunkValue = chunk.data; - - // 1. Continue checking any partial matches from the previous chunk(s). - const remainingPartialMatches: typeof partialMatches = []; - for (const partialMatch of partialMatches) { - const charactersMatched = partialMatch.charactersMatched; - - // If the current chunk contains the start and/or end of the match, record these. - if (partialMatch.endChunk === undefined) { - const charactersUntilMatchEnd = - prefix.length + exact.length - charactersMatched; - if (charactersUntilMatchEnd <= chunkValue.length) { - partialMatch.endChunk = chunk; - partialMatch.endIndex = charactersUntilMatchEnd; - } - } - if (partialMatch.startChunk === undefined) { - const charactersUntilMatchStart = prefix.length - charactersMatched; - if ( - charactersUntilMatchStart < chunkValue.length || - partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk. - ) { - partialMatch.startChunk = chunk; - partialMatch.startIndex = charactersUntilMatchStart; - } - } - - const charactersUntilSuffixEnd = - searchPattern.length - charactersMatched; - if (charactersUntilSuffixEnd <= chunkValue.length) { - if ( - chunkValue.startsWith(searchPattern.substring(charactersMatched)) - ) { - yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now. - } - } else if ( - chunkValue === - searchPattern.substring( - charactersMatched, - charactersMatched + chunkValue.length, - ) - ) { - // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks. - partialMatch.charactersMatched += chunkValue.length; - remainingPartialMatches.push(partialMatch); - } - } - partialMatches = remainingPartialMatches; - - // 2. Try find the whole pattern in the chunk (possibly multiple times). - if (searchPattern.length <= chunkValue.length) { - let fromIndex = 0; - while (fromIndex <= chunkValue.length) { - const patternStartIndex = chunkValue.indexOf( - searchPattern, - fromIndex, - ); - if (patternStartIndex === -1) break; - fromIndex = patternStartIndex + 1; - - // Handle edge case: an empty searchPattern would already have been yielded at the end of the last chunk. - if ( - patternStartIndex === 0 && - searchPattern.length === 0 && - !isFirstChunk - ) - continue; - - yield { - startChunk: chunk, - startIndex: patternStartIndex + prefix.length, - endChunk: chunk, - endIndex: patternStartIndex + prefix.length + exact.length, - }; - } - } - - // 3. Check if this chunk ends with a partial match (or even multiple partial matches). - let newPartialMatches: number[] = []; - const searchStartPoint = Math.max( - chunkValue.length - searchPattern.length + 1, - 0, - ); - for (let i = searchStartPoint; i < chunkValue.length; i++) { - const character = chunkValue[i]; - newPartialMatches = newPartialMatches.filter( - (partialMatchStartIndex) => - character === searchPattern[i - partialMatchStartIndex], - ); - if (character === searchPattern[0]) newPartialMatches.push(i); - } - for (const partialMatchStartIndex of newPartialMatches) { - const charactersMatched = chunkValue.length - partialMatchStartIndex; - const partialMatch: PartialMatch = { - charactersMatched, - }; - if (charactersMatched >= prefix.length + exact.length) { - partialMatch.endChunk = chunk; - partialMatch.endIndex = - partialMatchStartIndex + prefix.length + exact.length; - } - if ( - charactersMatched > prefix.length || - partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk. - ) { - partialMatch.startChunk = chunk; - partialMatch.startIndex = partialMatchStartIndex + prefix.length; - } - partialMatches.push(partialMatch); - } - - isFirstChunk = false; - } while (textChunks.nextChunk() !== null); - }; -} diff --git a/packages/selector/src/index.ts b/packages/selector/src/index.ts index 73caa05..adefd04 100644 --- a/packages/selector/src/index.ts +++ b/packages/selector/src/index.ts @@ -27,6 +27,7 @@ export type { TextPositionSelector, TextQuoteSelector, } from './types'; +export * from './text'; export function makeRefinable< // Any subtype of Selector can be made refinable; but note we limit the value diff --git a/packages/selector/src/text/chunker.ts b/packages/selector/src/text/chunker.ts new file mode 100644 index 0000000..eb0d970 --- /dev/null +++ b/packages/selector/src/text/chunker.ts @@ -0,0 +1,69 @@ +/** + * @license + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// A Chunk represents a fragment (typically a string) of some document. +// Subclasses can add further attributes to map the chunk to its position in the +// data structure it came from (e.g. a DOM node). +export interface Chunk<TData> { + readonly data: TData; + equals?(otherChunk: this): boolean; +} + +export function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean { + return chunk1.equals ? chunk1.equals(chunk2) : chunk1 === chunk2; +} + +export interface ChunkRange<TChunk extends Chunk<any>> { + startChunk: TChunk; + startIndex: number; + endChunk: TChunk; + endIndex: number; +} + +export function chunkRangeEquals( + range1: ChunkRange<any>, + range2: ChunkRange<any>, +): boolean { + return ( + chunkEquals(range1.startChunk, range2.startChunk) && + chunkEquals(range1.endChunk, range2.endChunk) && + range1.startIndex === range2.startIndex && + range1.endIndex === range2.endIndex + ); +} + +// A Chunker lets one walk through the chunks of a document. +// It is inspired by, and similar to, the DOM’s NodeIterator. (but unlike +// NodeIterator, it has no concept of being ‘before’ or ‘after’ a chunk) +export interface Chunker<TChunk extends Chunk<any>> { + // The chunk currently being pointed at. + readonly currentChunk: TChunk; + + // Move currentChunk to the chunk following it, and return that chunk. + // If there are no chunks following it, keep currentChunk unchanged and return null. + nextChunk(): TChunk | null; + + // Move currentChunk to the chunk preceding it, and return that chunk. + // If there are no preceding chunks, keep currentChunk unchanged and return null. + previousChunk(): TChunk | null; + + // Test if a given chunk is before the current chunk. + precedesCurrentChunk(chunk: TChunk): boolean; +} diff --git a/packages/dom/src/code-point-seeker.ts b/packages/selector/src/text/code-point-seeker.ts similarity index 100% rename from packages/dom/src/code-point-seeker.ts rename to packages/selector/src/text/code-point-seeker.ts diff --git a/packages/dom/src/text-position/describe.ts b/packages/selector/src/text/describe-text-position.ts similarity index 58% copy from packages/dom/src/text-position/describe.ts copy to packages/selector/src/text/describe-text-position.ts index 5f7f9a3..99485b0 100644 --- a/packages/dom/src/text-position/describe.ts +++ b/packages/selector/src/text/describe-text-position.ts @@ -19,37 +19,11 @@ */ import type { TextPositionSelector } from '@annotator/selector'; -import type { Chunk, Chunker, ChunkRange } from '../chunker'; -import { TextNodeChunker } from '../chunker'; -import { CodePointSeeker } from '../code-point-seeker'; -import { ownerDocument } from '../owner-document'; -import { TextSeeker } from '../seek'; +import type { Chunk, Chunker, ChunkRange } from './chunker'; +import { CodePointSeeker } from './code-point-seeker'; +import { TextSeeker } from './seek'; -export async function describeTextPosition( - range: Range, - maybeScope?: Range, -): Promise<TextPositionSelector> { - // Default to search in the whole document. - let scope: Range; - if (maybeScope !== undefined) { - scope = maybeScope; - } else { - const document = ownerDocument(range); - scope = document.createRange(); - scope.selectNodeContents(document); - } - - const textChunks = new TextNodeChunker(scope); - if (textChunks.currentChunk === null) - throw new RangeError('Range does not contain any Text nodes.'); - - return await abstractDescribeTextPosition( - textChunks.rangeToChunkRange(range), - textChunks, - ); -} - -async function abstractDescribeTextPosition<TChunk extends Chunk<string>>( +export async function describeTextPosition<TChunk extends Chunk<string>>( target: ChunkRange<TChunk>, scope: Chunker<TChunk>, ): Promise<TextPositionSelector> { diff --git a/packages/selector/src/text/describe-text-quote.ts b/packages/selector/src/text/describe-text-quote.ts new file mode 100644 index 0000000..c97a4f8 --- /dev/null +++ b/packages/selector/src/text/describe-text-quote.ts @@ -0,0 +1,134 @@ +/** + * @license + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import type { TextQuoteSelector } from '@annotator/selector'; +import type { Chunk, Chunker, ChunkRange } from './chunker'; +import { chunkRangeEquals } from './chunker'; +import type { Seeker } from './seek'; +import { TextSeeker } from './seek'; +import { textQuoteSelectorMatcher } from '.'; + +export async function describeTextQuote<TChunk extends Chunk<string>>( + target: ChunkRange<TChunk>, + scope: () => Chunker<TChunk>, + ): Promise<TextQuoteSelector> { + const seeker = new TextSeeker(scope()); + + // Read the target’s exact text. + seeker.seekToChunk(target.startChunk, target.startIndex); + const exact = seeker.readToChunk(target.endChunk, target.endIndex); + + // Starting with an empty prefix and suffix, we search for matches. At each unintended match + // we encounter, we extend the prefix or suffix just enough to ensure it will no longer match. + let prefix = ''; + let suffix = ''; + + while (true) { + const tentativeSelector: TextQuoteSelector = { + type: 'TextQuoteSelector', + exact, + prefix, + suffix, + }; + + const matches = textQuoteSelectorMatcher(tentativeSelector)( + scope(), + ); + let nextMatch = await matches.next(); + + // If this match is the intended one, no need to act. + // XXX This test is fragile: nextMatch and target are assumed to be normalised. + if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) { + nextMatch = await matches.next(); + } + + // If there are no more unintended matches, our selector is unambiguous! + if (nextMatch.done) return tentativeSelector; + + // Possible optimisation: A subsequent search could safely skip the part we + // already processed, instead of starting from the beginning again. But we’d + // need the matcher to start at the seeker’s position, instead of searching + // in the whole current chunk. Then we could just seek back to just after + // the start of the prefix: seeker.seekBy(-prefix.length + 1); (don’t forget + // to also correct for any changes in the prefix we will make below) + + // We’ll have to add more prefix/suffix to disqualify this unintended match. + const unintendedMatch = nextMatch.value; + const seeker1 = new TextSeeker(scope()); + const seeker2 = new TextSeeker(scope()); + + // Count how many characters we’d need as a prefix to disqualify this match. + seeker1.seekToChunk(target.startChunk, target.startIndex - prefix.length); + seeker2.seekToChunk( + unintendedMatch.startChunk, + unintendedMatch.startIndex - prefix.length, + ); + const extraPrefix = readUntilDifferent(seeker1, seeker2, true); + + // Count how many characters we’d need as a suffix to disqualify this match. + seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length); + seeker2.seekToChunk( + unintendedMatch.endChunk, + unintendedMatch.endIndex + suffix.length, + ); + const extraSuffix = readUntilDifferent(seeker1, seeker2, false); + + // Use either the prefix or suffix, whichever is shortest. + if ( + extraPrefix !== undefined && + (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length) + ) { + prefix = extraPrefix + prefix; + } else if (extraSuffix !== undefined) { + suffix = suffix + extraSuffix; + } else { + throw new Error( + 'Target cannot be disambiguated; how could that have happened‽', + ); + } + } + } + + function readUntilDifferent( + seeker1: Seeker, + seeker2: Seeker, + reverse: boolean, + ): string | undefined { + let result = ''; + while (true) { + let nextCharacter: string; + try { + nextCharacter = seeker1.read(reverse ? -1 : 1); + } catch (err) { + return undefined; // Start/end of text reached: cannot expand result. + } + result = reverse ? nextCharacter + result : result + nextCharacter; + + // Check if the newly added character makes the result differ from the second seeker. + let comparisonCharacter: string | undefined; + try { + comparisonCharacter = seeker2.read(reverse ? -1 : 1); + } catch (err) { + // A RangeError would merely mean seeker2 is exhausted. + if (!(err instanceof RangeError)) throw err; + } + if (nextCharacter !== comparisonCharacter) return result; + } + } diff --git a/packages/selector/src/text/index.ts b/packages/selector/src/text/index.ts new file mode 100644 index 0000000..ca88b66 --- /dev/null +++ b/packages/selector/src/text/index.ts @@ -0,0 +1,5 @@ +export * from './describe-text-quote'; +export * from './match-text-quote'; +export * from './describe-text-position'; +export * from './match-text-position'; +export * from './chunker'; diff --git a/packages/dom/src/text-position/match.ts b/packages/selector/src/text/match-text-position.ts similarity index 66% copy from packages/dom/src/text-position/match.ts copy to packages/selector/src/text/match-text-position.ts index becd957..c00f619 100644 --- a/packages/dom/src/text-position/match.ts +++ b/packages/selector/src/text/match-text-position.ts @@ -18,29 +18,12 @@ * under the License. */ -import type { Matcher, TextPositionSelector } from '@annotator/selector'; -import type { Chunk, ChunkRange, Chunker } from '../chunker'; -import { TextNodeChunker } from '../chunker'; -import { CodePointSeeker } from '../code-point-seeker'; -import { TextSeeker } from '../seek'; +import type { TextPositionSelector } from '@annotator/selector'; +import type { Chunk, ChunkRange, Chunker } from './chunker'; +import { CodePointSeeker } from './code-point-seeker'; +import { TextSeeker } from './seek'; -export function createTextPositionSelectorMatcher( - selector: TextPositionSelector, -): Matcher<Range, Range> { - const abstractMatcher = abstractTextPositionSelectorMatcher(selector); - - return async function* matchAll(scope) { - const textChunks = new TextNodeChunker(scope); - - const matches = abstractMatcher(textChunks); - - for await (const abstractMatch of matches) { - yield textChunks.chunkRangeToRange(abstractMatch); - } - }; -} - -export function abstractTextPositionSelectorMatcher( +export function textPositionSelectorMatcher( selector: TextPositionSelector, ): <TChunk extends Chunk<any>>( scope: Chunker<TChunk>, diff --git a/packages/selector/src/text/match-text-quote.ts b/packages/selector/src/text/match-text-quote.ts new file mode 100644 index 0000000..8e90a2f --- /dev/null +++ b/packages/selector/src/text/match-text-quote.ts @@ -0,0 +1,168 @@ +/** + * @license + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import type { TextQuoteSelector } from '@annotator/selector'; +import type { Chunk, Chunker, ChunkRange } from './chunker'; + +export function textQuoteSelectorMatcher( + selector: TextQuoteSelector, + ): <TChunk extends Chunk<any>>( + scope: Chunker<TChunk>, + ) => AsyncGenerator<ChunkRange<TChunk>, void, void> { + return async function* matchAll<TChunk extends Chunk<string>>( + textChunks: Chunker<TChunk>, + ) { + const exact = selector.exact; + const prefix = selector.prefix || ''; + const suffix = selector.suffix || ''; + const searchPattern = prefix + exact + suffix; + + // The code below runs a loop with three steps: + // 1. Continue checking any partial matches from the previous chunk(s). + // 2. Try find the whole pattern in the chunk (possibly multiple times). + // 3. Check if this chunk ends with a partial match (or even multiple partial matches). + + interface PartialMatch { + startChunk?: TChunk; + startIndex?: number; + endChunk?: TChunk; + endIndex?: number; + charactersMatched: number; + } + let partialMatches: PartialMatch[] = []; + + let isFirstChunk = true; + do { + const chunk = textChunks.currentChunk; + const chunkValue = chunk.data; + + // 1. Continue checking any partial matches from the previous chunk(s). + const remainingPartialMatches: typeof partialMatches = []; + for (const partialMatch of partialMatches) { + const charactersMatched = partialMatch.charactersMatched; + + // If the current chunk contains the start and/or end of the match, record these. + if (partialMatch.endChunk === undefined) { + const charactersUntilMatchEnd = + prefix.length + exact.length - charactersMatched; + if (charactersUntilMatchEnd <= chunkValue.length) { + partialMatch.endChunk = chunk; + partialMatch.endIndex = charactersUntilMatchEnd; + } + } + if (partialMatch.startChunk === undefined) { + const charactersUntilMatchStart = prefix.length - charactersMatched; + if ( + charactersUntilMatchStart < chunkValue.length || + partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk. + ) { + partialMatch.startChunk = chunk; + partialMatch.startIndex = charactersUntilMatchStart; + } + } + + const charactersUntilSuffixEnd = + searchPattern.length - charactersMatched; + if (charactersUntilSuffixEnd <= chunkValue.length) { + if ( + chunkValue.startsWith(searchPattern.substring(charactersMatched)) + ) { + yield partialMatch as ChunkRange<TChunk>; // all fields are certainly defined now. + } + } else if ( + chunkValue === + searchPattern.substring( + charactersMatched, + charactersMatched + chunkValue.length, + ) + ) { + // The chunk is too short to complete the match; comparison has to be completed in subsequent chunks. + partialMatch.charactersMatched += chunkValue.length; + remainingPartialMatches.push(partialMatch); + } + } + partialMatches = remainingPartialMatches; + + // 2. Try find the whole pattern in the chunk (possibly multiple times). + if (searchPattern.length <= chunkValue.length) { + let fromIndex = 0; + while (fromIndex <= chunkValue.length) { + const patternStartIndex = chunkValue.indexOf( + searchPattern, + fromIndex, + ); + if (patternStartIndex === -1) break; + fromIndex = patternStartIndex + 1; + + // Handle edge case: an empty searchPattern would already have been yielded at the end of the last chunk. + if ( + patternStartIndex === 0 && + searchPattern.length === 0 && + !isFirstChunk + ) + continue; + + yield { + startChunk: chunk, + startIndex: patternStartIndex + prefix.length, + endChunk: chunk, + endIndex: patternStartIndex + prefix.length + exact.length, + }; + } + } + + // 3. Check if this chunk ends with a partial match (or even multiple partial matches). + let newPartialMatches: number[] = []; + const searchStartPoint = Math.max( + chunkValue.length - searchPattern.length + 1, + 0, + ); + for (let i = searchStartPoint; i < chunkValue.length; i++) { + const character = chunkValue[i]; + newPartialMatches = newPartialMatches.filter( + (partialMatchStartIndex) => + character === searchPattern[i - partialMatchStartIndex], + ); + if (character === searchPattern[0]) newPartialMatches.push(i); + } + for (const partialMatchStartIndex of newPartialMatches) { + const charactersMatched = chunkValue.length - partialMatchStartIndex; + const partialMatch: PartialMatch = { + charactersMatched, + }; + if (charactersMatched >= prefix.length + exact.length) { + partialMatch.endChunk = chunk; + partialMatch.endIndex = + partialMatchStartIndex + prefix.length + exact.length; + } + if ( + charactersMatched > prefix.length || + partialMatch.endChunk !== undefined // handles an edge case: an empty quote at the end of a chunk. + ) { + partialMatch.startChunk = chunk; + partialMatch.startIndex = partialMatchStartIndex + prefix.length; + } + partialMatches.push(partialMatch); + } + + isFirstChunk = false; + } while (textChunks.nextChunk() !== null); + }; + } diff --git a/packages/dom/src/seek.ts b/packages/selector/src/text/seek.ts similarity index 100% rename from packages/dom/src/seek.ts rename to packages/selector/src/text/seek.ts
