This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch more-context in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit d967698f6c46515a7996bdfd9902511849a932f1 Author: Gerben <[email protected]> AuthorDate: Thu Dec 24 17:09:22 2020 +0100 Generate less minimal prefixes&suffixes 1. Round them up to the next whitespace. 2. Optionally add prefix&suffix around a short quote even if it is not ambiguous. The previous behaviour can still be obtained using the option `minimalContext`; especially useful if robustness against document variations is not required. Also refactor a bit, reusing the seekers instead of creating new ones on every match. --- packages/dom/src/text-quote/describe.ts | 4 +- packages/selector/src/text/describe-text-quote.ts | 182 ++++++++++++++++++---- packages/selector/src/text/seeker.ts | 24 +-- web/demo/index.js | 2 +- 4 files changed, 167 insertions(+), 45 deletions(-) diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts index 45b8adc..ecbbd36 100644 --- a/packages/dom/src/text-quote/describe.ts +++ b/packages/dom/src/text-quote/describe.ts @@ -18,7 +18,7 @@ * under the License. */ -import type { TextQuoteSelector } from '@annotator/selector'; +import type { TextQuoteSelector, DescribeTextQuoteOptions } from '@annotator/selector'; import { describeTextQuote as abstractDescribeTextQuote } from '@annotator/selector'; import { TextNodeChunker } from '../text-node-chunker'; import { ownerDocument } from '../owner-document'; @@ -26,6 +26,7 @@ import { ownerDocument } from '../owner-document'; export async function describeTextQuote( range: Range, maybeScope?: Range, + options: DescribeTextQuoteOptions = {}, ): Promise<TextQuoteSelector> { // Default to search in the whole document. 
let scope: Range; @@ -42,5 +43,6 @@ export async function describeTextQuote( return await abstractDescribeTextQuote( chunker.rangeToChunkRange(range), () => new TextNodeChunker(scope), + options, ); } diff --git a/packages/selector/src/text/describe-text-quote.ts b/packages/selector/src/text/describe-text-quote.ts index 9b6526a..b3c4d48 100644 --- a/packages/selector/src/text/describe-text-quote.ts +++ b/packages/selector/src/text/describe-text-quote.ts @@ -25,21 +25,105 @@ import type { RelativeSeeker } from './seeker'; import { TextSeeker } from './seeker'; import { textQuoteSelectorMatcher } from '.'; +export interface DescribeTextQuoteOptions { + /** + * Keep prefix and suffix to the minimum that is necessary to disambiguate + * the quote. Use only if robustness against text variations is not required. + */ + minimalContext?: boolean; + + /** + * Add prefix and suffix to quotes below this length, such that the total of + * prefix + exact + suffix is at least this length. + */ + minimumQuoteLength?: number + + /** + * When attempting to find a whitespace to make the prefix/suffix start/end + * (resp.) at a word boundary, give up after this number of characters. + */ + maxWordLength?: number; +} + +/** + * Returns a {@link TextQuoteSelector} that points at the target quote in the + * given text. + * + * @remarks + * The selector will contain the *exact* target quote, and in case this quote + * appears multiple times in the text, sufficient context around the quote will + * be included in the selector’s *prefix* and *suffix* attributes to + * disambiguate. By default, more prefix and suffix are included than strictly + * required; both in order to be robust against slight modifications, and in an + * attempt to not end halfway a word (mainly for the sake of human readability). 
+ * + * @param target - The range of characters that the selector should describe + * @param scope - The text containing the target range; or, more accurately, a + * function creating {@link Chunker}s that allow walking through the text. + * @param options + * @returns the {@link TextQuoteSelector} that describes *target*. + */ export async function describeTextQuote<TChunk extends Chunk<string>>( target: ChunkRange<TChunk>, scope: () => Chunker<TChunk>, + { + minimalContext = false, + minimumQuoteLength = 0, + maxWordLength = 50, + }: DescribeTextQuoteOptions = {}, ): Promise<TextQuoteSelector> { - const seeker = new TextSeeker(scope()); + // Create a seeker to read the target quote and the context around it. + // TODO Possible optimisation: as it need not be an AbsoluteSeeker, a + // different implementation could provide direct ‘jump’ access in seekToChunk + // (the scope’s Chunker would of course also have to support this). + const seekerAtTarget = new TextSeeker(scope()); + + // Create a second seeker so that we will be able to simultaneously read + // characters near both the target and an unintended match, if we find any. + const seekerAtUnintendedMatch = new TextSeeker(scope()); // Read the target’s exact text. - seeker.seekToChunk(target.startChunk, target.startIndex); - const exact = seeker.readToChunk(target.endChunk, target.endIndex); + seekerAtTarget.seekToChunk(target.startChunk, target.startIndex); + const exact = seekerAtTarget.readToChunk(target.endChunk, target.endIndex); - // Starting with an empty prefix and suffix, we search for matches. At each unintended match - // we encounter, we extend the prefix or suffix just enough to ensure it will no longer match. + // Start with an empty prefix and suffix. let prefix = ''; let suffix = ''; + // If the quote is below the given minimum length, add some prefix & suffix. 
+ const currentQuoteLength = () => prefix.length + exact.length + suffix.length; + if (currentQuoteLength() < minimumQuoteLength) { + // Expand the prefix, but only to reach halfway towards the desired length. + seekerAtTarget.seekToChunk(target.startChunk, target.startIndex - prefix.length); + const length = Math.floor((minimumQuoteLength - currentQuoteLength()) / 2); + prefix = seekerAtTarget.read(-length, false, true) + prefix; + + // If needed, expand the suffix to achieve the minimum length. + if (currentQuoteLength() < minimumQuoteLength) { + seekerAtTarget.seekToChunk(target.endChunk, target.endIndex + suffix.length); + const length = minimumQuoteLength - currentQuoteLength(); + suffix = suffix + seekerAtTarget.read(length, false, true); + + // We might have to expand the prefix again (if at the end of the scope). + if (currentQuoteLength() < minimumQuoteLength) { + seekerAtTarget.seekToChunk(target.startChunk, target.startIndex - prefix.length); + const length = minimumQuoteLength - currentQuoteLength(); + prefix = seekerAtTarget.read(-length, false, true) + prefix; + } + } + } + + // Expand prefix & suffix to avoid them ending somewhere halfway in a word. + if (!minimalContext) { + seekerAtTarget.seekToChunk(target.startChunk, target.startIndex - prefix.length); + prefix = readUntilWhitespace(seekerAtTarget, maxWordLength, true) + prefix; + seekerAtTarget.seekToChunk(target.endChunk, target.endIndex + suffix.length); + suffix = suffix + readUntilWhitespace(seekerAtTarget, maxWordLength, false); + } + + // Search for matches of the quote using the current prefix and suffix. At + // each unintended match we encounter, we extend the prefix or suffix to + // ensure it will no longer match. 
while (true) { const tentativeSelector: TextQuoteSelector = { type: 'TextQuoteSelector', @@ -48,9 +132,7 @@ export async function describeTextQuote<TChunk extends Chunk<string>>( suffix, }; - const matches = textQuoteSelectorMatcher(tentativeSelector)( - scope(), - ); + const matches = textQuoteSelectorMatcher(tentativeSelector)(scope()); let nextMatch = await matches.next(); // If this match is the intended one, no need to act. @@ -72,42 +154,44 @@ export async function describeTextQuote<TChunk extends Chunk<string>>( // We’ll have to add more prefix/suffix to disqualify this unintended match. const unintendedMatch = nextMatch.value; - // Create two seekers to simultaneously read characters near both the target - // and the unintended match. - // Possible optimisation: as these need not be AbsoluteSeekers, a different - // implementation could provide direct ‘jump’ access in seekToChunk (the - // scope’s Chunker would of course also have to support this). - const seeker1 = new TextSeeker(scope()); - const seeker2 = new TextSeeker(scope()); - // Count how many characters we’d need as a prefix to disqualify this match. - seeker1.seekToChunk(target.startChunk, target.startIndex - prefix.length); - seeker2.seekToChunk( + seekerAtTarget.seekToChunk(target.startChunk, target.startIndex - prefix.length); + seekerAtUnintendedMatch.seekToChunk( unintendedMatch.startChunk, unintendedMatch.startIndex - prefix.length, ); - const extraPrefix = readUntilDifferent(seeker1, seeker2, true); + let extraPrefix = readUntilDifferent(seekerAtTarget, seekerAtUnintendedMatch, true); + if (extraPrefix !== undefined && !minimalContext) + extraPrefix = readUntilWhitespace(seekerAtTarget, maxWordLength, true) + extraPrefix; // Count how many characters we’d need as a suffix to disqualify this match. 
- seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length); - seeker2.seekToChunk( + seekerAtTarget.seekToChunk(target.endChunk, target.endIndex + suffix.length); + seekerAtUnintendedMatch.seekToChunk( unintendedMatch.endChunk, unintendedMatch.endIndex + suffix.length, ); - const extraSuffix = readUntilDifferent(seeker1, seeker2, false); - - // Use either the prefix or suffix, whichever is shortest. - if ( - extraPrefix !== undefined && - (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length) - ) { - prefix = extraPrefix + prefix; - } else if (extraSuffix !== undefined) { - suffix = suffix + extraSuffix; + let extraSuffix = readUntilDifferent(seekerAtTarget, seekerAtUnintendedMatch, false); + if (extraSuffix !== undefined && !minimalContext) + extraSuffix = extraSuffix + readUntilWhitespace(seekerAtTarget, maxWordLength, false); + + if (minimalContext) { + // Use either the prefix or suffix, whichever is shortest. + if ( + extraPrefix !== undefined && + (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length) + ) { + prefix = extraPrefix + prefix; + } else if (extraSuffix !== undefined) { + suffix = suffix + extraSuffix; + } else { + throw new Error( + 'Target cannot be disambiguated; how could that have happened‽', + ); + } } else { - throw new Error( - 'Target cannot be disambiguated; how could that have happened‽', - ); + // For redundancy, expand both prefix and suffix. + if (extraPrefix !== undefined) prefix = extraPrefix + prefix; + if (extraSuffix !== undefined) suffix = suffix + extraSuffix; } } } @@ -138,3 +222,33 @@ function readUntilDifferent( if (nextCharacter !== comparisonCharacter) return result; } } + +function readUntilWhitespace( + seeker: RelativeSeeker, + limit: number = Infinity, + reverse = false +): string { + let result = ''; + while (result.length < limit) { + let nextCharacter: string; + try { + nextCharacter = seeker.read(reverse ? 
-1 : 1); + } catch (err) { + if (!(err instanceof RangeError)) throw err; + break; // End/start of text reached. + } + + // Stop if we reached whitespace. + if (isWhitespace(nextCharacter)) { + seeker.seekBy(reverse ? 1 : -1); // ‘undo’ the last read. + break; + } + + result = reverse ? nextCharacter + result : result + nextCharacter; + } + return result; +} + +function isWhitespace(s: string): boolean { + return s.match(/^\s+$/) !== null; +} diff --git a/packages/selector/src/text/seeker.ts b/packages/selector/src/text/seeker.ts index 2c1f788..1605d45 100644 --- a/packages/selector/src/text/seeker.ts +++ b/packages/selector/src/text/seeker.ts @@ -69,12 +69,14 @@ export interface RelativeSeeker<TData extends Iterable<any> = string> { * backwards in the file. * @param roundUp - If true, then, after reading the given number of * characters, read further until the end (or start) of the current chunk. + * @param lessIsFine - If true, and there are not enough characters in the + * file, return the result so far instead of throwing an error. * @returns The characters passed (in their normal order, even when moving * backwards) - * @throws RangeError if there are not enough characters in the file. The - * pointer is left at the end/start of the file. + * @throws RangeError if there are not enough characters in the file (unless + * `lessIsFine` is true). The pointer is left at the end/start of the file. 
*/ - read(length?: number, roundUp?: boolean): TData; + read(length?: number, roundUp?: boolean, lessIsFine?: boolean): TData; } /** @@ -195,8 +197,8 @@ export class TextSeeker<TChunk extends Chunk<string>> this.seekTo(0); } - read(length: number, roundUp = false): string { - return this.readTo(this.position + length, roundUp); + read(length: number, roundUp = false, lessIsFine = false): string { + return this._readOrSeekTo(true, this.position + length, roundUp, lessIsFine); } readTo(target: number, roundUp = false): string { @@ -277,12 +279,13 @@ export class TextSeeker<TChunk extends Chunk<string>> } } - private _readOrSeekTo(read: true, target: number, roundUp?: boolean): string; - private _readOrSeekTo(read: false, target: number, roundUp?: boolean): void; + private _readOrSeekTo(read: true, target: number, roundUp?: boolean, lessIsFine?: boolean): string; + private _readOrSeekTo(read: false, target: number, roundUp?: boolean, lessIsFine?: boolean): void; private _readOrSeekTo( read: boolean, target: number, roundUp = false, + lessIsFine = false, ): string | void { let result = ''; @@ -298,7 +301,7 @@ export class TextSeeker<TChunk extends Chunk<string>> const [data, nextChunk] = this._readToNextChunk(); if (read) result += data; if (nextChunk === null) { - if (this.position === target) break; + if (this.position === target || lessIsFine) break; else throw new RangeError(E_END); } } else { @@ -335,7 +338,10 @@ export class TextSeeker<TChunk extends Chunk<string>> } else { const [data, previousChunk] = this._readToPreviousChunk(); if (read) result = data + result; - if (previousChunk === null) throw new RangeError(E_END); + if (previousChunk === null) { + if (lessIsFine) break; + else throw new RangeError(E_END); + } } } } diff --git a/web/demo/index.js b/web/demo/index.js index d513252..a9773a4 100644 --- a/web/demo/index.js +++ b/web/demo/index.js @@ -138,7 +138,7 @@ async function onSelectionChange() { const selector = describeMode === 'TextPosition' ? 
await describeTextPosition(range, scope) - : await describeTextQuote(range, scope); + : await describeTextQuote(range, scope, { minimumQuoteLength: 10 }); await anchor(selector); } }
