This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit e05cd887ff317642c7c2ef8195ac72fccecb00de Author: Gerben <[email protected]> AuthorDate: Tue Jan 5 18:39:09 2021 +0100 More API documentation! --- packages/dom/src/css.ts | 28 +++++ packages/dom/src/highlight-range.ts | 32 ++++-- packages/dom/src/normalize-range.ts | 40 ++++--- packages/dom/src/range/match.ts | 66 +++++++++++ packages/dom/src/text-position/describe.ts | 31 ++++++ packages/dom/src/text-position/match.ts | 38 +++++++ packages/dom/src/text-quote/describe.ts | 36 ++++++ packages/dom/src/text-quote/match.ts | 47 +++++++- packages/selector/src/index.ts | 13 +++ packages/selector/src/text/chunker.ts | 121 ++++++++++++++++++--- packages/selector/src/text/code-point-seeker.ts | 26 +++++ .../selector/src/text/describe-text-position.ts | 19 ++++ packages/selector/src/text/describe-text-quote.ts | 40 ++++--- packages/selector/src/text/match-text-position.ts | 29 +++++ packages/selector/src/text/match-text-quote.ts | 43 +++++++- packages/selector/src/text/seeker.ts | 33 +++++- packages/selector/src/types.ts | 55 ++++++++++ 17 files changed, 639 insertions(+), 58 deletions(-) diff --git a/packages/dom/src/css.ts b/packages/dom/src/css.ts index fd4bd51..f6782f5 100644 --- a/packages/dom/src/css.ts +++ b/packages/dom/src/css.ts @@ -21,6 +21,34 @@ import type { CssSelector, Matcher } from '@apache-annotator/selector'; import { ownerDocument } from './owner-document'; +/** + * Find the elements corresponding to the given {@link + * @apache-annotator/selector#CssSelector}. + * + * @remarks + * The given CssSelector returns all elements within `scope` that it matches. + * However, the selector is evaluated relative to the Document as a whole. + * *(XXX is this intentional, a mistake, or compromise?)* + * + * The function is curried, taking first the selector and then the scope. + * + * As there may be multiple matches for a given selector, the matcher will + * return an (async) generator that produces each match in the order they are + * found in the text. 
+ * + * Each matching element is returned as a {@link https://developer.mozilla.org/en-US/docs/Web/API/Range + * | Range} surrounding that element. This is in order to make its output reusable + * as the scope for any subsequent selectors that {@link + * @apache-annotator/selector#Selector.refinedBy | refine} this CssSelector. + * + * @param selector - The {@link @apache-annotator/selector#CssSelector} to be + * anchored + * @returns A {@link @apache-annotator/selector#Matcher} function that applies + * `selector` to a given {@link https://developer.mozilla.org/en-US/docs/Web/API/Range + * | Range} + * + * @public + */ export function createCssSelectorMatcher( selector: CssSelector, ): Matcher<Range, Range> { diff --git a/packages/dom/src/highlight-range.ts b/packages/dom/src/highlight-range.ts index 7abda53..18684c5 100644 --- a/packages/dom/src/highlight-range.ts +++ b/packages/dom/src/highlight-range.ts @@ -20,16 +20,28 @@ import { ownerDocument } from './owner-document'; -// Wrap each text node in a given DOM Range with a <mark> or other element. -// Breaks start and/or end node if needed. -// Returns a function that cleans up the created highlight (not a perfect undo: split text nodes are -// not merged again; if desired, you could run range.commonAncestorContainer.normalize() afterwards). -// -// Parameters: -// - range: a DOM Range object. Note that as highlighting modifies the DOM, the range may be -// unusable afterwards -// - tagName: the element used to wrap text nodes. Defaults to 'mark'. -// - attributes: an Object defining any attributes to be set on the wrapper elements. +/** + * Wrap each text node in a given DOM Range with a `<mark>` or other element. + * + * @remarks + * If the Range starts and/or ends within a Text node, that node will be split + * in order to only wrap the contained part in the mark element. + * + * The highlight can be removed again by calling the returned function, which cleans up the + * wrapper elements. 
Note that this might not perfectly restore the DOM to its + * previous state: text nodes that were split are not merged again. One could + * consider running `range.commonAncestorContainer.normalize()` afterwards to + * join all adjacent text nodes. + * + * @param range - A DOM Range object. Note that as highlighting modifies the + * DOM, the range may be unusable afterwards. + * @param tagName - The element used to wrap text nodes. Defaults to 'mark'. + * @param attributes - An object defining any attributes to be set on the + * wrapper elements + * @returns A function that removes the created highlight. + * + * @public + */ export function highlightRange( range: Range, tagName = 'mark', diff --git a/packages/dom/src/normalize-range.ts b/packages/dom/src/normalize-range.ts index 30c1e37..84bbda2 100644 --- a/packages/dom/src/normalize-range.ts +++ b/packages/dom/src/normalize-range.ts @@ -20,10 +20,12 @@ import { ownerDocument } from './owner-document'; -// TextRange is a Range that guarantees to always have Text nodes as its start -// and end nodes. To ensure the type remains correct, it also restricts usage -// of methods that would modify these nodes (note that a user can simply cast -// the TextRange back to a Range to remove these restrictions). +/** + * TextRange is a Range that guarantees to always have Text nodes as its start + * and end nodes. To ensure the type remains correct, it also restricts usage + * of methods that would modify these nodes (note that a user can simply cast + * the TextRange back to a Range to remove these restrictions). + */ export interface TextRange extends Range { readonly startContainer: Text; readonly endContainer: Text; @@ -44,17 +46,25 @@ export interface TextRange extends Range { surroundContents(newParent: never): void; } -// Normalise a range such that both its start and end are text nodes, and that -// if there are equivalent text selections it takes the narrowest option (i.e. 
-// it prefers the start not to be at the end of a text node, and vice versa). -// -// If there is no text between the start and end, they thus collapse onto one a -// single position; and if there are multiple equivalent positions, it takes the -// first one; or, if scope is passed, the first equivalent falling within scope. -// -// Note that if the given range does not contain non-empty text nodes, it will -// end up pointing at a text node outside of it (before it if possible, else -// after). If the document does not contain any text nodes, an error is thrown. +/** + * Normalise a {@link https://developer.mozilla.org/en-US/docs/Web/API/Range | + * Range} such that ranges spanning the same text become exact equals. + * + * @remarks + * *Note: in this context ‘text’ means any characters, including whitespace.* + * + * Normalises a range such that both its start and end are text nodes, and that + * if there are equivalent text selections it takes the narrowest option (i.e. + * it prefers the start not to be at the end of a text node, and vice versa). + * + * If there is no text between the start and end, they thus collapse onto a + * single position; and if there are multiple equivalent positions, it takes the + * first one; or, if scope is passed, the first equivalent falling within scope. + * + * Note that if the given range does not contain non-empty text nodes, it may + * end up pointing at a text node outside of it (before it if possible, else + * after). If the document does not contain any text nodes, an error is thrown. 
+ */ export function normalizeRange(range: Range, scope?: Range): TextRange { const document = ownerDocument(range); const walker = document.createTreeWalker(document, NodeFilter.SHOW_TEXT, { diff --git a/packages/dom/src/range/match.ts b/packages/dom/src/range/match.ts index ed02e32..dd4b982 100644 --- a/packages/dom/src/range/match.ts +++ b/packages/dom/src/range/match.ts @@ -26,6 +26,72 @@ import type { import { ownerDocument } from '../owner-document'; import { cartesian } from './cartesian'; +/** + * Find the range(s) corresponding to the given {@link + * @apache-annotator/selector#RangeSelector}. + * + * As a RangeSelector itself nests two further selectors, one needs to pass a + * `createMatcher` function that will be used to process those nested selectors. + * + * The function is curried, taking first the `createMatcher` function, then the + * selector, and then the scope. + * + * As there may be multiple matches for a given selector, the matcher will + * return an (async) generator that produces each match in the order they are + * found in the text. If both its nested selectors produce multiple matches, the + * RangeSelector matches each possible pair among those in which the order of + * start and end is respected. *(Note this behaviour is a rather free + * interpretation — the Web Annotation Data Model spec is silent about multiple + * matches for RangeSelectors)* + * + * @example + * By using a matcher for {@link @apache-annotator/selector#TextQuoteSelector}s, one + * could create a matcher for text quotes with ellipsis to select a phrase + * “ipsum … amet,”: + * ``` + * const selector = { + * type: 'RangeSelector', + * startSelector: { + * type: 'TextQuoteSelector', + * exact: 'ipsum ', + * }, + * endSelector: { + * type: 'TextQuoteSelector', + * // Because the end of a RangeSelector is *exclusive*, we’ll present the + * // latter part of the quote as the *prefix* so it will be part of the match. 
+ * exact: '', + * prefix: ' amet,', + * } + * }; + * const createRangeSelectorMatcher = makeCreateRangeSelectorMatcher(createTextQuoteSelectorMatcher); + * const match = createRangeSelectorMatcher(selector)(document.body); + * console.log(match) + * // ⇒ Range { startContainer: #text, startOffset: 6, endContainer: #text, + * // endOffset: 27, … } + * ``` + * + * @example + * To support RangeSelectors that might themselves contain RangeSelectors, + * recursion can be created by supplying the resulting matcher creator function + * as the `createMatcher` parameter: + * ``` + * const createWhicheverMatcher = (selector) => { + * const innerCreateMatcher = { + * TextQuoteSelector: createTextQuoteSelectorMatcher, + * TextPositionSelector: createTextPositionSelectorMatcher, + * RangeSelector: makeCreateRangeSelectorMatcher(createWhicheverMatcher), + * }[selector.type]; + * return innerCreateMatcher(selector); + * }; + * ``` + * + * @param createMatcher - The function used to process nested selectors. + * @returns A function that, given a RangeSelector, creates a {@link + * @apache-annotator/selector#Matcher} function that applies it to a given {@link https://developer.mozilla.org/en-US/docs/Web/API/Range + * | Range} + * + * @public + */ export function makeCreateRangeSelectorMatcher( createMatcher: <T extends Selector>(selector: T) => Matcher<Range, Range>, ): (selector: RangeSelector) => Matcher<Range, Range> { diff --git a/packages/dom/src/text-position/describe.ts b/packages/dom/src/text-position/describe.ts index 3808056..ae06a94 100644 --- a/packages/dom/src/text-position/describe.ts +++ b/packages/dom/src/text-position/describe.ts @@ -23,6 +23,37 @@ import { describeTextPosition as abstractDescribeTextPosition } from '@apache-an import { ownerDocument } from '../owner-document'; import { TextNodeChunker } from '../text-node-chunker'; +/** + * Returns a {@link @apache-annotator/selector#TextPositionSelector} that points + * at the target text within the given scope. 
+ * + * When no scope is given, the position is described relative to the document + * as a whole. Note this means all the characters in all Text nodes are counted + * to determine the target’s position, including those in the `<head>` and + * whitespace, hence even a minor modification could make the selector point to + * a different text than its original target. + * + * @example + * ``` + * const target = window.getSelection().getRangeAt(0); + * const selector = await describeTextPosition(target); + * console.log(selector); + * // { + * // type: 'TextPositionSelector', + * // start: 702, + * // end: 736 + * // } + * ``` + * + * @param range - The range of characters that the selector should describe + * @param maybeScope - A {@link https://developer.mozilla.org/en-US/docs/Web/API/Range + * | Range} that serves as the ‘document’ relative to which the position of + * the target is measured. Defaults to span the full Document + * containing the range. + * @returns The selector describing the `range` relative to the scope + * + * @public + */ export async function describeTextPosition( range: Range, maybeScope?: Range, diff --git a/packages/dom/src/text-position/match.ts b/packages/dom/src/text-position/match.ts index db50696..f8d0917 100644 --- a/packages/dom/src/text-position/match.ts +++ b/packages/dom/src/text-position/match.ts @@ -22,6 +22,44 @@ import type { Matcher, TextPositionSelector } from '@apache-annotator/selector'; import { textPositionSelectorMatcher as abstractTextPositionSelectorMatcher } from '@apache-annotator/selector'; import { TextNodeChunker } from '../text-node-chunker'; +/** + * Find the range of text corresponding to the given {@link + * @apache-annotator/selector#TextPositionSelector}. + * + * @remarks + * The start and end positions are measured relative to the first text character + * in the given scope. + * + * The function is curried, taking first the selector and then the scope. 
+ * + * Its end result is an (async) generator producing a single {@link https://developer.mozilla.org/en-US/docs/Web/API/Range + * | Range} to represent the match (unlike a {@link + * @apache-annotator/selector#TextQuoteSelector}, a TextPositionSelector cannot have + * multiple matches). + * + * @example + * ``` + * const selector = { type: 'TextPositionSelector', start: 702, end: 736 }; + * + * // Search in the whole document. + * const scope = document.createRange(); + * scope.selectNodeContents(document); + * + * const matches = createTextPositionSelectorMatcher(selector)(scope); + * const match = (await matches.next()).value; + * + * // ⇒ Range { startContainer: #text, startOffset: 64, endContainer: #text, + * // endOffset: 98, … } + * ``` + * + * @param selector - The {@link @apache-annotator/selector#TextPositionSelector} + * to be anchored + * @returns A {@link @apache-annotator/selector#Matcher} function that applies + * `selector` to a given {@link https://developer.mozilla.org/en-US/docs/Web/API/Range + * | Range} + * + * @public + */ export function createTextPositionSelectorMatcher( selector: TextPositionSelector, ): Matcher<Range, Range> { diff --git a/packages/dom/src/text-quote/describe.ts b/packages/dom/src/text-quote/describe.ts index e1513a9..3958633 100644 --- a/packages/dom/src/text-quote/describe.ts +++ b/packages/dom/src/text-quote/describe.ts @@ -26,6 +26,42 @@ import { describeTextQuote as abstractDescribeTextQuote } from '@apache-annotato import { ownerDocument } from '../owner-document'; import { TextNodeChunker } from '../text-node-chunker'; +/** + * Create a {@link @apache-annotator/selector#TextQuoteSelector} that + * unambiguously describes the given range. + * + * @remarks + * The selector will contain the *exact* target quote, and in case this quote + * appears multiple times in the text, sufficient context around the quote will + * be included in the selector’s *prefix* and *suffix* attributes to + * disambiguate. 
By default, more prefix and suffix are included than strictly + * required; both in order to be robust against slight modifications, and in an + * attempt to not end halfway a word (mainly for the sake of human readability). + * + * @example + * ``` + * const target = window.getSelection().getRangeAt(0); + * const selector = await describeTextQuote(target); + * console.log(selector); + * // { + * // type: 'TextQuoteSelector', + * // exact: 'ipsum', + * // prefix: 'Lorem ', + * // suffix: ' dolor' + * // } + * ``` + * + * @param range - The {@link https://developer.mozilla.org/en-US/docs/Web/API/Range + * | Range} whose text content will be described + * @param maybeScope - A {@link https://developer.mozilla.org/en-US/docs/Web/API/Range + * | Range} that serves as the ‘document’ for purposes of finding occurrences + * and determining prefix and suffix. Defaults to span the full Document + * containing the range. + * @param options - Options to fine-tune the function’s behaviour. + * @returns The selector unambiguously describing the `range` in `scope`. + * + * @public + */ export async function describeTextQuote( range: Range, maybeScope?: Range, diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts index fd341fc..0f263e7 100644 --- a/packages/dom/src/text-quote/match.ts +++ b/packages/dom/src/text-quote/match.ts @@ -22,6 +22,51 @@ import type { Matcher, TextQuoteSelector } from '@apache-annotator/selector'; import { textQuoteSelectorMatcher as abstractTextQuoteSelectorMatcher } from '@apache-annotator/selector'; import { TextNodeChunker, EmptyScopeError } from '../text-node-chunker'; +/** + * Find occurrences in a text matching the given {@link + * @apache-annotator/selector#TextQuoteSelector}. + * + * @remarks + * This performs an exact search for the selector’s quote (including prefix and + * suffix) within the text contained in the given scope (a {@link + * https://developer.mozilla.org/en-US/docs/Web/API/Range | Range}). 
+ * + * Note the match is based on strict character-by-character equivalence, i.e. + * it is sensitive to whitespace, capitalisation, etc. + * + * The function is curried, taking first the selector and then the scope. + * + * As there may be multiple matches for a given selector (when its prefix and + * suffix attributes are not sufficient to disambiguate it), the matcher will + * return an (async) generator that produces each match in the order they are + * found in the text. + * + * @example + * ``` + * // Find the word ‘banana’. + * const selector = { type: 'TextQuoteSelector', exact: 'banana' }; + * + * // Search in the document body. + * const scope = document.createRange(); + * scope.selectNodeContents(document.body); + * + * // Read all matches. + * const matches = createTextQuoteSelectorMatcher(selector)(scope); + * for await (const match of matches) console.log(match); + * // ⇒ Range { startContainer: #text, startOffset: 187, endContainer: #text, + * // endOffset: 193, … } + * // ⇒ Range { startContainer: #text, startOffset: 631, endContainer: #text, + * // endOffset: 637, … } + * ``` + * + * @param selector - The {@link @apache-annotator/selector#TextQuoteSelector} + * to be anchored + * @returns A {@link @apache-annotator/selector#Matcher} function that applies + * `selector` to a given {@link https://developer.mozilla.org/en-US/docs/Web/API/Range + * | Range} + * + * @public + */ export function createTextQuoteSelectorMatcher( selector: TextQuoteSelector, ): Matcher<Range, Range> { @@ -32,8 +77,8 @@ export function createTextQuoteSelectorMatcher( try { textChunks = new TextNodeChunker(scope); } catch (err) { - if (err instanceof EmptyScopeError) return; // An empty range contains no matches. 
+ if (err instanceof EmptyScopeError) return; else throw err; } diff --git a/packages/selector/src/index.ts b/packages/selector/src/index.ts index adefd04..d0a6607 100644 --- a/packages/selector/src/index.ts +++ b/packages/selector/src/index.ts @@ -29,6 +29,19 @@ export type { } from './types'; export * from './text'; +/** + * Wrap a matcher creation function so that it supports refinement of selection. + * + * @remarks + * See {@link https://www.w3.org/TR/2017/REC-annotation-model-20170223/#refinement-of-selection + * | §4.2.9 Refinement of Selection} in the Web Annotation Data Model. + * + * @param matcherCreator - The function to wrap; it will be executed both for + * {@link Selector}s passed to the returned wrapper function, and for any + * refining Selector those might contain (and any refinement of that, etc.). + * + * @public + */ export function makeRefinable< // Any subtype of Selector can be made refinable; but note we limit the value // of refinedBy because it must also be accepted by matcherCreator. diff --git a/packages/selector/src/text/chunker.ts b/packages/selector/src/text/chunker.ts index eb0d970..88fac13 100644 --- a/packages/selector/src/text/chunker.ts +++ b/packages/selector/src/text/chunker.ts @@ -18,18 +18,57 @@ * under the License. */ -// A Chunk represents a fragment (typically a string) of some document. -// Subclasses can add further attributes to map the chunk to its position in the -// data structure it came from (e.g. a DOM node). +/** + * Represents a piece of text in any kind of ‘file’. + * + * @remarks + * Its purpose is to enable generic algorithms to deal with text content of any + * type of ‘file’ that consists of many pieces of text (e.g. a DOM, PDF, …). + * Each Chunk represents one piece of text ({@link Chunk.data}). An object + * implementing this interface would typically have other attributes as well to + * map the chunk back to its position in the file (e.g. a Text node in the DOM). 
+ * + * @typeParam TData - Piece of text, typically `string` + * + * @public + */ export interface Chunk<TData> { + /** + * The piece of text this chunk represents. + */ readonly data: TData; equals?(otherChunk: this): boolean; } +/** + * Test two {@link Chunk}s for equality. + * + * @remarks + * Equality here means that both represent the same piece of text (i.e. at the + * same position) in the file. It compares using the custom {@link Chunk.equals} + * method if either chunk defines one, and otherwise falls back to checking the objects’ + * identity (i.e. `chunk1 === chunk2`). + * + * @public + */ export function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean { - return chunk1.equals ? chunk1.equals(chunk2) : chunk1 === chunk2; + if (chunk1.equals) return chunk1.equals(chunk2); + if (chunk2.equals) return chunk2.equals(chunk1); + return chunk1 === chunk2; } +/** + * Points at a range of characters between two points inside {@link Chunk}s. + * + * @remarks + * Analogous to the DOM’s ({@link https://developer.mozilla.org/en-US/docs/Web/API/AbstractRange + * | Abstract}){@link https://developer.mozilla.org/en-US/docs/Web/API/Range | + * Range}. Each index expresses an offset inside the value of the corresponding + * {@link Chunk.data}, and can equal the length of that data in order to point + * to the position right after the chunk’s last character. + * + * @public + */ export interface ChunkRange<TChunk extends Chunk<any>> { startChunk: TChunk; startIndex: number; @@ -37,6 +76,25 @@ export interface ChunkRange<TChunk extends Chunk<any>> { endIndex: number; } +/** + * Test two {@link ChunkRange}s for equality. + * + * @remarks + * Equality here means equality of each of their four properties (i.e. + * {@link ChunkRange.startChunk}, {@link ChunkRange.startIndex}, + * {@link ChunkRange.endChunk}, and {@link ChunkRange.endIndex}). + * For the `startChunk` and `endChunk`, this function uses the custom + * {@link Chunk.equals} method if defined. 
+ * + * Note that if the start/end of one range points at the end of a chunk, and the + * other to the start of a subsequent chunk, they are not considered equal, even + * though semantically they may be representing the same range of characters. To + * test for such semantic equivalence, ensure that both inputs are normalised: + * typically this means the range is shrunk to its narrowest equivalent, and (if + * it is empty) positioned at its first equivalent. + * + * @public + */ export function chunkRangeEquals( range1: ChunkRange<any>, range2: ChunkRange<any>, @@ -49,21 +107,58 @@ export function chunkRangeEquals( ); } -// A Chunker lets one walk through the chunks of a document. -// It is inspired by, and similar to, the DOM’s NodeIterator. (but unlike -// NodeIterator, it has no concept of being ‘before’ or ‘after’ a chunk) +/** + * Presents the pieces of text contained in some underlying ‘file’ as a sequence + * of {@link Chunk}s. + * + * @remarks + * Rather than presenting a list of all pieces, the `Chunker` provides methods + * to walk through the file piece by piece. This permits implementations to read + * and convert the file to `Chunk`s lazily. + * + * For those familiar with the DOM APIs, it is similar to a NodeIterator (but + * unlike NodeIterator, it has no concept of being ‘before’ or ‘after’ a chunk). + * + * @typeParam TChunk - (sub)type of `Chunk` being used. + * + * @public + */ export interface Chunker<TChunk extends Chunk<any>> { - // The chunk currently being pointed at. + /** + * The chunk currently being pointed at. + * + * @remarks + * Initially, this should normally be the first chunk in the file. + */ readonly currentChunk: TChunk; - // Move currentChunk to the chunk following it, and return that chunk. - // If there are no chunks following it, keep currentChunk unchanged and return null. + /** + * Point {@link Chunker.currentChunk} at the chunk following it, and return that chunk. 
+ * If there are no chunks following it, keep `currentChunk` unchanged and + * return null. + */ nextChunk(): TChunk | null; - // Move currentChunk to the chunk preceding it, and return that chunk. - // If there are no preceding chunks, keep currentChunk unchanged and return null. + /** + * Point {@link Chunker.currentChunk} at the chunk preceding it, and return that chunk. + * If there are no chunks preceding it, keep `currentChunk` unchanged and + * return null. + */ previousChunk(): TChunk | null; - // Test if a given chunk is before the current chunk. + /** + * Test if a given `chunk` is before the {@link Chunker.currentChunk|current + * chunk}. + * + * @remarks + * Returns true if `chunk` is before `this.currentChunk`, false otherwise + * (i.e. if `chunk` follows it or is the current chunk). + * + * The given `chunk` need not necessarily be obtained from the same `Chunker`, + * but the chunkers would need to represent the same file. Otherwise behaviour + * is unspecified (an implementation might throw or just return `false`). + * + * @param chunk - A chunk, typically obtained from the same `Chunker`. + */ precedesCurrentChunk(chunk: TChunk): boolean; } diff --git a/packages/selector/src/text/code-point-seeker.ts b/packages/selector/src/text/code-point-seeker.ts index 68f45a3..9e31def 100644 --- a/packages/selector/src/text/code-point-seeker.ts +++ b/packages/selector/src/text/code-point-seeker.ts @@ -21,10 +21,36 @@ import type { Chunk } from './chunker'; import type { Seeker } from './seeker'; +/** + * Seeks through text counting Unicode *code points* instead of *code units*. + * + * @remarks + * Javascript characters correspond to 16 bits *code units*, hence two such + * ‘characters’ might together constitute a single Unicode character (i.e. a + * *code point*). The {@link CodePointSeeker} allows to ignore this + * variable-length encoding, by counting code points instead. 
+ * + * It is made to wrap a {@link Seeker} that counts code units (presumably a + * {@link TextSeeker}), which must be passed to its {@link this:constructor | + * constructor}. + * + * When reading from the `CodePointSeeker`, the returned value is not a string + * but an array of strings, each containing one code point (thus each having a + * `length` that is either 1 or 2). + * + * @public + */ export class CodePointSeeker<TChunk extends Chunk<string>> implements Seeker<TChunk, string[]> { position = 0; + /** + * + * @param raw - The {@link Seeker} to wrap, which counts in code *units* (e.g. + * a {@link TextSeeker}). It should have {@link Seeker.position | position} + * `0` and its methods must no longer be used directly if the + * `CodePointSeeker`’s position is to remain correct. + */ constructor(public readonly raw: Seeker<TChunk>) {} seekBy(length: number): void { diff --git a/packages/selector/src/text/describe-text-position.ts b/packages/selector/src/text/describe-text-position.ts index 5c18ef1..a026380 100644 --- a/packages/selector/src/text/describe-text-position.ts +++ b/packages/selector/src/text/describe-text-position.ts @@ -23,6 +23,25 @@ import type { Chunk, Chunker, ChunkRange } from './chunker'; import { CodePointSeeker } from './code-point-seeker'; import { TextSeeker } from './seeker'; +/** + * Returns a {@link TextPositionSelector} that points at the target text within + * the given scope. + * + * This is an abstract implementation of the function’s logic, which expects a + * generic {@link Chunker} to represent the text, and a {@link ChunkRange} to + * represent the target. + * + * See {@link @apache-annotator/dom#describeTextPosition} for a wrapper around + * this implementation which applies it to the text of an HTML DOM. 
+ * + * @param target - The range of characters that the selector should describe + * @param scope - The text, presented as a {@link Chunker}, which contains the + * target range, and relative to which its position will be measured + * @returns The {@link TextPositionSelector} that describes `target` relative + * to `scope` + * + * @public + */ export async function describeTextPosition<TChunk extends Chunk<string>>( target: ChunkRange<TChunk>, scope: Chunker<TChunk>, diff --git a/packages/selector/src/text/describe-text-quote.ts b/packages/selector/src/text/describe-text-quote.ts index 24e366b..3b129fb 100644 --- a/packages/selector/src/text/describe-text-quote.ts +++ b/packages/selector/src/text/describe-text-quote.ts @@ -25,6 +25,9 @@ import type { RelativeSeeker } from './seeker'; import { TextSeeker } from './seeker'; import { textQuoteSelectorMatcher } from '.'; +/** + * @public + */ export interface DescribeTextQuoteOptions { /** * Keep prefix and suffix to the minimum that is necessary to disambiguate @@ -34,7 +37,7 @@ export interface DescribeTextQuoteOptions { /** * Add prefix and suffix to quotes below this length, such that the total of - * prefix + exact + suffix is at least this length. + * `prefix + exact + suffix` is at least this length. */ minimumQuoteLength?: number; @@ -50,28 +53,39 @@ export interface DescribeTextQuoteOptions { * given text. * * @remarks - * The selector will contain the *exact* target quote, and in case this quote - * appears multiple times in the text, sufficient context around the quote will - * be included in the selector’s *prefix* and *suffix* attributes to - * disambiguate. By default, more prefix and suffix are included than strictly - * required; both in order to be robust against slight modifications, and in an - * attempt to not end halfway a word (mainly for the sake of human readability). + * The selector will contain the exact target quote. 
In case this quote appears + * multiple times in the text, sufficient context around the quote will be + * included in the selector’s `prefix` and `suffix` attributes to disambiguate. + * By default, more prefix and suffix are included than strictly required; both + * in order to be robust against slight modifications, and in an attempt to not + * end halfway a word (mainly for human readability). + * + * This is an abstract implementation of the function’s logic, which expects a + * generic {@link Chunker} to represent the text, and a {@link ChunkRange} to + * represent the target. + * + * See {@link @apache-annotator/dom#describeTextQuote} for a wrapper around this + * implementation which applies it to the text of an HTML DOM. * * @param target - The range of characters that the selector should describe * @param scope - The text containing the target range; or, more accurately, a - * function creating {@link Chunker}s that allow walking through the text. - * @param options - * @returns the {@link TextQuoteSelector} that describes *target*. + * function that produces {@link Chunker}s corresponding to this text. + * @param options - Options to fine-tune the function’s behaviour. + * @returns The {@link TextQuoteSelector} that describes `target`. + * + * @public */ export async function describeTextQuote<TChunk extends Chunk<string>>( target: ChunkRange<TChunk>, scope: () => Chunker<TChunk>, - { + options: DescribeTextQuoteOptions = {}, +): Promise<TextQuoteSelector> { + const { minimalContext = false, minimumQuoteLength = 0, maxWordLength = 50, - }: DescribeTextQuoteOptions = {}, -): Promise<TextQuoteSelector> { + } = options; + // Create a seeker to read the target quote and the context around it. 
// TODO Possible optimisation: as it need not be an AbsoluteSeeker, a // different implementation could provide direct ‘jump’ access in seekToChunk diff --git a/packages/selector/src/text/match-text-position.ts b/packages/selector/src/text/match-text-position.ts index d6d156f..b6bb3e9 100644 --- a/packages/selector/src/text/match-text-position.ts +++ b/packages/selector/src/text/match-text-position.ts @@ -23,6 +23,35 @@ import type { Chunk, ChunkRange, Chunker } from './chunker'; import { CodePointSeeker } from './code-point-seeker'; import { TextSeeker } from './seeker'; +/** + * Find the range of text corresponding to the given {@link TextPositionSelector}. + * + * @remarks + * This is an abstract implementation of the function’s logic, which expects a + * generic {@link Chunker} to represent the text, and returns an (async) + * generator producing a single {@link ChunkRange} to represent the match. + * (unlike e.g. TextQuoteSelector, it cannot result in multiple matches). + * + * See {@link @apache-annotator/dom#createTextPositionSelectorMatcher} for a + * wrapper around this implementation which applies it to the text of an HTML + * DOM. + * + * The function is curried, taking first the selector and then the text. 
+ * + * @example + * ``` + * const selector = { type: 'TextPositionSelector', start: 702, end: 736 }; + * const matches = textPositionSelectorMatcher(selector)(textChunks); + * const match = (await matches.next()).value; + * console.log(match); + * // ⇒ { startChunk: { … }, startIndex: 64, endChunk: { … }, endIndex: 98 } + * ``` + * + * @param selector - the {@link TextPositionSelector} to be anchored + * @returns a {@link Matcher} function that applies `selector` to a given text + * + * @public + */ export function textPositionSelectorMatcher( selector: TextPositionSelector, ): <TChunk extends Chunk<any>>( diff --git a/packages/selector/src/text/match-text-quote.ts b/packages/selector/src/text/match-text-quote.ts index ec63036..d3712bb 100644 --- a/packages/selector/src/text/match-text-quote.ts +++ b/packages/selector/src/text/match-text-quote.ts @@ -21,6 +21,45 @@ import type { TextQuoteSelector } from '../types'; import type { Chunk, Chunker, ChunkRange } from './chunker'; +/** + * Find occurrences in a text matching the given {@link TextQuoteSelector}. + * + * @remarks + * This performs an exact search for the selector’s quote (including prefix and + * suffix) within the given text. + * + * Note the match is based on strict character-by-character equivalence, i.e. + * it is sensitive to whitespace, capitalisation, etc. + * + * This is an abstract implementation of the function’s logic, which expects a + * generic {@link Chunker} to represent the text, and returns an (async) + * generator of {@link ChunkRange}s to represent the matches. + * + * See {@link @apache-annotator/dom#createTextQuoteSelectorMatcher} for a + * wrapper around this implementation which applies it to the text of an HTML + * DOM. + * + * The function is curried, taking first the selector and then the text. 
+ * + * As there may be multiple matches for a given selector (when its prefix and + * suffix attributes are not sufficient to disambiguate it), the matcher will + * return an (async) generator that produces each match in the order they are + * found in the text. + * + * @example + * ``` + * const selector = { type: 'TextQuoteSelector', exact: 'banana' }; + * const matches = textQuoteSelectorMatcher(selector)(textChunks); + * for await (const match of matches) console.log(match); + * // ⇒ { startChunk: { … }, startIndex: 187, endChunk: { … }, endIndex: 193 } + * // ⇒ { startChunk: { … }, startIndex: 631, endChunk: { … }, endIndex: 637 } + * ``` + * + * @param selector - The {@link TextQuoteSelector} to be anchored + * @returns a {@link Matcher} function that applies `selector` to a given text + * + * @public + */ export function textQuoteSelectorMatcher( selector: TextQuoteSelector, ): <TChunk extends Chunk<any>>( @@ -34,7 +73,9 @@ export function textQuoteSelectorMatcher( const suffix = selector.suffix || ''; const searchPattern = prefix + exact + suffix; - // The code below runs a loop with three steps: + // The code below essentially just performs string.indexOf(searchPattern), + // but on a string that is chopped up in multiple chunks. It runs a loop + // containing three steps: // 1. Continue checking any partial matches from the previous chunk(s). // 2. Try find the whole pattern in the chunk (possibly multiple times). // 3. Check if this chunk ends with a partial match (or even multiple partial matches). diff --git a/packages/selector/src/text/seeker.ts b/packages/selector/src/text/seeker.ts index bfd953f..53fdb49 100644 --- a/packages/selector/src/text/seeker.ts +++ b/packages/selector/src/text/seeker.ts @@ -39,6 +39,8 @@ const E_END = 'Iterator exhausted before seek ended.'; * @typeParam TData - Type of data this seeker’s read methods will return (not * necessarily the same as the `TData` parameter of {@link Chunk}, see e.g. 
* {@link CodePointSeeker}) + * + * @public */ export interface Seeker< TChunk extends Chunk<any>, @@ -50,6 +52,8 @@ export interface Seeker< /** * Seeks/reads by a given number of characters. + * + * @public */ export interface RelativeSeeker<TData extends Iterable<any> = string> { /** @@ -83,6 +87,8 @@ export interface RelativeSeeker<TData extends Iterable<any> = string> { /** * Seek/read to absolute positions in the file. + * + * @public */ export interface AbsoluteSeeker<TData extends Iterable<any> = string> { /** @@ -124,6 +130,8 @@ export interface AbsoluteSeeker<TData extends Iterable<any> = string> { * Note that all offset numbers in this interface are representing units of the * {@link Chunk.data | data type of `TChunk`}; which might differ from that of * `TData`. + * + * @public */ export interface ChunkSeeker< TChunk extends Chunk<any>, @@ -172,10 +180,25 @@ export interface ChunkSeeker< readToChunk(chunk: TChunk, offset?: number): TData; } -// The TextSeeker takes a Chunker as input, and lets it be treated as a single -// string. Seeking to a given numeric position will cause it to pull chunks from -// the underlying Chunker, counting their lengths until the requested position -// is reached. +/** + * A TextSeeker is constructed around a {@link Chunker}, to let it be treated as + * a continuous sequence of characters. + * + * @remarks + * Seeking to a given numeric position will cause a `TextSeeker` to pull chunks + * from the underlying `Chunker`, counting their lengths until the requested + * position is reached. `Chunks` are not stored but simply read again when + * seeking backwards. + * + * The `Chunker` is presumed to read an unchanging file. If a chunk’s length + * would change while seeking, a TextSeeker’s absolute positioning would be + * incorrect. + * + * See {@link CodePointSeeker} for a {@link Seeker} that counts Unicode *code + * points* instead of JavaScript’s ‘normal’ characters. 
+ * + * @public + */ export class TextSeeker<TChunk extends Chunk<string>> implements Seeker<TChunk> { // The chunk containing our current text position. @@ -265,7 +288,7 @@ export class TextSeeker<TChunk extends Chunk<string>> // Now we know where the chunk is, walk to the requested offset. // Note we might have started inside the chunk, and the offset could even - // point to a position before or after the chunk. + // point at a position before or after the chunk. const targetPosition = this.currentChunkPosition + offset; if (!read) { this.seekTo(targetPosition); diff --git a/packages/selector/src/types.ts b/packages/selector/src/types.ts index e57fed0..fa367ee 100644 --- a/packages/selector/src/types.ts +++ b/packages/selector/src/types.ts @@ -18,15 +18,48 @@ * under the License. */ +/** + * A {@link https://www.w3.org/TR/2017/REC-annotation-model-20170223/#selectors + * | Selector} object of the Web Annotation Data Model. + * + * Corresponds to RDF class {@link http://www.w3.org/ns/oa#Selector} + * + * @public + */ export interface Selector { + /** + * A Selector can be refined by another Selector. + * + * @remarks + * See {@link https://www.w3.org/TR/2017/REC-annotation-model-20170223/#refinement-of-selection + * | §4.2.9 Refinement of Selection} in the Web Annotation Data Model. + * + * Corresponds to RDF property {@link http://www.w3.org/ns/oa#refinedBy} + */ refinedBy?: Selector; } +/** + * The {@link https://www.w3.org/TR/2017/REC-annotation-model-20170223/#css-selector + * | CssSelector} of the Web Annotation Data Model. + * + * Corresponds to RDF class {@link http://www.w3.org/ns/oa#CssSelector} + * + * @public + */ export interface CssSelector extends Selector { type: 'CssSelector'; value: string; } +/** + * The {@link https://www.w3.org/TR/2017/REC-annotation-model-20170223/#text-quote-selector + * | TextQuoteSelector} of the Web Annotation Data Model. 
+ * + * Corresponds to RDF class {@link http://www.w3.org/ns/oa#TextQuoteSelector} + * + * @public + */ export interface TextQuoteSelector extends Selector { type: 'TextQuoteSelector'; exact: string; @@ -34,18 +67,40 @@ export interface TextQuoteSelector extends Selector { suffix?: string; } +/** + * The {@link https://www.w3.org/TR/2017/REC-annotation-model-20170223/#text-position-selector + * | TextPositionSelector} of the Web Annotation Data Model. + * + * Corresponds to RDF class {@link http://www.w3.org/ns/oa#TextPositionSelector} + * + * @public + */ export interface TextPositionSelector extends Selector { type: 'TextPositionSelector'; start: number; // more precisely: non-negative integer end: number; // more precisely: non-negative integer } +/** + * The {@link https://www.w3.org/TR/2017/REC-annotation-model-20170223/#range-selector + * | RangeSelector} of the Web Annotation Data Model. + * + * Corresponds to RDF class {@link http://www.w3.org/ns/oa#RangeSelector} + * + * @public + */ export interface RangeSelector extends Selector { type: 'RangeSelector'; startSelector: Selector; endSelector: Selector; } +/** + * A function that finds the match(es) in the given (sub)document (the ‘scope’) + * corresponding to some (prespecified) selector(s). + * + * @public + */ export interface Matcher<TScope, TMatch> { (scope: TScope): AsyncGenerator<TMatch, void, void>; }
