selector

gerben Fri, 20 Nov 2020 07:45:34 -0800

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git


commit 5dd510521a1108d6d598602154485d5d1c03f023
Author: Gerben <[email protected]>
AuthorDate: Fri Nov 20 16:40:19 2020 +0100

    Move abstract code into @annotator/selector
---
 packages/dom/src/chunker.ts                        |  51 +------
 packages/dom/src/text-position/describe.ts         |  22 +--
 packages/dom/src/text-position/match.ts            |  28 +---
 packages/dom/src/text-quote/describe.ts            | 115 +-------------
 packages/dom/src/text-quote/match.ts               | 148 +-----------------
 packages/selector/src/index.ts                     |   1 +
 packages/selector/src/text/chunker.ts              |  69 +++++++++
 .../src => selector/src/text}/code-point-seeker.ts |   0
 .../src/text/describe-text-position.ts}            |  34 +----
 packages/selector/src/text/describe-text-quote.ts  | 134 ++++++++++++++++
 packages/selector/src/text/index.ts                |   5 +
 .../src/text/match-text-position.ts}               |  27 +---
 packages/selector/src/text/match-text-quote.ts     | 168 +++++++++++++++++++++
 packages/{dom/src => selector/src/text}/seek.ts    |   0
 14 files changed, 392 insertions(+), 410 deletions(-)

diff --git a/packages/dom/src/chunker.ts b/packages/dom/src/chunker.ts
index 8c56924..bc5c2c1 100644
--- a/packages/dom/src/chunker.ts
+++ b/packages/dom/src/chunker.ts
@@ -18,59 +18,10 @@
  * under the License.
  */
 
+import type { Chunk, Chunker, ChunkRange } from '@annotator/selector';
 import { normalizeRange } from './normalize-range';
 import { ownerDocument } from './owner-document';
 
-// A Chunk represents a fragment (typically a string) of some document.
-// Subclasses can add further attributes to map the chunk to its position in 
the
-// data structure it came from (e.g. a DOM node).
-export interface Chunk<TData> {
-  readonly data: TData;
-  equals?(otherChunk: this): boolean;
-}
-
-export interface ChunkRange<TChunk extends Chunk<any>> {
-  startChunk: TChunk;
-  startIndex: number;
-  endChunk: TChunk;
-  endIndex: number;
-}
-
-export function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean {
-  return chunk1.equals ? chunk1.equals(chunk2) : chunk1 === chunk2;
-}
-
-export function chunkRangeEquals(
-  range1: ChunkRange<any>,
-  range2: ChunkRange<any>,
-): boolean {
-  return (
-    chunkEquals(range1.startChunk, range2.startChunk) &&
-    chunkEquals(range1.endChunk, range2.endChunk) &&
-    range1.startIndex === range2.startIndex &&
-    range1.endIndex === range2.endIndex
-  );
-}
-
-// A Chunker lets one walk through the chunks of a document.
-// It is inspired by, and similar to, the DOM’s NodeIterator. (but unlike
-// NodeIterator, it has no concept of being ‘before’ or ‘after’ a chunk)
-export interface Chunker<TChunk extends Chunk<any>> {
-  // The chunk currently being pointed at.
-  readonly currentChunk: TChunk;
-
-  // Move currentChunk to the chunk following it, and return that chunk.
-  // If there are no chunks following it, keep currentChunk unchanged and 
return null.
-  nextChunk(): TChunk | null;
-
-  // Move currentChunk to the chunk preceding it, and return that chunk.
-  // If there are no preceding chunks, keep currentChunk unchanged and return 
null.
-  previousChunk(): TChunk | null;
-
-  // Test if a given chunk is before the current chunk.
-  precedesCurrentChunk(chunk: TChunk): boolean;
-}
-
 export interface PartialTextNode extends Chunk<string> {
   readonly node: Text;
   readonly startOffset: number;
diff --git a/packages/dom/src/text-position/describe.ts 
b/packages/dom/src/text-position/describe.ts
index 5f7f9a3..992a633 100644
--- a/packages/dom/src/text-position/describe.ts
+++ b/packages/dom/src/text-position/describe.ts
@@ -19,11 +19,9 @@
  */
 
 import type { TextPositionSelector } from '@annotator/selector';
-import type { Chunk, Chunker, ChunkRange } from '../chunker';
+import { describeTextPosition as abstractDescribeTextPosition } from 
'@annotator/selector';
 import { TextNodeChunker } from '../chunker';
-import { CodePointSeeker } from '../code-point-seeker';
 import { ownerDocument } from '../owner-document';
-import { TextSeeker } from '../seek';
 
 export async function describeTextPosition(
   range: Range,
@@ -48,21 +46,3 @@ export async function describeTextPosition(
     textChunks,
   );
 }
-
-async function abstractDescribeTextPosition<TChunk extends Chunk<string>>(
-  target: ChunkRange<TChunk>,
-  scope: Chunker<TChunk>,
-): Promise<TextPositionSelector> {
-  const codeUnitSeeker = new TextSeeker(scope);
-  const codePointSeeker = new CodePointSeeker(codeUnitSeeker);
-
-  codePointSeeker.seekToChunk(target.startChunk, target.startIndex);
-  const start = codePointSeeker.position;
-  codePointSeeker.seekToChunk(target.endChunk, target.endIndex);
-  const end = codePointSeeker.position;
-  return {
-    type: 'TextPositionSelector',
-    start,
-    end,
-  };
-}
diff --git a/packages/dom/src/text-position/match.ts 
b/packages/dom/src/text-position/match.ts
index becd957..3dccede 100644
--- a/packages/dom/src/text-position/match.ts
+++ b/packages/dom/src/text-position/match.ts
@@ -19,10 +19,8 @@
  */
 
 import type { Matcher, TextPositionSelector } from '@annotator/selector';
-import type { Chunk, ChunkRange, Chunker } from '../chunker';
+import { textPositionSelectorMatcher as abstractTextPositionSelectorMatcher } 
from '@annotator/selector';
 import { TextNodeChunker } from '../chunker';
-import { CodePointSeeker } from '../code-point-seeker';
-import { TextSeeker } from '../seek';
 
 export function createTextPositionSelectorMatcher(
   selector: TextPositionSelector,
@@ -39,27 +37,3 @@ export function createTextPositionSelectorMatcher(
     }
   };
 }
-
-export function abstractTextPositionSelectorMatcher(
-  selector: TextPositionSelector,
-): <TChunk extends Chunk<any>>(
-  scope: Chunker<TChunk>,
-) => AsyncGenerator<ChunkRange<TChunk>, void, void> {
-  const { start, end } = selector;
-
-  return async function* matchAll<TChunk extends Chunk<string>>(
-    textChunks: Chunker<TChunk>,
-  ) {
-    const codeUnitSeeker = new TextSeeker(textChunks);
-    const codePointSeeker = new CodePointSeeker(codeUnitSeeker);
-
-    codePointSeeker.seekTo(start);
-    const startChunk = codeUnitSeeker.currentChunk;
-    const startIndex = codeUnitSeeker.offsetInChunk;
-    codePointSeeker.seekTo(end);
-    const endChunk = codeUnitSeeker.currentChunk;
-    const endIndex = codeUnitSeeker.offsetInChunk;
-
-    yield { startChunk, startIndex, endChunk, endIndex };
-  };
-}
diff --git a/packages/dom/src/text-quote/describe.ts 
b/packages/dom/src/text-quote/describe.ts
index 3dfa45e..3ca3f0b 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -19,12 +19,9 @@
  */
 
 import type { TextQuoteSelector } from '@annotator/selector';
-import type { Chunk, Chunker, ChunkRange } from '../chunker';
-import { TextNodeChunker, chunkRangeEquals } from '../chunker';
+import { describeTextQuote as abstractDescribeTextQuote } from 
'@annotator/selector';
+import { TextNodeChunker } from '../chunker';
 import { ownerDocument } from '../owner-document';
-import type { Seeker } from '../seek';
-import { TextSeeker } from '../seek';
-import { abstractTextQuoteSelectorMatcher } from '.';
 
 export async function describeTextQuote(
   range: Range,
@@ -47,111 +44,3 @@ export async function describeTextQuote(
     () => new TextNodeChunker(scope),
   );
 }
-
-async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
-  target: ChunkRange<TChunk>,
-  scope: () => Chunker<TChunk>,
-): Promise<TextQuoteSelector> {
-  const seeker = new TextSeeker(scope());
-
-  // Read the target’s exact text.
-  seeker.seekToChunk(target.startChunk, target.startIndex);
-  const exact = seeker.readToChunk(target.endChunk, target.endIndex);
-
-  // Starting with an empty prefix and suffix, we search for matches. At each 
unintended match
-  // we encounter, we extend the prefix or suffix just enough to ensure it 
will no longer match.
-  let prefix = '';
-  let suffix = '';
-
-  while (true) {
-    const tentativeSelector: TextQuoteSelector = {
-      type: 'TextQuoteSelector',
-      exact,
-      prefix,
-      suffix,
-    };
-
-    const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(
-      scope(),
-    );
-    let nextMatch = await matches.next();
-
-    // If this match is the intended one, no need to act.
-    // XXX This test is fragile: nextMatch and target are assumed to be 
normalised.
-    if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
-      nextMatch = await matches.next();
-    }
-
-    // If there are no more unintended matches, our selector is unambiguous!
-    if (nextMatch.done) return tentativeSelector;
-
-    // Possible optimisation: A subsequent search could safely skip the part we
-    // already processed, instead of starting from the beginning again. But 
we’d
-    // need the matcher to start at the seeker’s position, instead of searching
-    // in the whole current chunk. Then we could just seek back to just after
-    // the start of the prefix: seeker.seekBy(-prefix.length + 1); (don’t 
forget
-    // to also correct for any changes in the prefix we will make below)
-
-    // We’ll have to add more prefix/suffix to disqualify this unintended 
match.
-    const unintendedMatch = nextMatch.value;
-    const seeker1 = new TextSeeker(scope());
-    const seeker2 = new TextSeeker(scope());
-
-    // Count how many characters we’d need as a prefix to disqualify this 
match.
-    seeker1.seekToChunk(target.startChunk, target.startIndex - prefix.length);
-    seeker2.seekToChunk(
-      unintendedMatch.startChunk,
-      unintendedMatch.startIndex - prefix.length,
-    );
-    const extraPrefix = readUntilDifferent(seeker1, seeker2, true);
-
-    // Count how many characters we’d need as a suffix to disqualify this 
match.
-    seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length);
-    seeker2.seekToChunk(
-      unintendedMatch.endChunk,
-      unintendedMatch.endIndex + suffix.length,
-    );
-    const extraSuffix = readUntilDifferent(seeker1, seeker2, false);
-
-    // Use either the prefix or suffix, whichever is shortest.
-    if (
-      extraPrefix !== undefined &&
-      (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length)
-    ) {
-      prefix = extraPrefix + prefix;
-    } else if (extraSuffix !== undefined) {
-      suffix = suffix + extraSuffix;
-    } else {
-      throw new Error(
-        'Target cannot be disambiguated; how could that have happened‽',
-      );
-    }
-  }
-}
-
-function readUntilDifferent(
-  seeker1: Seeker,
-  seeker2: Seeker,
-  reverse: boolean,
-): string | undefined {
-  let result = '';
-  while (true) {
-    let nextCharacter: string;
-    try {
-      nextCharacter = seeker1.read(reverse ? -1 : 1);
-    } catch (err) {
-      return undefined; // Start/end of text reached: cannot expand result.
-    }
-    result = reverse ? nextCharacter + result : result + nextCharacter;
-
-    // Check if the newly added character makes the result differ from the 
second seeker.
-    let comparisonCharacter: string | undefined;
-    try {
-      comparisonCharacter = seeker2.read(reverse ? -1 : 1);
-    } catch (err) {
-      // A RangeError would merely mean seeker2 is exhausted.
-      if (!(err instanceof RangeError)) throw err;
-    }
-    if (nextCharacter !== comparisonCharacter) return result;
-  }
-}
diff --git a/packages/dom/src/text-quote/match.ts 
b/packages/dom/src/text-quote/match.ts
index 9e09990..0c16b8a 100644
--- a/packages/dom/src/text-quote/match.ts
+++ b/packages/dom/src/text-quote/match.ts
@@ -19,7 +19,7 @@
  */
 
 import type { Matcher, TextQuoteSelector } from '@annotator/selector';
-import type { Chunk, Chunker, ChunkRange } from '../chunker';
+import { textQuoteSelectorMatcher as abstractTextQuoteSelectorMatcher } from 
'@annotator/selector';
 import { TextNodeChunker, EmptyScopeError } from '../chunker';
 
 export function createTextQuoteSelectorMatcher(
@@ -42,149 +42,3 @@ export function createTextQuoteSelectorMatcher(
     }
   };
 }
-
-export function abstractTextQuoteSelectorMatcher(
-  selector: TextQuoteSelector,
-): <TChunk extends Chunk<any>>(
-  scope: Chunker<TChunk>,
-) => AsyncGenerator<ChunkRange<TChunk>, void, void> {
-  return async function* matchAll<TChunk extends Chunk<string>>(
-    textChunks: Chunker<TChunk>,
-  ) {
-    const exact = selector.exact;
-    const prefix = selector.prefix || '';
-    const suffix = selector.suffix || '';
-    const searchPattern = prefix + exact + suffix;
-
-    // The code below runs a loop with three steps:
-    // 1. Continue checking any partial matches from the previous chunk(s).
-    // 2. Try find the whole pattern in the chunk (possibly multiple times).
-    // 3. Check if this chunk ends with a partial match (or even multiple 
partial matches).
-
-    interface PartialMatch {
-      startChunk?: TChunk;
-      startIndex?: number;
-      endChunk?: TChunk;
-      endIndex?: number;
-      charactersMatched: number;
-    }
-    let partialMatches: PartialMatch[] = [];
-
-    let isFirstChunk = true;
-    do {
-      const chunk = textChunks.currentChunk;
-      const chunkValue = chunk.data;
-
-      // 1. Continue checking any partial matches from the previous chunk(s).
-      const remainingPartialMatches: typeof partialMatches = [];
-      for (const partialMatch of partialMatches) {
-        const charactersMatched = partialMatch.charactersMatched;
-
-        // If the current chunk contains the start and/or end of the match, 
record these.
-        if (partialMatch.endChunk === undefined) {
-          const charactersUntilMatchEnd =
-            prefix.length + exact.length - charactersMatched;
-          if (charactersUntilMatchEnd <= chunkValue.length) {
-            partialMatch.endChunk = chunk;
-            partialMatch.endIndex = charactersUntilMatchEnd;
-          }
-        }
-        if (partialMatch.startChunk === undefined) {
-          const charactersUntilMatchStart = prefix.length - charactersMatched;
-          if (
-            charactersUntilMatchStart < chunkValue.length ||
-            partialMatch.endChunk !== undefined // handles an edge case: an 
empty quote at the end of a chunk.
-          ) {
-            partialMatch.startChunk = chunk;
-            partialMatch.startIndex = charactersUntilMatchStart;
-          }
-        }
-
-        const charactersUntilSuffixEnd =
-          searchPattern.length - charactersMatched;
-        if (charactersUntilSuffixEnd <= chunkValue.length) {
-          if (
-            chunkValue.startsWith(searchPattern.substring(charactersMatched))
-          ) {
-            yield partialMatch as ChunkRange<TChunk>; // all fields are 
certainly defined now.
-          }
-        } else if (
-          chunkValue ===
-          searchPattern.substring(
-            charactersMatched,
-            charactersMatched + chunkValue.length,
-          )
-        ) {
-          // The chunk is too short to complete the match; comparison has to 
be completed in subsequent chunks.
-          partialMatch.charactersMatched += chunkValue.length;
-          remainingPartialMatches.push(partialMatch);
-        }
-      }
-      partialMatches = remainingPartialMatches;
-
-      // 2. Try find the whole pattern in the chunk (possibly multiple times).
-      if (searchPattern.length <= chunkValue.length) {
-        let fromIndex = 0;
-        while (fromIndex <= chunkValue.length) {
-          const patternStartIndex = chunkValue.indexOf(
-            searchPattern,
-            fromIndex,
-          );
-          if (patternStartIndex === -1) break;
-          fromIndex = patternStartIndex + 1;
-
-          // Handle edge case: an empty searchPattern would already have been 
yielded at the end of the last chunk.
-          if (
-            patternStartIndex === 0 &&
-            searchPattern.length === 0 &&
-            !isFirstChunk
-          )
-            continue;
-
-          yield {
-            startChunk: chunk,
-            startIndex: patternStartIndex + prefix.length,
-            endChunk: chunk,
-            endIndex: patternStartIndex + prefix.length + exact.length,
-          };
-        }
-      }
-
-      // 3. Check if this chunk ends with a partial match (or even multiple 
partial matches).
-      let newPartialMatches: number[] = [];
-      const searchStartPoint = Math.max(
-        chunkValue.length - searchPattern.length + 1,
-        0,
-      );
-      for (let i = searchStartPoint; i < chunkValue.length; i++) {
-        const character = chunkValue[i];
-        newPartialMatches = newPartialMatches.filter(
-          (partialMatchStartIndex) =>
-            character === searchPattern[i - partialMatchStartIndex],
-        );
-        if (character === searchPattern[0]) newPartialMatches.push(i);
-      }
-      for (const partialMatchStartIndex of newPartialMatches) {
-        const charactersMatched = chunkValue.length - partialMatchStartIndex;
-        const partialMatch: PartialMatch = {
-          charactersMatched,
-        };
-        if (charactersMatched >= prefix.length + exact.length) {
-          partialMatch.endChunk = chunk;
-          partialMatch.endIndex =
-            partialMatchStartIndex + prefix.length + exact.length;
-        }
-        if (
-          charactersMatched > prefix.length ||
-          partialMatch.endChunk !== undefined // handles an edge case: an 
empty quote at the end of a chunk.
-        ) {
-          partialMatch.startChunk = chunk;
-          partialMatch.startIndex = partialMatchStartIndex + prefix.length;
-        }
-        partialMatches.push(partialMatch);
-      }
-
-      isFirstChunk = false;
-    } while (textChunks.nextChunk() !== null);
-  };
-}
diff --git a/packages/selector/src/index.ts b/packages/selector/src/index.ts
index 73caa05..adefd04 100644
--- a/packages/selector/src/index.ts
+++ b/packages/selector/src/index.ts
@@ -27,6 +27,7 @@ export type {
   TextPositionSelector,
   TextQuoteSelector,
 } from './types';
+export * from './text';
 
 export function makeRefinable<
   // Any subtype of Selector can be made refinable; but note we limit the value
diff --git a/packages/selector/src/text/chunker.ts 
b/packages/selector/src/text/chunker.ts
new file mode 100644
index 0000000..eb0d970
--- /dev/null
+++ b/packages/selector/src/text/chunker.ts
@@ -0,0 +1,69 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// A Chunk represents a fragment (typically a string) of some document.
+// Subclasses can add further attributes to map the chunk to its position in 
the
+// data structure it came from (e.g. a DOM node).
+export interface Chunk<TData> {
+  readonly data: TData;
+  equals?(otherChunk: this): boolean;
+}
+
+export function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean {
+  return chunk1.equals ? chunk1.equals(chunk2) : chunk1 === chunk2;
+}
+
+export interface ChunkRange<TChunk extends Chunk<any>> {
+  startChunk: TChunk;
+  startIndex: number;
+  endChunk: TChunk;
+  endIndex: number;
+}
+
+export function chunkRangeEquals(
+  range1: ChunkRange<any>,
+  range2: ChunkRange<any>,
+): boolean {
+  return (
+    chunkEquals(range1.startChunk, range2.startChunk) &&
+    chunkEquals(range1.endChunk, range2.endChunk) &&
+    range1.startIndex === range2.startIndex &&
+    range1.endIndex === range2.endIndex
+  );
+}
+
+// A Chunker lets one walk through the chunks of a document.
+// It is inspired by, and similar to, the DOM’s NodeIterator. (but unlike
+// NodeIterator, it has no concept of being ‘before’ or ‘after’ a chunk)
+export interface Chunker<TChunk extends Chunk<any>> {
+  // The chunk currently being pointed at.
+  readonly currentChunk: TChunk;
+
+  // Move currentChunk to the chunk following it, and return that chunk.
+  // If there are no chunks following it, keep currentChunk unchanged and 
return null.
+  nextChunk(): TChunk | null;
+
+  // Move currentChunk to the chunk preceding it, and return that chunk.
+  // If there are no preceding chunks, keep currentChunk unchanged and return 
null.
+  previousChunk(): TChunk | null;
+
+  // Test if a given chunk is before the current chunk.
+  precedesCurrentChunk(chunk: TChunk): boolean;
+}
diff --git a/packages/dom/src/code-point-seeker.ts 
b/packages/selector/src/text/code-point-seeker.ts
similarity index 100%
rename from packages/dom/src/code-point-seeker.ts
rename to packages/selector/src/text/code-point-seeker.ts
diff --git a/packages/dom/src/text-position/describe.ts 
b/packages/selector/src/text/describe-text-position.ts
similarity index 58%
copy from packages/dom/src/text-position/describe.ts
copy to packages/selector/src/text/describe-text-position.ts
index 5f7f9a3..99485b0 100644
--- a/packages/dom/src/text-position/describe.ts
+++ b/packages/selector/src/text/describe-text-position.ts
@@ -19,37 +19,11 @@
  */
 
 import type { TextPositionSelector } from '@annotator/selector';
-import type { Chunk, Chunker, ChunkRange } from '../chunker';
-import { TextNodeChunker } from '../chunker';
-import { CodePointSeeker } from '../code-point-seeker';
-import { ownerDocument } from '../owner-document';
-import { TextSeeker } from '../seek';
+import type { Chunk, Chunker, ChunkRange } from './chunker';
+import { CodePointSeeker } from './code-point-seeker';
+import { TextSeeker } from './seek';
 
-export async function describeTextPosition(
-  range: Range,
-  maybeScope?: Range,
-): Promise<TextPositionSelector> {
-  // Default to search in the whole document.
-  let scope: Range;
-  if (maybeScope !== undefined) {
-    scope = maybeScope;
-  } else {
-    const document = ownerDocument(range);
-    scope = document.createRange();
-    scope.selectNodeContents(document);
-  }
-
-  const textChunks = new TextNodeChunker(scope);
-  if (textChunks.currentChunk === null)
-    throw new RangeError('Range does not contain any Text nodes.');
-
-  return await abstractDescribeTextPosition(
-    textChunks.rangeToChunkRange(range),
-    textChunks,
-  );
-}
-
-async function abstractDescribeTextPosition<TChunk extends Chunk<string>>(
+export async function describeTextPosition<TChunk extends Chunk<string>>(
   target: ChunkRange<TChunk>,
   scope: Chunker<TChunk>,
 ): Promise<TextPositionSelector> {
diff --git a/packages/selector/src/text/describe-text-quote.ts 
b/packages/selector/src/text/describe-text-quote.ts
new file mode 100644
index 0000000..c97a4f8
--- /dev/null
+++ b/packages/selector/src/text/describe-text-quote.ts
@@ -0,0 +1,134 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import type { TextQuoteSelector } from '@annotator/selector';
+import type { Chunk, Chunker, ChunkRange } from './chunker';
+import { chunkRangeEquals } from './chunker';
+import type { Seeker } from './seek';
+import { TextSeeker } from './seek';
+import { textQuoteSelectorMatcher } from '.';
+
+export async function describeTextQuote<TChunk extends Chunk<string>>(
+    target: ChunkRange<TChunk>,
+    scope: () => Chunker<TChunk>,
+  ): Promise<TextQuoteSelector> {
+    const seeker = new TextSeeker(scope());
+
+    // Read the target’s exact text.
+    seeker.seekToChunk(target.startChunk, target.startIndex);
+    const exact = seeker.readToChunk(target.endChunk, target.endIndex);
+
+    // Starting with an empty prefix and suffix, we search for matches. At 
each unintended match
+    // we encounter, we extend the prefix or suffix just enough to ensure it 
will no longer match.
+    let prefix = '';
+    let suffix = '';
+
+    while (true) {
+      const tentativeSelector: TextQuoteSelector = {
+        type: 'TextQuoteSelector',
+        exact,
+        prefix,
+        suffix,
+      };
+
+      const matches = textQuoteSelectorMatcher(tentativeSelector)(
+        scope(),
+      );
+      let nextMatch = await matches.next();
+
+      // If this match is the intended one, no need to act.
+      // XXX This test is fragile: nextMatch and target are assumed to be 
normalised.
+      if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
+        nextMatch = await matches.next();
+      }
+
+      // If there are no more unintended matches, our selector is unambiguous!
+      if (nextMatch.done) return tentativeSelector;
+
+      // Possible optimisation: A subsequent search could safely skip the part 
we
+      // already processed, instead of starting from the beginning again. But 
we’d
+      // need the matcher to start at the seeker’s position, instead of 
searching
+      // in the whole current chunk. Then we could just seek back to just after
+      // the start of the prefix: seeker.seekBy(-prefix.length + 1); (don’t 
forget
+      // to also correct for any changes in the prefix we will make below)
+
+      // We’ll have to add more prefix/suffix to disqualify this unintended 
match.
+      const unintendedMatch = nextMatch.value;
+      const seeker1 = new TextSeeker(scope());
+      const seeker2 = new TextSeeker(scope());
+
+      // Count how many characters we’d need as a prefix to disqualify this 
match.
+      seeker1.seekToChunk(target.startChunk, target.startIndex - 
prefix.length);
+      seeker2.seekToChunk(
+        unintendedMatch.startChunk,
+        unintendedMatch.startIndex - prefix.length,
+      );
+      const extraPrefix = readUntilDifferent(seeker1, seeker2, true);
+
+      // Count how many characters we’d need as a suffix to disqualify this 
match.
+      seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length);
+      seeker2.seekToChunk(
+        unintendedMatch.endChunk,
+        unintendedMatch.endIndex + suffix.length,
+      );
+      const extraSuffix = readUntilDifferent(seeker1, seeker2, false);
+
+      // Use either the prefix or suffix, whichever is shortest.
+      if (
+        extraPrefix !== undefined &&
+        (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length)
+      ) {
+        prefix = extraPrefix + prefix;
+      } else if (extraSuffix !== undefined) {
+        suffix = suffix + extraSuffix;
+      } else {
+        throw new Error(
+          'Target cannot be disambiguated; how could that have happened‽',
+        );
+      }
+    }
+  }
+
+  function readUntilDifferent(
+    seeker1: Seeker,
+    seeker2: Seeker,
+    reverse: boolean,
+  ): string | undefined {
+    let result = '';
+    while (true) {
+      let nextCharacter: string;
+      try {
+        nextCharacter = seeker1.read(reverse ? -1 : 1);
+      } catch (err) {
+        return undefined; // Start/end of text reached: cannot expand result.
+      }
+      result = reverse ? nextCharacter + result : result + nextCharacter;
+
+      // Check if the newly added character makes the result differ from the 
second seeker.
+      let comparisonCharacter: string | undefined;
+      try {
+        comparisonCharacter = seeker2.read(reverse ? -1 : 1);
+      } catch (err) {
+        // A RangeError would merely mean seeker2 is exhausted.
+        if (!(err instanceof RangeError)) throw err;
+      }
+      if (nextCharacter !== comparisonCharacter) return result;
+    }
+  }
diff --git a/packages/selector/src/text/index.ts 
b/packages/selector/src/text/index.ts
new file mode 100644
index 0000000..ca88b66
--- /dev/null
+++ b/packages/selector/src/text/index.ts
@@ -0,0 +1,5 @@
+export * from './describe-text-quote';
+export * from './match-text-quote';
+export * from './describe-text-position';
+export * from './match-text-position';
+export * from './chunker';
diff --git a/packages/dom/src/text-position/match.ts 
b/packages/selector/src/text/match-text-position.ts
similarity index 66%
copy from packages/dom/src/text-position/match.ts
copy to packages/selector/src/text/match-text-position.ts
index becd957..c00f619 100644
--- a/packages/dom/src/text-position/match.ts
+++ b/packages/selector/src/text/match-text-position.ts
@@ -18,29 +18,12 @@
  * under the License.
  */
 
-import type { Matcher, TextPositionSelector } from '@annotator/selector';
-import type { Chunk, ChunkRange, Chunker } from '../chunker';
-import { TextNodeChunker } from '../chunker';
-import { CodePointSeeker } from '../code-point-seeker';
-import { TextSeeker } from '../seek';
+import type { TextPositionSelector } from '@annotator/selector';
+import type { Chunk, ChunkRange, Chunker } from './chunker';
+import { CodePointSeeker } from './code-point-seeker';
+import { TextSeeker } from './seek';
 
-export function createTextPositionSelectorMatcher(
-  selector: TextPositionSelector,
-): Matcher<Range, Range> {
-  const abstractMatcher = abstractTextPositionSelectorMatcher(selector);
-
-  return async function* matchAll(scope) {
-    const textChunks = new TextNodeChunker(scope);
-
-    const matches = abstractMatcher(textChunks);
-
-    for await (const abstractMatch of matches) {
-      yield textChunks.chunkRangeToRange(abstractMatch);
-    }
-  };
-}
-
-export function abstractTextPositionSelectorMatcher(
+export function textPositionSelectorMatcher(
   selector: TextPositionSelector,
 ): <TChunk extends Chunk<any>>(
   scope: Chunker<TChunk>,
diff --git a/packages/selector/src/text/match-text-quote.ts 
b/packages/selector/src/text/match-text-quote.ts
new file mode 100644
index 0000000..8e90a2f
--- /dev/null
+++ b/packages/selector/src/text/match-text-quote.ts
@@ -0,0 +1,168 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import type { TextQuoteSelector } from '@annotator/selector';
+import type { Chunk, Chunker, ChunkRange } from './chunker';
+
+export function textQuoteSelectorMatcher(
+    selector: TextQuoteSelector,
+  ): <TChunk extends Chunk<any>>(
+    scope: Chunker<TChunk>,
+  ) => AsyncGenerator<ChunkRange<TChunk>, void, void> {
+    return async function* matchAll<TChunk extends Chunk<string>>(
+      textChunks: Chunker<TChunk>,
+    ) {
+      const exact = selector.exact;
+      const prefix = selector.prefix || '';
+      const suffix = selector.suffix || '';
+      const searchPattern = prefix + exact + suffix;
+
+      // The code below runs a loop with three steps:
+      // 1. Continue checking any partial matches from the previous chunk(s).
+      // 2. Try find the whole pattern in the chunk (possibly multiple times).
+      // 3. Check if this chunk ends with a partial match (or even multiple 
partial matches).
+
+      interface PartialMatch {
+        startChunk?: TChunk;
+        startIndex?: number;
+        endChunk?: TChunk;
+        endIndex?: number;
+        charactersMatched: number;
+      }
+      let partialMatches: PartialMatch[] = [];
+
+      let isFirstChunk = true;
+      do {
+        const chunk = textChunks.currentChunk;
+        const chunkValue = chunk.data;
+
+        // 1. Continue checking any partial matches from the previous chunk(s).
+        const remainingPartialMatches: typeof partialMatches = [];
+        for (const partialMatch of partialMatches) {
+          const charactersMatched = partialMatch.charactersMatched;
+
+          // If the current chunk contains the start and/or end of the match, 
record these.
+          if (partialMatch.endChunk === undefined) {
+            const charactersUntilMatchEnd =
+              prefix.length + exact.length - charactersMatched;
+            if (charactersUntilMatchEnd <= chunkValue.length) {
+              partialMatch.endChunk = chunk;
+              partialMatch.endIndex = charactersUntilMatchEnd;
+            }
+          }
+          if (partialMatch.startChunk === undefined) {
+            const charactersUntilMatchStart = prefix.length - 
charactersMatched;
+            if (
+              charactersUntilMatchStart < chunkValue.length ||
+              partialMatch.endChunk !== undefined // handles an edge case: an 
empty quote at the end of a chunk.
+            ) {
+              partialMatch.startChunk = chunk;
+              partialMatch.startIndex = charactersUntilMatchStart;
+            }
+          }
+
+          const charactersUntilSuffixEnd =
+            searchPattern.length - charactersMatched;
+          if (charactersUntilSuffixEnd <= chunkValue.length) {
+            if (
+              chunkValue.startsWith(searchPattern.substring(charactersMatched))
+            ) {
+              yield partialMatch as ChunkRange<TChunk>; // all fields are 
certainly defined now.
+            }
+          } else if (
+            chunkValue ===
+            searchPattern.substring(
+              charactersMatched,
+              charactersMatched + chunkValue.length,
+            )
+          ) {
+            // The chunk is too short to complete the match; comparison has to 
be completed in subsequent chunks.
+            partialMatch.charactersMatched += chunkValue.length;
+            remainingPartialMatches.push(partialMatch);
+          }
+        }
+        partialMatches = remainingPartialMatches;
+
+        // 2. Try find the whole pattern in the chunk (possibly multiple 
times).
+        if (searchPattern.length <= chunkValue.length) {
+          let fromIndex = 0;
+          while (fromIndex <= chunkValue.length) {
+            const patternStartIndex = chunkValue.indexOf(
+              searchPattern,
+              fromIndex,
+            );
+            if (patternStartIndex === -1) break;
+            fromIndex = patternStartIndex + 1;
+
+            // Handle edge case: an empty searchPattern would already have 
been yielded at the end of the last chunk.
+            if (
+              patternStartIndex === 0 &&
+              searchPattern.length === 0 &&
+              !isFirstChunk
+            )
+              continue;
+
+            yield {
+              startChunk: chunk,
+              startIndex: patternStartIndex + prefix.length,
+              endChunk: chunk,
+              endIndex: patternStartIndex + prefix.length + exact.length,
+            };
+          }
+        }
+
+        // 3. Check if this chunk ends with a partial match (or even multiple 
partial matches).
+        let newPartialMatches: number[] = [];
+        const searchStartPoint = Math.max(
+          chunkValue.length - searchPattern.length + 1,
+          0,
+        );
+        for (let i = searchStartPoint; i < chunkValue.length; i++) {
+          const character = chunkValue[i];
+          newPartialMatches = newPartialMatches.filter(
+            (partialMatchStartIndex) =>
+              character === searchPattern[i - partialMatchStartIndex],
+          );
+          if (character === searchPattern[0]) newPartialMatches.push(i);
+        }
+        for (const partialMatchStartIndex of newPartialMatches) {
+          const charactersMatched = chunkValue.length - partialMatchStartIndex;
+          const partialMatch: PartialMatch = {
+            charactersMatched,
+          };
+          if (charactersMatched >= prefix.length + exact.length) {
+            partialMatch.endChunk = chunk;
+            partialMatch.endIndex =
+              partialMatchStartIndex + prefix.length + exact.length;
+          }
+          if (
+            charactersMatched > prefix.length ||
+            partialMatch.endChunk !== undefined // handles an edge case: an 
empty quote at the end of a chunk.
+          ) {
+            partialMatch.startChunk = chunk;
+            partialMatch.startIndex = partialMatchStartIndex + prefix.length;
+          }
+          partialMatches.push(partialMatch);
+        }
+
+        isFirstChunk = false;
+      } while (textChunks.nextChunk() !== null);
+    };
+  }
diff --git a/packages/dom/src/seek.ts b/packages/selector/src/text/seek.ts
similarity index 100%
rename from packages/dom/src/seek.ts
rename to packages/selector/src/text/seek.ts

[incubator-annotator] 02/03: Move abstract code into @annotator/selector

Reply via email to