[incubator-annotator] 02/03: Generate less minimal prefixes&suffixes

gerben Mon, 04 Jan 2021 09:40:48 -0800

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch more-context
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git


commit d967698f6c46515a7996bdfd9902511849a932f1
Author: Gerben <[email protected]>
AuthorDate: Thu Dec 24 17:09:22 2020 +0100

    Generate less minimal prefixes&suffixes
    
    1. Round them up to the next whitespace.
    2. Optionally add prefix&suffix around a short quote even if it is not
       ambiguous.
    
    The previous behaviour can still be obtained using the option
    `minimalContext`, which is especially useful if robustness against
    document variations is not required.
    
    Also refactor a bit, reusing the seekers instead of creating new ones
    on every match.
---
 packages/dom/src/text-quote/describe.ts           |   4 +-
 packages/selector/src/text/describe-text-quote.ts | 182 ++++++++++++++++++----
 packages/selector/src/text/seeker.ts              |  24 +--
 web/demo/index.js                                 |   2 +-
 4 files changed, 167 insertions(+), 45 deletions(-)

diff --git a/packages/dom/src/text-quote/describe.ts 
b/packages/dom/src/text-quote/describe.ts
index 45b8adc..ecbbd36 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -18,7 +18,7 @@
  * under the License.
  */
 
-import type { TextQuoteSelector } from '@annotator/selector';
+import type { TextQuoteSelector, DescribeTextQuoteOptions } from 
'@annotator/selector';
 import { describeTextQuote as abstractDescribeTextQuote } from 
'@annotator/selector';
 import { TextNodeChunker } from '../text-node-chunker';
 import { ownerDocument } from '../owner-document';
@@ -26,6 +26,7 @@ import { ownerDocument } from '../owner-document';
 export async function describeTextQuote(
   range: Range,
   maybeScope?: Range,
+  options: DescribeTextQuoteOptions = {},
 ): Promise<TextQuoteSelector> {
   // Default to search in the whole document.
   let scope: Range;
@@ -42,5 +43,6 @@ export async function describeTextQuote(
   return await abstractDescribeTextQuote(
     chunker.rangeToChunkRange(range),
     () => new TextNodeChunker(scope),
+    options,
   );
 }
diff --git a/packages/selector/src/text/describe-text-quote.ts 
b/packages/selector/src/text/describe-text-quote.ts
index 9b6526a..b3c4d48 100644
--- a/packages/selector/src/text/describe-text-quote.ts
+++ b/packages/selector/src/text/describe-text-quote.ts
@@ -25,21 +25,105 @@ import type { RelativeSeeker } from './seeker';
 import { TextSeeker } from './seeker';
 import { textQuoteSelectorMatcher } from '.';
 
+export interface DescribeTextQuoteOptions {
+  /**
+   * Keep prefix and suffix to the minimum that is necessary to disambiguate
+   * the quote. Use only if robustness against text variations is not required.
+   */
+  minimalContext?: boolean;
+
+  /**
+   * Add prefix and suffix to quotes below this length, such that the total of
+   * prefix + exact + suffix is at least this length.
+   */
+  minimumQuoteLength?: number
+
+  /**
+   * When attempting to find a whitespace to make the prefix/suffix start/end
+   * (resp.) at a word boundary, give up after this number of characters.
+   */
+  maxWordLength?: number;
+}
+
+/**
+ * Returns a {@link TextQuoteSelector} that points at the target quote in the
+ * given text.
+ *
+ * @remarks
+ * The selector will contain the *exact* target quote, and in case this quote
+ * appears multiple times in the text, sufficient context around the quote will
+ * be included in the selector’s *prefix* and *suffix* attributes to
+ * disambiguate. By default, more prefix and suffix are included than strictly
+ * required; both in order to be robust against slight modifications, and in an
+ * attempt to not end halfway through a word (mainly for the sake of human 
readability).
+ *
+ * @param target - The range of characters that the selector should describe
+ * @param scope - The text containing the target range; or, more accurately, a
+ * function creating {@link Chunker}s that allow walking through the text.
+ * @param options
+ * @returns the {@link TextQuoteSelector} that describes *target*.
+ */
 export async function describeTextQuote<TChunk extends Chunk<string>>(
   target: ChunkRange<TChunk>,
   scope: () => Chunker<TChunk>,
+  {
+    minimalContext = false,
+    minimumQuoteLength = 0,
+    maxWordLength = 50,
+  }: DescribeTextQuoteOptions = {},
 ): Promise<TextQuoteSelector> {
-  const seeker = new TextSeeker(scope());
+  // Create a seeker to read the target quote and the context around it.
+  // TODO Possible optimisation: as it need not be an AbsoluteSeeker, a
+  // different implementation could provide direct ‘jump’ access in seekToChunk
+  // (the scope’s Chunker would of course also have to support this).
+  const seekerAtTarget = new TextSeeker(scope());
+
+  // Create a second seeker so that we will be able to simultaneously read
+  // characters near both the target and an unintended match, if we find any.
+  const seekerAtUnintendedMatch = new TextSeeker(scope());
 
   // Read the target’s exact text.
-  seeker.seekToChunk(target.startChunk, target.startIndex);
-  const exact = seeker.readToChunk(target.endChunk, target.endIndex);
+  seekerAtTarget.seekToChunk(target.startChunk, target.startIndex);
+  const exact = seekerAtTarget.readToChunk(target.endChunk, target.endIndex);
 
-  // Starting with an empty prefix and suffix, we search for matches. At each 
unintended match
-  // we encounter, we extend the prefix or suffix just enough to ensure it 
will no longer match.
+  // Start with an empty prefix and suffix.
   let prefix = '';
   let suffix = '';
 
+  // If the quote is below the given minimum length, add some prefix & suffix.
+  const currentQuoteLength = () => prefix.length + exact.length + 
suffix.length;
+  if (currentQuoteLength() < minimumQuoteLength) {
+    // Expand the prefix, but only to reach halfway towards the desired length.
+    seekerAtTarget.seekToChunk(target.startChunk, target.startIndex - 
prefix.length);
+    const length = Math.floor((minimumQuoteLength - currentQuoteLength()) / 2);
+    prefix = seekerAtTarget.read(-length, false, true) + prefix;
+
+    // If needed, expand the suffix to achieve the minimum length.
+    if (currentQuoteLength() < minimumQuoteLength) {
+      seekerAtTarget.seekToChunk(target.endChunk, target.endIndex + 
suffix.length);
+      const length = minimumQuoteLength - currentQuoteLength();
+      suffix = suffix + seekerAtTarget.read(length, false, true);
+
+      // We might have to expand the prefix again (if at the end of the scope).
+      if (currentQuoteLength() < minimumQuoteLength) {
+        seekerAtTarget.seekToChunk(target.startChunk, target.startIndex - 
prefix.length);
+        const length = minimumQuoteLength - currentQuoteLength();
+        prefix = seekerAtTarget.read(-length, false, true) + prefix;
+      }
+    }
+  }
+
+  // Expand prefix & suffix to avoid them ending somewhere halfway in a word.
+  if (!minimalContext) {
+    seekerAtTarget.seekToChunk(target.startChunk, target.startIndex - 
prefix.length);
+    prefix = readUntilWhitespace(seekerAtTarget, maxWordLength, true) + prefix;
+    seekerAtTarget.seekToChunk(target.endChunk, target.endIndex + 
suffix.length);
+    suffix = suffix + readUntilWhitespace(seekerAtTarget, maxWordLength, 
false);
+  }
+
+  // Search for matches of the quote using the current prefix and suffix. At
+  // each unintended match we encounter, we extend the prefix or suffix to
+  // ensure it will no longer match.
   while (true) {
     const tentativeSelector: TextQuoteSelector = {
       type: 'TextQuoteSelector',
@@ -48,9 +132,7 @@ export async function describeTextQuote<TChunk extends 
Chunk<string>>(
       suffix,
     };
 
-    const matches = textQuoteSelectorMatcher(tentativeSelector)(
-      scope(),
-    );
+    const matches = textQuoteSelectorMatcher(tentativeSelector)(scope());
     let nextMatch = await matches.next();
 
     // If this match is the intended one, no need to act.
@@ -72,42 +154,44 @@ export async function describeTextQuote<TChunk extends 
Chunk<string>>(
     // We’ll have to add more prefix/suffix to disqualify this unintended 
match.
     const unintendedMatch = nextMatch.value;
 
-    // Create two seekers to simultaneously read characters near both the 
target
-    // and the unintended match.
-    // Possible optimisation: as these need not be AbsoluteSeekers, a different
-    // implementation could provide direct ‘jump’ access in seekToChunk (the
-    // scope’s Chunker would of course also have to support this).
-    const seeker1 = new TextSeeker(scope());
-    const seeker2 = new TextSeeker(scope());
-
     // Count how many characters we’d need as a prefix to disqualify this 
match.
-    seeker1.seekToChunk(target.startChunk, target.startIndex - prefix.length);
-    seeker2.seekToChunk(
+    seekerAtTarget.seekToChunk(target.startChunk, target.startIndex - 
prefix.length);
+    seekerAtUnintendedMatch.seekToChunk(
       unintendedMatch.startChunk,
       unintendedMatch.startIndex - prefix.length,
     );
-    const extraPrefix = readUntilDifferent(seeker1, seeker2, true);
+    let extraPrefix = readUntilDifferent(seekerAtTarget, 
seekerAtUnintendedMatch, true);
+    if (extraPrefix !== undefined && !minimalContext)
+      extraPrefix = readUntilWhitespace(seekerAtTarget, maxWordLength, true) + 
extraPrefix;
 
     // Count how many characters we’d need as a suffix to disqualify this 
match.
-    seeker1.seekToChunk(target.endChunk, target.endIndex + suffix.length);
-    seeker2.seekToChunk(
+    seekerAtTarget.seekToChunk(target.endChunk, target.endIndex + 
suffix.length);
+    seekerAtUnintendedMatch.seekToChunk(
       unintendedMatch.endChunk,
       unintendedMatch.endIndex + suffix.length,
     );
-    const extraSuffix = readUntilDifferent(seeker1, seeker2, false);
-
-    // Use either the prefix or suffix, whichever is shortest.
-    if (
-      extraPrefix !== undefined &&
-      (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length)
-    ) {
-      prefix = extraPrefix + prefix;
-    } else if (extraSuffix !== undefined) {
-      suffix = suffix + extraSuffix;
+    let extraSuffix = readUntilDifferent(seekerAtTarget, 
seekerAtUnintendedMatch, false);
+    if (extraSuffix !== undefined && !minimalContext)
+      extraSuffix = extraSuffix + readUntilWhitespace(seekerAtTarget, 
maxWordLength, false);
+
+    if (minimalContext) {
+      // Use either the prefix or suffix, whichever is shortest.
+      if (
+        extraPrefix !== undefined &&
+        (extraSuffix === undefined || extraPrefix.length <= extraSuffix.length)
+      ) {
+        prefix = extraPrefix + prefix;
+      } else if (extraSuffix !== undefined) {
+        suffix = suffix + extraSuffix;
+      } else {
+        throw new Error(
+          'Target cannot be disambiguated; how could that have happened‽',
+        );
+      }
     } else {
-      throw new Error(
-        'Target cannot be disambiguated; how could that have happened‽',
-      );
+      // For redundancy, expand both prefix and suffix.
+      if (extraPrefix !== undefined) prefix = extraPrefix + prefix;
+      if (extraSuffix !== undefined) suffix = suffix + extraSuffix;
     }
   }
 }
@@ -138,3 +222,33 @@ function readUntilDifferent(
     if (nextCharacter !== comparisonCharacter) return result;
   }
 }
+
+function readUntilWhitespace(
+  seeker: RelativeSeeker,
+  limit: number = Infinity,
+  reverse = false
+): string {
+  let result = '';
+  while (result.length < limit) {
+    let nextCharacter: string;
+    try {
+      nextCharacter = seeker.read(reverse ? -1 : 1);
+    } catch (err) {
+      if (!(err instanceof RangeError)) throw err;
+      break; // End/start of text reached.
+    }
+
+    // Stop if we reached whitespace.
+    if (isWhitespace(nextCharacter)) {
+      seeker.seekBy(reverse ? 1 : -1); // ‘undo’ the last read.
+      break;
+    }
+
+    result = reverse ? nextCharacter + result : result + nextCharacter;
+  }
+  return result;
+}
+
+function isWhitespace(s: string): boolean {
+  return s.match(/^\s+$/) !== null;
+}
diff --git a/packages/selector/src/text/seeker.ts 
b/packages/selector/src/text/seeker.ts
index 2c1f788..1605d45 100644
--- a/packages/selector/src/text/seeker.ts
+++ b/packages/selector/src/text/seeker.ts
@@ -69,12 +69,14 @@ export interface RelativeSeeker<TData extends Iterable<any> 
= string> {
    * backwards in the file.
    * @param roundUp - If true, then, after reading the given number of
    * characters, read further until the end (or start) of the current chunk.
+   * @param lessIsFine - If true, and there are not enough characters in the
+   * file, return the result so far instead of throwing an error.
    * @returns The characters passed (in their normal order, even when moving
    * backwards)
-   * @throws RangeError if there are not enough characters in the file. The
-   * pointer is left at the end/start of the file.
+   * @throws RangeError if there are not enough characters in the file (unless
+   * `lessIsFine` is true). The pointer is left at the end/start of the file.
    */
-  read(length?: number, roundUp?: boolean): TData;
+  read(length?: number, roundUp?: boolean, lessIsFine?: boolean): TData;
 }
 
 /**
@@ -195,8 +197,8 @@ export class TextSeeker<TChunk extends Chunk<string>>
     this.seekTo(0);
   }
 
-  read(length: number, roundUp = false): string {
-    return this.readTo(this.position + length, roundUp);
+  read(length: number, roundUp = false, lessIsFine = false): string {
+    return this._readOrSeekTo(true, this.position + length, roundUp, 
lessIsFine);
   }
 
   readTo(target: number, roundUp = false): string {
@@ -277,12 +279,13 @@ export class TextSeeker<TChunk extends Chunk<string>>
     }
   }
 
-  private _readOrSeekTo(read: true, target: number, roundUp?: boolean): string;
-  private _readOrSeekTo(read: false, target: number, roundUp?: boolean): void;
+  private _readOrSeekTo(read: true, target: number, roundUp?: boolean, 
lessIsFine?: boolean): string;
+  private _readOrSeekTo(read: false, target: number, roundUp?: boolean, 
lessIsFine?: boolean): void;
   private _readOrSeekTo(
     read: boolean,
     target: number,
     roundUp = false,
+    lessIsFine = false,
   ): string | void {
     let result = '';
 
@@ -298,7 +301,7 @@ export class TextSeeker<TChunk extends Chunk<string>>
           const [data, nextChunk] = this._readToNextChunk();
           if (read) result += data;
           if (nextChunk === null) {
-            if (this.position === target) break;
+            if (this.position === target || lessIsFine) break;
             else throw new RangeError(E_END);
           }
         } else {
@@ -335,7 +338,10 @@ export class TextSeeker<TChunk extends Chunk<string>>
         } else {
           const [data, previousChunk] = this._readToPreviousChunk();
           if (read) result = data + result;
-          if (previousChunk === null) throw new RangeError(E_END);
+          if (previousChunk === null) {
+            if (lessIsFine) break;
+            else throw new RangeError(E_END);
+          }
         }
       }
     }
diff --git a/web/demo/index.js b/web/demo/index.js
index d513252..a9773a4 100644
--- a/web/demo/index.js
+++ b/web/demo/index.js
@@ -138,7 +138,7 @@ async function onSelectionChange() {
     const selector =
       describeMode === 'TextPosition'
         ? await describeTextPosition(range, scope)
-        : await describeTextQuote(range, scope);
+        : await describeTextQuote(range, scope, { minimumQuoteLength: 10 });
     await anchor(selector);
   }
 }

[incubator-annotator] 02/03: Generate less minimal prefixes&suffixes

Reply via email to