[incubator-annotator] 01/02: WIP make describe quote work

gerben Wed, 11 Nov 2020 07:56:57 -0800

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git


commit bffbeaf17076905576ef1fcbb91ba0f67f2af787
Author: Gerben <[email protected]>
AuthorDate: Fri Nov 6 22:20:22 2020 +0100

    WIP make describe quote work
---
 packages/dom/src/seek.ts                |  46 ++++++++-
 packages/dom/src/text-quote/describe.ts | 163 ++++++++++++++------------------
 2 files changed, 112 insertions(+), 97 deletions(-)

diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index c055291..7d7c107 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -18,11 +18,11 @@
  * under the License.
  */
 
-import { Chunk, TextNodeChunker, PartialTextNode } from "./chunker";
+import { Chunk, Chunker, TextNodeChunker, PartialTextNode, chunkEquals } from 
"./chunker";
 
 const E_END = 'Iterator exhausted before seek ended.';
 
-interface NonEmptyChunker<TChunk extends Chunk<any>> {
+export interface NonEmptyChunker<TChunk extends Chunk<any>> {
   readonly currentChunk: TChunk;
   nextChunk(): TChunk | null;
   previousChunk(): TChunk | null;
@@ -77,6 +77,28 @@ export class TextSeeker<TChunk extends Chunk<string>> 
implements Seeker<string>
     this._readOrSeekTo(false, target);
   }
 
+  seekToChunk(target: TChunk, offset: number = 0) {
+    this._readOrSeekToChunk(false, target, offset);
+  }
+
+  readToChunk(target: TChunk, offset: number = 0): string {
+    return this._readOrSeekToChunk(true, target, offset);
+  }
+
+  private _readOrSeekToChunk(read: true, target: TChunk, offset?: number): 
string
+  private _readOrSeekToChunk(read: false, target: TChunk, offset?: number): 
void
+  private _readOrSeekToChunk(read: boolean, target: TChunk, offset: number = 
0): string {
+    // XXX We have no way of knowing whether a chunk will follow or precedes 
the current chunk; we assume it follows.
+    let result = '';
+    // This will throw a RangeError if we reach the end without encountering 
the target chunk.
+    while (!chunkEquals(this.currentChunk, target)) {
+      if (read) result += this.read(1, true);
+    }
+    if (offset > this.offsetInChunk)
+      result += this.read(offset - this.offsetInChunk);
+    return result;
+  }
+
   private _readOrSeekTo(read: true, target: number, roundUp?: boolean): string
   private _readOrSeekTo(read: false, target: number, roundUp?: boolean): void
   private _readOrSeekTo(read: boolean, target: number, roundUp: boolean = 
false): string | void {
@@ -142,10 +164,11 @@ export class TextSeeker<TChunk extends Chunk<string>> 
implements Seeker<string>
   }
 }
 
-
 export class DomSeeker extends TextSeeker<PartialTextNode> implements 
BoundaryPointer<Text> {
-  constructor(scope: Range) {
-    const chunker = new TextNodeChunker(scope);
+  constructor(chunkerOrScope: Chunker<PartialTextNode> | Range) {
+    const chunker = 'currentChunk' in chunkerOrScope
+      ? chunkerOrScope
+      : new TextNodeChunker(chunkerOrScope);
     if (chunker.currentChunk === null)
       throw new RangeError('Range does not contain any Text nodes.');
     super(chunker as NonEmptyChunker<PartialTextNode>);
@@ -158,4 +181,17 @@ export class DomSeeker extends TextSeeker<PartialTextNode> 
implements BoundaryPo
   get offsetInReferenceNode() {
     return this.offsetInChunk + this.currentChunk.startOffset;
   }
+
+  seekToBoundaryPoint(node: Node, offset: number) {
+    const document = (node.ownerDocument ?? node as Document);
+    const target = document.createRange();
+    target.setStart(node, offset);
+    // target.setEnd(node, offset); // (implied by setting the start)
+
+    // Seek step by step until we are at, or crossed, the target point.
+    const reverse = !!(node.compareDocumentPosition(this.referenceNode) & 
Node.DOCUMENT_POSITION_PRECEDING);
+    while (target.comparePoint(this.referenceNode, this.offsetInReferenceNode) 
=== (reverse ? 1 : -1)) {
+      this.seekBy(reverse ? -1 : 1);
+    }
+  }
 }
diff --git a/packages/dom/src/text-quote/describe.ts 
b/packages/dom/src/text-quote/describe.ts
index e5846bc..8ccf47e 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -20,7 +20,9 @@
 
 import type { TextQuoteSelector } from '@annotator/selector';
 import { ownerDocument } from '../owner-document';
-import { Seeker } from '../seek';
+import { Chunk, Chunker, ChunkRange, PartialTextNode, TextNodeChunker, 
chunkRangeEquals } from '../chunker';
+import { abstractTextQuoteSelectorMatcher } from '.';
+import { DomSeeker, TextSeeker, NonEmptyChunker } from '../seek';
 
 export async function describeTextQuote(
   range: Range,
@@ -32,120 +34,97 @@ export async function describeTextQuote(
     scope = document.createRange();
     scope.selectNodeContents(document);
   }
-  range = range.cloneRange();
+
+  const chunker = new TextNodeChunker(scope);
 
   // Take the part of the range that falls within the scope.
+  range = range.cloneRange();
   if (range.compareBoundaryPoints(Range.START_TO_START, scope) === -1)
     range.setStart(scope.startContainer, scope.startOffset);
   if (range.compareBoundaryPoints(Range.END_TO_END, scope) === 1)
     range.setEnd(scope.endContainer, scope.endOffset);
 
-  return {
-    type: 'TextQuoteSelector',
-    exact: range.toString(),
-    ...calculateContextForDisambiguation(range, scope),
-  };
+  return await abstractDescribeTextQuote(
+    convertRangeToChunkRange(chunker, range),
+    chunker,
+  );
 }
 
-function calculateContextForDisambiguation(
-  range: Range,
-  scope: Range,
-): { prefix: string; suffix: string } {
-  const exactText = range.toString();
-  const scopeText = scope.toString();
-  const targetStartIndex = getRangeTextPosition(range, scope);
-  const targetEndIndex = targetStartIndex + exactText.length;
+async function abstractDescribeTextQuote<TChunk extends Chunk<string>>(
+  target: ChunkRange<TChunk>,
+  scope: Chunker<TChunk>,
+): Promise<TextQuoteSelector> {
+  const seeker = new TextSeeker(scope as NonEmptyChunker<TChunk>);
+  seeker.seekToChunk(target.startChunk, target.startIndex);
+  const exact = seeker.readToChunk(target.endChunk, target.endIndex);
 
   // Starting with an empty prefix and suffix, we search for matches. At each 
unintended match
   // we encounter, we extend the prefix or suffix just enough to ensure it 
will no longer match.
   let prefix = '';
   let suffix = '';
-  let fromIndex = 0;
-  while (fromIndex <= scopeText.length) {
-    const searchPattern = prefix + exactText + suffix;
-    const patternMatchIndex = scopeText.indexOf(searchPattern, fromIndex);
-    if (patternMatchIndex === -1) break;
-    fromIndex = patternMatchIndex + 1;
 
-    const matchStartIndex = patternMatchIndex + prefix.length;
-    const matchEndIndex = matchStartIndex + exactText.length;
+  while (true) {
+    const tentativeSelector: TextQuoteSelector = {
+      type: 'TextQuoteSelector',
+      exact,
+      prefix,
+      suffix,
+    }
+    const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope);
+    let nextMatch = await matches.next();
+
+    if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
+      // This match is the intended one, ignore it.
+      nextMatch = await matches.next();
+    }
+
+    // If there are no more unintended matches, our selector is unambiguous!
+    if (nextMatch.done) return tentativeSelector;
+
+    // TODO either reset chunker to start from the beginning, or rewind the 
chunker by previous match’s length.
+    // chunker.seekTo(0)  or chunker.seek(-prefix)
 
-    // Skip the found match if it is the actual target.
-    if (matchStartIndex === targetStartIndex) continue;
+    // We’ll have to add more prefix/suffix to disqualify this unintended 
match.
+    const unintendedMatch = nextMatch.value;
+    const seeker1 = new TextSeeker(scope as NonEmptyChunker<TChunk>);
+    const seeker2 = new TextSeeker(scope as NonEmptyChunker<TChunk>); // TODO 
must clone scope.
 
     // Count how many characters we’d need as a prefix to disqualify this 
match.
-    let sufficientPrefixLength = prefix.length + 1;
-    const firstChar = (offset: number) =>
-      scopeText[offset - sufficientPrefixLength];
-    while (
-      firstChar(targetStartIndex) &&
-      firstChar(targetStartIndex) === firstChar(matchStartIndex)
-    )
-      sufficientPrefixLength++;
-    if (!firstChar(targetStartIndex))
-      // We reached the start of scopeText; prefix won’t work.
-      sufficientPrefixLength = Infinity;
-
-    // Count how many characters we’d need as a suffix to disqualify this 
match.
-    let sufficientSuffixLength = suffix.length + 1;
-    const lastChar = (offset: number) =>
-      scopeText[offset + sufficientSuffixLength - 1];
-    while (
-      lastChar(targetEndIndex) &&
-      lastChar(targetEndIndex) === lastChar(matchEndIndex)
-    )
-      sufficientSuffixLength++;
-    if (!lastChar(targetEndIndex))
-      // We reached the end of scopeText; suffix won’t work.
-      sufficientSuffixLength = Infinity;
+    seeker1.seekToChunk(target.startChunk, target.startIndex);
+    seeker2.seekToChunk(unintendedMatch.startChunk, 
unintendedMatch.startIndex);
+    let sufficientPrefix: string | undefined = prefix;
+    while (true) {
+      let previousCharacter: string;
+      try {
+        previousCharacter = seeker1.read(-1);
+      } catch (err) {
+        sufficientPrefix = undefined; // Start of text reached.
+        break;
+      }
+      sufficientPrefix = previousCharacter + sufficientPrefix;
+      if (previousCharacter !== seeker2.read(-1)) break;
+    }
 
     // Use either the prefix or suffix, whichever is shortest.
-    if (sufficientPrefixLength <= sufficientSuffixLength) {
-      // Compensate our search position for the increase in prefix length.
-      fromIndex -= sufficientPrefixLength - prefix.length;
-      prefix = scopeText.substring(
-        targetStartIndex - sufficientPrefixLength,
-        targetStartIndex,
-      );
-    } else {
-      suffix = scopeText.substring(
-        targetEndIndex,
-        targetEndIndex + sufficientSuffixLength,
-      );
-    }
+    if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || 
sufficientPrefix.length <= sufficientSuffix.length))
+      prefix = sufficientPrefix; // chunker.seek(prefix.length - 
sufficientPrefix.length)
+    else if (sufficientSuffix !== undefined)
+      suffix = sufficientSuffix;
+    else
+      throw new Error('Target cannot be disambiguated; how could that have 
happened‽');
   }
-
-  return { prefix, suffix };
 }
 
-// Get the index of the first character of range within the text of scope.
-function getRangeTextPosition(range: Range, scope: Range): number {
-  const seeker = new Seeker(scope);
-  const scopeOffset = isTextNode(scope.startContainer) ? scope.startOffset : 0;
-  if (isTextNode(range.startContainer))
-    return seeker.seek(range.startContainer) + range.startOffset - scopeOffset;
-  else return seeker.seek(firstTextNodeInRange(range)) - scopeOffset;
-}
+function convertRangeToChunkRange(chunker: Chunker<PartialTextNode>, range: 
Range): ChunkRange<PartialTextNode> {
+  const domSeeker = new DomSeeker(chunker);
 
-function firstTextNodeInRange(range: Range): Text {
-  // Find the first text node inside the range.
-  const iter = ownerDocument(range).createNodeIterator(
-    range.commonAncestorContainer,
-    NodeFilter.SHOW_TEXT,
-    {
-      acceptNode(node: Text) {
-        // Only reveal nodes within the range; and skip any empty text nodes.
-        return range.intersectsNode(node) && node.length > 0
-          ? NodeFilter.FILTER_ACCEPT
-          : NodeFilter.FILTER_REJECT;
-      },
-    },
-  );
-  const node = iter.nextNode() as Text | null;
-  if (node === null) throw new Error('Range contains no text nodes');
-  return node;
-}
+  domSeeker.seekToBoundaryPoint(range.startContainer, range.startOffset);
+  const startChunk = domSeeker.currentChunk;
+  const startIndex = domSeeker.offsetInChunk;
+
+  domSeeker.seekToBoundaryPoint(range.endContainer, range.endOffset);
+  const endChunk = domSeeker.currentChunk;
+  const endIndex = domSeeker.offsetInChunk;
 
-function isTextNode(node: Node): node is Text {
-  return node.nodeType === Node.TEXT_NODE;
+  return { startChunk, startIndex, endChunk, endIndex };
 }

[incubator-annotator] 01/02: WIP make describe quote work

Reply via email to