[incubator-annotator] 02/02: WIP Make describing a text quote work too

gerben Fri, 06 Nov 2020 05:19:46 -0800

This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch import-dom-seek
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git


commit 15e8ffd485cf0c15c864c2f63540f9963c8e48e4
Author: Gerben <[email protected]>
AuthorDate: Fri Nov 6 14:18:45 2020 +0100

    WIP Make describing a text quote work too
---
 packages/dom/src/chunker.ts             |   1 +
 packages/dom/src/seek.ts                |  26 +++-
 packages/dom/src/text-quote/describe.ts | 214 ++++++++++++++++++--------------
 3 files changed, 145 insertions(+), 96 deletions(-)

diff --git a/packages/dom/src/chunker.ts b/packages/dom/src/chunker.ts
index 82a72e3..0f3b746 100644
--- a/packages/dom/src/chunker.ts
+++ b/packages/dom/src/chunker.ts
@@ -25,6 +25,7 @@ import { ownerDocument } from "./owner-document";
 // data structure it came from (e.g. a DOM node).
 export interface Chunk<TData extends any> {
   readonly data: TData;
+  equals?(otherChunk: this): boolean;
 }
 
 // A Chunker lets one walk through the chunks of a document.
diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts
index f09e583..9ceade3 100644
--- a/packages/dom/src/seek.ts
+++ b/packages/dom/src/seek.ts
@@ -18,7 +18,7 @@
  * under the License.
  */
 
-import { Chunk, TextNodeChunker, PartialTextNode } from "./chunker";
+import { Chunk, Chunker, TextNodeChunker, PartialTextNode } from "./chunker";
 
 const E_END = 'Iterator exhausted before seek ended.';
 
@@ -142,10 +142,11 @@ class TextSeeker<TChunk extends Chunk<string>> implements 
Seeker<string> {
   }
 }
 
-
 export class DomSeeker extends TextSeeker<PartialTextNode> implements 
BoundaryPointer<Text> {
-  constructor(scope: Range) {
-    const chunker = new TextNodeChunker(scope);
+  constructor(chunkerOrScope: Chunker<PartialTextNode> | Range) {
+    const chunker = isTextNodeChunker(chunkerOrScope)
+      ? chunkerOrScope
+      : new TextNodeChunker(chunkerOrScope);
     if (chunker.currentChunk === null)
       throw new RangeError('Range does not contain any Text nodes.');
     super(chunker as NonEmptyChunker<PartialTextNode>);
@@ -158,4 +159,21 @@ export class DomSeeker extends TextSeeker<PartialTextNode> 
implements BoundaryPo
   get offsetInReferenceNode() {
     return this.offsetInChunk + this.currentChunk.startOffset;
   }
+
+  seekToBoundaryPoint(node: Node, offset: number) {
+    const document = (node.ownerDocument ?? node as Document);
+    const target = document.createRange();
+    target.setStart(node, offset);
+    // target.setEnd(node, offset); // (implied by setting the start)
+
+    // Seek step by step until we are at, or crossed, the target point.
+    const reverse = !!(node.compareDocumentPosition(this.referenceNode) & 
Node.DOCUMENT_POSITION_PRECEDING);
+    while (target.comparePoint(this.referenceNode, this.offsetInReferenceNode) 
=== (reverse ? 1 : -1)) {
+      this.seekBy(reverse ? -1 : 1);
+    }
+  }
+}
+
+function isTextNodeChunker(obj: any): obj is Chunker<PartialTextNode> {
+  return ('currentChunk' in obj && 'nextChunk' in obj && 'previousChunk' in 
obj);
 }
diff --git a/packages/dom/src/text-quote/describe.ts 
b/packages/dom/src/text-quote/describe.ts
index e5846bc..787d60d 100644
--- a/packages/dom/src/text-quote/describe.ts
+++ b/packages/dom/src/text-quote/describe.ts
@@ -20,7 +20,9 @@
 
 import type { TextQuoteSelector } from '@annotator/selector';
 import { ownerDocument } from '../owner-document';
-import { Seeker } from '../seek';
+import { Chunk, Chunker, PartialTextNode, TextNodeChunker } from '../chunker';
+import { ChunkRange, abstractTextQuoteSelectorMatcher } from '.';
+import { DomSeeker } from '../seek';
 
 export async function describeTextQuote(
   range: Range,
@@ -32,120 +34,148 @@ export async function describeTextQuote(
     scope = document.createRange();
     scope.selectNodeContents(document);
   }
-  range = range.cloneRange();
+
+  const chunker = new TextNodeChunker(scope);
 
   // Take the part of the range that falls within the scope.
+  range = range.cloneRange();
   if (range.compareBoundaryPoints(Range.START_TO_START, scope) === -1)
     range.setStart(scope.startContainer, scope.startOffset);
   if (range.compareBoundaryPoints(Range.END_TO_END, scope) === 1)
     range.setEnd(scope.endContainer, scope.endOffset);
 
-  return {
-    type: 'TextQuoteSelector',
-    exact: range.toString(),
-    ...calculateContextForDisambiguation(range, scope),
-  };
+  return await abstractDescribeTextQuote(
+    convertRangeToChunkRange(chunker, range),
+    chunker,
+  );
 }
 
-function calculateContextForDisambiguation(
-  range: Range,
-  scope: Range,
-): { prefix: string; suffix: string } {
-  const exactText = range.toString();
-  const scopeText = scope.toString();
-  const targetStartIndex = getRangeTextPosition(range, scope);
-  const targetEndIndex = targetStartIndex + exactText.length;
+async function abstractDescribeTextQuote(
+  target: ChunkRange<Chunk<string>>,
+  scope: Chunker<Chunk<string>>,
+): Promise<TextQuoteSelector> {
+  const exact = readChunkRange(scope, target);
 
   // Starting with an empty prefix and suffix, we search for matches. At each 
unintended match
   // we encounter, we extend the prefix or suffix just enough to ensure it 
will no longer match.
   let prefix = '';
   let suffix = '';
-  let fromIndex = 0;
-  while (fromIndex <= scopeText.length) {
-    const searchPattern = prefix + exactText + suffix;
-    const patternMatchIndex = scopeText.indexOf(searchPattern, fromIndex);
-    if (patternMatchIndex === -1) break;
-    fromIndex = patternMatchIndex + 1;
-
-    const matchStartIndex = patternMatchIndex + prefix.length;
-    const matchEndIndex = matchStartIndex + exactText.length;
-
-    // Skip the found match if it is the actual target.
-    if (matchStartIndex === targetStartIndex) continue;
-
-    // Count how many characters we’d need as a prefix to disqualify this 
match.
-    let sufficientPrefixLength = prefix.length + 1;
-    const firstChar = (offset: number) =>
-      scopeText[offset - sufficientPrefixLength];
-    while (
-      firstChar(targetStartIndex) &&
-      firstChar(targetStartIndex) === firstChar(matchStartIndex)
-    )
-      sufficientPrefixLength++;
-    if (!firstChar(targetStartIndex))
-      // We reached the start of scopeText; prefix won’t work.
-      sufficientPrefixLength = Infinity;
-
-    // Count how many characters we’d need as a suffix to disqualify this 
match.
-    let sufficientSuffixLength = suffix.length + 1;
-    const lastChar = (offset: number) =>
-      scopeText[offset + sufficientSuffixLength - 1];
-    while (
-      lastChar(targetEndIndex) &&
-      lastChar(targetEndIndex) === lastChar(matchEndIndex)
-    )
-      sufficientSuffixLength++;
-    if (!lastChar(targetEndIndex))
-      // We reached the end of scopeText; suffix won’t work.
-      sufficientSuffixLength = Infinity;
 
-    // Use either the prefix or suffix, whichever is shortest.
-    if (sufficientPrefixLength <= sufficientSuffixLength) {
-      // Compensate our search position for the increase in prefix length.
-      fromIndex -= sufficientPrefixLength - prefix.length;
-      prefix = scopeText.substring(
-        targetStartIndex - sufficientPrefixLength,
-        targetStartIndex,
-      );
-    } else {
-      suffix = scopeText.substring(
-        targetEndIndex,
-        targetEndIndex + sufficientSuffixLength,
-      );
+  while (true) {
+    const tentativeSelector: TextQuoteSelector = {
+      type: 'TextQuoteSelector',
+      exact,
+      prefix,
+      suffix,
+    }
+    const matches = abstractTextQuoteSelectorMatcher(tentativeSelector)(scope);
+    let nextMatch = await matches.next();
+
+    if (!nextMatch.done && chunkRangeEquals(nextMatch.value, target)) {
+      // This match is the intended one, ignore it.
+      nextMatch = await matches.next();
     }
+
+    // If there are no more unintended matches, our selector is unambiguous!
+    if (nextMatch.done) return tentativeSelector;
+
+    // We’ll have to add more prefix/suffix to disqualify this unintended 
match.
+    const match = nextMatch.value;
+    const sufficientPrefix = charactersNeededToBeUnique(scope, match, target, 
true);
+    const sufficientSuffix = charactersNeededToBeUnique(scope, match, target, 
false);
+
+    // Use either the prefix or suffix, whichever is shortest.
+    if (sufficientPrefix !== undefined && (sufficientSuffix === undefined || 
sufficientPrefix.length <= sufficientSuffix.length))
+      prefix = sufficientPrefix;
+    else if (sufficientSuffix !== undefined)
+      suffix = sufficientSuffix;
+    else
+      throw new Error('Target cannot be disambiguated; how could that have 
happened‽');
   }
+}
+
+function charactersNeededToBeUnique(
+  chunker: Chunker<Chunk<string>>,
+  match: ChunkRange<any>,
+  target: ChunkRange<any>,
+  reverse: boolean,
+): string | undefined {
+  // TODO. How?
+
+  // // Count how many characters we’d need as a prefix to disqualify this 
match.
+  // let sufficientPrefixLength = prefix.length + 1;
+  // const firstChar = (offset: number) =>
+  //   scopeText[offset - sufficientPrefixLength];
+  // while (
+  //   firstChar(targetStartIndex) &&
+  //   firstChar(targetStartIndex) === firstChar(matchStartIndex)
+  // )
+  //   sufficientPrefixLength++;
+  // if (!firstChar(targetStartIndex))
+  //   // We reached the start of scopeText; prefix won’t work.
+  //   sufficientPrefixLength = Infinity;
+
+  // // Count how many characters we’d need as a suffix to disqualify this 
match.
+  // let sufficientSuffixLength = suffix.length + 1;
+  // const lastChar = (offset: number) =>
+  //   scopeText[offset + sufficientSuffixLength - 1];
+  // while (
+  //   lastChar(targetEndIndex) &&
+  //   lastChar(targetEndIndex) === lastChar(matchEndIndex)
+  // )
+  //   sufficientSuffixLength++;
+  // if (!lastChar(targetEndIndex))
+  //   // We reached the end of scopeText; suffix won’t work.
+  //   sufficientSuffixLength = Infinity;
 
-  return { prefix, suffix };
 }
 
-// Get the index of the first character of range within the text of scope.
-function getRangeTextPosition(range: Range, scope: Range): number {
-  const seeker = new Seeker(scope);
-  const scopeOffset = isTextNode(scope.startContainer) ? scope.startOffset : 0;
-  if (isTextNode(range.startContainer))
-    return seeker.seek(range.startContainer) + range.startOffset - scopeOffset;
-  else return seeker.seek(firstTextNodeInRange(range)) - scopeOffset;
+function readChunkRange<TChunk extends Chunk<string>>(
+  chunker: Chunker<TChunk>,
+  {
+    startChunk,
+    startIndex,
+    endChunk,
+    endIndex,
+  }: ChunkRange<TChunk>
+): string {
+  if (startChunk === endChunk)
+    return startChunk.data.substring(startIndex, endIndex);
+
+  let text = startChunk.data.substring(startIndex);
+  // TODO use chunker; or implement Seeker.readToChunk or similiar?
+  let curChunk = startChunk;
+  while (curChunk && curChunk !== endChunk) {
+    curChunk = chunker.nextChunk();
+    text += curChunk.data;
+  }
+  text += endChunk.data.substring(0, endIndex);
+  return text;
 }
 
-function firstTextNodeInRange(range: Range): Text {
-  // Find the first text node inside the range.
-  const iter = ownerDocument(range).createNodeIterator(
-    range.commonAncestorContainer,
-    NodeFilter.SHOW_TEXT,
-    {
-      acceptNode(node: Text) {
-        // Only reveal nodes within the range; and skip any empty text nodes.
-        return range.intersectsNode(node) && node.length > 0
-          ? NodeFilter.FILTER_ACCEPT
-          : NodeFilter.FILTER_REJECT;
-      },
-    },
+function chunkRangeEquals(range1: ChunkRange<any>, range2: ChunkRange<any>) {
+  return (
+    chunkEquals(range1.startChunk, range2.startChunk)
+    && chunkEquals(range1.endChunk, range2.endChunk)
+    && range1.startIndex === range2.startIndex
+    && range1.endIndex === range2.endIndex
   );
-  const node = iter.nextNode() as Text | null;
-  if (node === null) throw new Error('Range contains no text nodes');
-  return node;
 }
 
-function isTextNode(node: Node): node is Text {
-  return node.nodeType === Node.TEXT_NODE;
+function chunkEquals(chunk1: Chunk<any>, chunk2: Chunk<any>): boolean {
+  return chunk1.equals ? chunk1.equals(chunk2) : chunk1 === chunk2;
+}
+
+function convertRangeToChunkRange(chunker: Chunker<PartialTextNode>, range: 
Range): ChunkRange<PartialTextNode> {
+  const domSeeker = new DomSeeker(chunker);
+
+  domSeeker.seekToBoundaryPoint(range.startContainer, range.startOffset);
+  const startChunk = domSeeker.currentChunk;
+  const startIndex = domSeeker.offsetInChunk;
+
+  domSeeker.seekToBoundaryPoint(range.endContainer, range.endOffset);
+  const endChunk = domSeeker.currentChunk;
+  const endIndex = domSeeker.offsetInChunk;
+
+  return { startChunk, startIndex, endChunk, endIndex };
 }

[incubator-annotator] 02/02: WIP Make describing a text quote work too

Reply via email to