This is an automated email from the ASF dual-hosted git repository.

gerben pushed a commit to branch chunking
in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git

commit 18c9eabffd175364385c19902b11f07a630d375a
Author: Gerben <[email protected]>
AuthorDate: Tue Sep 15 19:55:35 2020 +0200

    WIP Create chunk abstraction for text quote matching
---
 packages/dom/src/text-iterator.ts           |  79 ++++++++++++++++++++++
 packages/dom/src/text-quote/match.ts        | 101 ++++++++++++++--------------
 packages/dom/test/text-quote/match-cases.ts |   8 +--
 3 files changed, 134 insertions(+), 54 deletions(-)

diff --git a/packages/dom/src/text-iterator.ts b/packages/dom/src/text-iterator.ts
new file mode 100644
index 0000000..dfc1384
--- /dev/null
+++ b/packages/dom/src/text-iterator.ts
@@ -0,0 +1,79 @@
+/**
+ * @license
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { ownerDocument } from "./owner-document";
+
+export interface TextRange extends Range {
+  // A TextRange is a Range that is guaranteed to always have Text nodes as its start and end containers.
+  readonly startContainer: Text;
+  readonly endContainer: Text;
+  cloneRange(): TextRange;
+
+  // Allow only Text nodes to be passed to these methods.
+  insertNode(node: Text): void;
+  selectNodeContents(node: Text): void;
+  setEnd(node: Text, offset: number): void;
+  setStart(node: Text, offset: number): void;
+
+  // Do not allow these methods to be used at all: their parameter is typed `never`, so no call type-checks.
+  selectNode(node: never): void;
+  setEndAfter(node: never): void;
+  setEndBefore(node: never): void;
+  setStartAfter(node: never): void;
+  setStartBefore(node: never): void;
+  surroundContents(newParent: never): void;
+}
+
+export interface Chunk {
+  toString(): string; // The chunk's text content; this is the only thing required of a chunk.
+}
+
+// Yields one range per non-empty Text node that intersects scope; each range starts and ends in that *same* Text node, clipped to scope's own start and end offsets.
+export async function* chunkRange(scope: Range): AsyncIterable<TextRange> {
+  const document = ownerDocument(scope);
+
+  const iter = document.createNodeIterator(
+    scope.commonAncestorContainer,
+    NodeFilter.SHOW_TEXT,
+    {
+      acceptNode(node: Text) {
+        // Only reveal nodes within the range; and skip any empty text nodes.
+        return scope.intersectsNode(node) && node.length > 0
+          ? NodeFilter.FILTER_ACCEPT
+          : NodeFilter.FILTER_REJECT;
+      },
+    },
+  );
+
+  let node: Text | null;
+  while (node = iter.nextNode() as (Text | null)) { // Assignment is intentional: the loop ends once nextNode() yields null.
+    const range = document.createRange() as TextRange;
+    range.selectNodeContents(node); // Start out covering the whole text node; trimmed below where needed.
+
+    if (node === scope.startContainer) { // Scope starts inside this text node: drop the part before it.
+      range.setStart(node, scope.startOffset);
+    }
+    if (node === scope.endContainer) { // Scope ends inside this text node: drop the part after it.
+      range.setEnd(node, scope.endOffset);
+    }
+
+    yield range;
+  }
+}
diff --git a/packages/dom/src/text-quote/match.ts b/packages/dom/src/text-quote/match.ts
index a4a216f..6b7fd93 100644
--- a/packages/dom/src/text-quote/match.ts
+++ b/packages/dom/src/text-quote/match.ts
@@ -18,71 +18,72 @@
  * under the License.
  */
 
-import type { Matcher, TextQuoteSelector } from '@annotator/selector';
-import seek from 'dom-seek';
+import type { TextQuoteSelector } from '@annotator/selector';
 
-import { ownerDocument } from '../owner-document';
+import { chunkRange, Chunk, TextRange } from '../text-iterator';
 
 export function createTextQuoteSelectorMatcher(
   selector: TextQuoteSelector,
-): Matcher<Range, Range> {
+): (scope: Range) => AsyncGenerator<TextRange, void, void> {
+  const abstractMatcher = abstractTextQuoteSelectorMatcher(selector);
   return async function* matchAll(scope) {
-    const document = ownerDocument(scope);
-    const scopeText = scope.toString();
+    // Turn the scope into a stream of ranges, each wrapping exactly one text node. We wrap it in
+    // a range such that the first and last text node can be partially included. Could be changed
+    // to e.g. be an object { node: Text, startOffset, endOffset }.
+    const textChunks = chunkRange(scope);
 
+    for await (const abstractMatch of abstractMatcher(textChunks)) {
+      const match = document.createRange() as TextRange;
+      // The `+…startOffset` part is only relevant for the first chunk, whose text node might be partially in scope.
+      match.setStart(abstractMatch.startChunk.startContainer,
+        abstractMatch.startIndex + abstractMatch.startChunk.startOffset);
+      match.setEnd(abstractMatch.endChunk.startContainer, // (note that startContainer equals endContainer)
+        abstractMatch.endIndex + abstractMatch.endChunk.startOffset);
+      yield match;
+    }
+  }
+}
+
+interface AbstractRange<TChunk> {
+  startChunk: TChunk; // The chunk in which the match starts.
+  startIndex: number; // Index into startChunk's string of the first matched character.
+  endChunk: TChunk; // The chunk in which the match ends.
+  endIndex: number; // Index into endChunk's string just past the last matched character.
+}
+
+export function abstractTextQuoteSelectorMatcher(
+  selector: TextQuoteSelector,
+): <TChunk extends Chunk>(textChunks: AsyncIterable<TChunk>) => AsyncGenerator<AbstractRange<TChunk>, void, void> {
+  return async function* matchAll(textChunks) {
     const exact = selector.exact;
     const prefix = selector.prefix || '';
     const suffix = selector.suffix || '';
     const searchPattern = prefix + exact + suffix;
 
-    const iter = document.createNodeIterator(
-      scope.commonAncestorContainer,
-      NodeFilter.SHOW_TEXT,
-      {
-        acceptNode(node: Text) {
-          // Only reveal nodes within the range; and skip any empty text nodes.
-          return scope.intersectsNode(node) && node.length > 0
-            ? NodeFilter.FILTER_ACCEPT
-            : NodeFilter.FILTER_REJECT;
-        },
-      },
-    );
+    for await (const chunk of textChunks) {
+      const chunkValue = chunk.toString();
 
-    // The index of the first character of iter.referenceNode inside the text.
-    let referenceNodeIndex = isTextNode(scope.startContainer)
-      ? -scope.startOffset
-      : 0;
+      // Find the pattern in the chunk (possibly multiple times)
+      // TODO allow pattern to be spread across chunks
+      let fromIndex = 0;
+      while (fromIndex <= chunkValue.length) {
+        const patternStartIndex = chunkValue.indexOf(searchPattern, fromIndex);
+        if (patternStartIndex === -1) break;
 
-    let fromIndex = 0;
-    while (fromIndex <= scopeText.length) {
-      // Find the quote with its prefix and suffix in the string.
-      const patternStartIndex = scopeText.indexOf(searchPattern, fromIndex);
-      if (patternStartIndex === -1) return;
+        // Correct for the prefix and suffix lengths.
+        const matchStartIndex = patternStartIndex + prefix.length;
+        const matchEndIndex = matchStartIndex + exact.length;
 
-      // Correct for the prefix and suffix lengths.
-      const matchStartIndex = patternStartIndex + prefix.length;
-      const matchEndIndex = matchStartIndex + exact.length;
+        yield {
+          startChunk: chunk,
+          startIndex: matchStartIndex,
+          endChunk: chunk,
+          endIndex: matchEndIndex,
+        };
 
-      // Create a range to represent this exact quote in the dom.
-      const match = document.createRange();
-
-      // Seek to the start of the match, make the range start there.
-      referenceNodeIndex += seek(iter, matchStartIndex - referenceNodeIndex);
-      match.setStart(iter.referenceNode, matchStartIndex - referenceNodeIndex);
-
-      // Seek to the end of the match, make the range end there.
-      referenceNodeIndex += seek(iter, matchEndIndex - referenceNodeIndex);
-      match.setEnd(iter.referenceNode, matchEndIndex - referenceNodeIndex);
-
-      // Yield the match.
-      yield match;
-
-      // Advance the search forward to detect multiple occurrences.
-      fromIndex = matchStartIndex + 1;
+        // Advance the search forward to detect multiple occurrences within the same chunk.
+        fromIndex = matchStartIndex + 1;
+      }
     }
   };
 }
-
-function isTextNode(node: Node): node is Text {
-  return node.nodeType === Node.TEXT_NODE;
-}
diff --git a/packages/dom/test/text-quote/match-cases.ts b/packages/dom/test/text-quote/match-cases.ts
index 099802c..d4c2acd 100644
--- a/packages/dom/test/text-quote/match-cases.ts
+++ b/packages/dom/test/text-quote/match-cases.ts
@@ -99,8 +99,8 @@ export const testCases: {
       {
         startContainerXPath: '//i/text()',
         startOffset: 0,
-        endContainerXPath: '//b/text()[2]',
-        endOffset: 0,
+        endContainerXPath: '//i/text()',
+        endOffset: 11,
       },
     ],
   },
@@ -115,8 +115,8 @@ export const testCases: {
       {
         startContainerXPath: '//title/text()',
         startOffset: 4,
-        endContainerXPath: '//b/text()[1]',
-        endOffset: 0,
+        endContainerXPath: '//title/text()',
+        endOffset: 9,
       },
     ],
   },

Reply via email to