This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch text-position in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit ccfd9288cf17331dba46330e899122009a7c7f8d Author: Gerben <[email protected]> AuthorDate: Thu Oct 8 23:15:05 2020 +0200 Implement text-position matching. Largely copying from the text quote implementation. No effort done yet regarding deduplication, abstraction, efficiency. --- .../types.ts => dom/src/text-position/index.ts} | 26 +-- packages/dom/src/text-position/match.ts | 88 ++++++++ packages/dom/test/text-position/match-cases.ts | 142 +++++++++++++ packages/dom/test/text-position/match.test.ts | 227 +++++++++++++++++++++ packages/selector/src/index.ts | 2 +- packages/selector/src/types.ts | 6 + 6 files changed, 465 insertions(+), 26 deletions(-) diff --git a/packages/selector/src/types.ts b/packages/dom/src/text-position/index.ts similarity index 61% copy from packages/selector/src/types.ts copy to packages/dom/src/text-position/index.ts index fc4f64b..011e994 100644 --- a/packages/selector/src/types.ts +++ b/packages/dom/src/text-position/index.ts @@ -18,28 +18,4 @@ * under the License. */ -export interface Selector { - refinedBy?: Selector; -} - -export interface CssSelector extends Selector { - type: 'CssSelector'; - value: string; -} - -export interface TextQuoteSelector extends Selector { - type: 'TextQuoteSelector'; - exact: string; - prefix?: string; - suffix?: string; -} - -export interface RangeSelector extends Selector { - type: 'RangeSelector'; - startSelector: Selector; - endSelector: Selector; -} - -export interface Matcher<TScope, TMatch> { - (scope: TScope): AsyncGenerator<TMatch, void, void>; -} +export * from './match'; diff --git a/packages/dom/src/text-position/match.ts b/packages/dom/src/text-position/match.ts new file mode 100644 index 0000000..a579e94 --- /dev/null +++ b/packages/dom/src/text-position/match.ts @@ -0,0 +1,88 @@ +/** + * @license + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import seek from 'dom-seek'; +import type { Matcher, TextPositionSelector } from '@annotator/selector'; +import { ownerDocument } from '../owner-document'; + +export function createTextPositionSelectorMatcher( + selector: TextPositionSelector, +): Matcher<Range, Range> { + return async function* matchAll(scope) { + const document = ownerDocument(scope); + const scopeText = scope.toString(); + + const { start, end } = selector; + + const iter = document.createNodeIterator( + scope.commonAncestorContainer, + NodeFilter.SHOW_TEXT, + { + acceptNode(node: Text) { + // Only reveal nodes within the range; and skip any empty text nodes. + return scope.intersectsNode(node) && node.length > 0 + ? NodeFilter.FILTER_ACCEPT + : NodeFilter.FILTER_REJECT; + }, + }, + ); + + // The index of the first character of iter.referenceNode inside the text. + let referenceNodeIndex = isTextNode(scope.startContainer) + ? -scope.startOffset + : 0; + + // Selector positions count code points, whereas string indices count UTF-16 code units, so we actually have to count. + const matchStartIndex = getIndexOfCharacterNumber(scopeText, start); + const matchEndIndex = getIndexOfCharacterNumber(scopeText, end); + + // Create a range to represent the described text in the dom. 
+ const match = document.createRange(); + + // Seek to the start of the match, make the range start there. + referenceNodeIndex += seek(iter, matchStartIndex - referenceNodeIndex); + match.setStart(iter.referenceNode, matchStartIndex - referenceNodeIndex); + + // Seek to the end of the match, make the range end there. + referenceNodeIndex += seek(iter, matchEndIndex - referenceNodeIndex); + match.setEnd(iter.referenceNode, matchEndIndex - referenceNodeIndex); + + // Yield the match. + yield match; + }; +} + +function isTextNode(node: Node): node is Text { + return node.nodeType === Node.TEXT_NODE; +} + +function getIndexOfCharacterNumber(text: string, characterNumber: number): number { + let index = 0; + let characterCount = 0; + for (let character of text) { + if (characterCount >= characterNumber) // using >= to avoid infinite loop on invalid input. + break; + index += character.length; // note the length is either 1 or 2 + characterCount++; + } + if (characterCount === characterNumber) + return index; + throw new RangeError; +} diff --git a/packages/dom/test/text-position/match-cases.ts b/packages/dom/test/text-position/match-cases.ts new file mode 100644 index 0000000..0916446 --- /dev/null +++ b/packages/dom/test/text-position/match-cases.ts @@ -0,0 +1,142 @@ +/** + * @license + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import type { TextPositionSelector } from '@annotator/selector'; +import type { RangeInfo } from '../utils'; + +export const testCases: { + [name: string]: { + html: string; + selector: TextPositionSelector; + expected: RangeInfo[]; + }; +} = { + simple: { + html: '<b>l😃rem ipsum dolor amet yada yada</b>', + selector: { + type: 'TextPositionSelector', + start: 12, + end: 20, + }, + expected: [ + { + startContainerXPath: '//b/text()', + startOffset: 13, + endContainerXPath: '//b/text()', + endOffset: 21, + }, + ], + }, + 'first characters': { + html: '<b>l😃rem ipsum dolor amet yada yada</b>', + selector: { + type: 'TextPositionSelector', + start: 0, + end: 11, + }, + expected: [ + { + startContainerXPath: '//b/text()', + startOffset: 0, + endContainerXPath: '//b/text()', + endOffset: 12, + }, + ], + }, + 'last characters': { + html: '<b>l😃rem ipsum dolor amet yada yada</b>', + selector: { + type: 'TextPositionSelector', + start: 23, + end: 32, + }, + expected: [ + { + startContainerXPath: '//b/text()', + startOffset: 24, + endContainerXPath: '//b/text()', + endOffset: 33, + }, + ], + }, + 'across elements': { + html: '<b>l😃rem <i>ipsum</i> dolor <u>amet</u> yada yada</b>', + selector: { + type: 'TextPositionSelector', + start: 12, + end: 20, + }, + expected: [ + { + startContainerXPath: '//b/text()[2]', + startOffset: 1, + endContainerXPath: '//u/text()', + endOffset: 2, + }, + ], + }, + 'exact element contents': { + html: '<b>l😃rem <i>ipsum dolor</i> amet yada yada</b>', + selector: { + type: 'TextPositionSelector', + start: 6, + end: 17, + 
}, + expected: [ + { + startContainerXPath: '//i/text()', + startOffset: 0, + endContainerXPath: '//b/text()[2]', + endOffset: 0, + }, + ], + }, + 'text inside <head>': { + html: + '<head><title>l😃rem ipsum dolor amet</title></head><b>yada yada</b>', + selector: { + type: 'TextPositionSelector', + start: 18, + end: 22, + }, + expected: [ + { + startContainerXPath: '//title/text()', + startOffset: 19, + endContainerXPath: '//b/text()[1]', + endOffset: 0, + }, + ], + }, + 'empty quote': { + html: '<b>l😃rem</b>', + selector: { + type: 'TextPositionSelector', + start: 3, + end: 3, + }, + expected: [{ + startContainerXPath: '//b/text()', + startOffset: 4, + endContainerXPath: '//b/text()', + endOffset: 4, + }], + }, +}; diff --git a/packages/dom/test/text-position/match.test.ts b/packages/dom/test/text-position/match.test.ts new file mode 100644 index 0000000..1acaed0 --- /dev/null +++ b/packages/dom/test/text-position/match.test.ts @@ -0,0 +1,227 @@ +/** + * @license + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { assert } from 'chai'; +import type { TextPositionSelector } from '@annotator/selector'; +import { createTextPositionSelectorMatcher } from '../../src/text-position/match'; +import { evaluateXPath } from '../utils'; +import type { RangeInfo } from '../utils'; +import { testCases } from './match-cases'; + +const domParser = new window.DOMParser(); + +describe('createTextPositionSelectorMatcher', () => { + for (const [name, { html, selector, expected }] of Object.entries( + testCases, + )) { + it(`works for case: '${name}'`, async () => { + const doc = domParser.parseFromString(html, 'text/html'); + + const scope = doc.createRange(); + scope.selectNodeContents(doc); + + await testMatcher(doc, scope, selector, expected); + }); + } + + it('handles adjacent text nodes', async () => { + const { html, selector } = testCases['simple']; + const doc = domParser.parseFromString(html, 'text/html'); + + const scope = doc.createRange(); + scope.selectNodeContents(doc); + + const textNode = evaluateXPath(doc, '//b/text()') as Text; + + textNode.splitText(16); + // console.log([...textNode.parentNode.childNodes].map(node => node.textContent)) + // → [ 'l😃rem ipsum dol', 'or amet yada yada' ] + + await testMatcher(doc, scope, selector, [ + { + startContainerXPath: '//b/text()[1]', + startOffset: 13, + endContainerXPath: '//b/text()[2]', + endOffset: 5, + }, + ]); + }); + + it('handles empty text nodes', async () => { + const { html, selector } = testCases['simple']; + const doc = domParser.parseFromString(html, 'text/html'); + + const scope = doc.createRange(); + scope.selectNodeContents(doc); + + const textNode = evaluateXPath(doc, '//b/text()') as Text; + textNode.splitText(textNode.length); + textNode.splitText(21); + textNode.splitText(21); + textNode.splitText(18); + textNode.splitText(18); + textNode.splitText(13); + textNode.splitText(13); + textNode.splitText(0); + // console.log([...textNode.parentNode.childNodes].map(node => node.textContent)) + // → [ '', 
'l😃rem ipsum ', '', 'dolor', '', ' am', '', 'et yada yada', '' ] + + + await testMatcher(doc, scope, selector, [ + { + startContainerXPath: '//b/text()[4]', // "dolor" + startOffset: 0, + endContainerXPath: '//b/text()[8]', // "et yada yada" + endOffset: 0, + }, + ]); + }); + + it('works when scope spans one text node’s contents, matching its first characters', async () => { + const { html, selector, expected } = testCases['first characters']; + const doc = domParser.parseFromString(html, 'text/html'); + + const scope = doc.createRange(); + scope.selectNodeContents(evaluateXPath(doc, '//b/text()')); + + await testMatcher(doc, scope, selector, expected); + }); + + it('works when scope starts with an empty text node, matching its first characters', async () => { + const { html, selector } = testCases['first characters']; + const doc = domParser.parseFromString(html, 'text/html'); + + const textNode = evaluateXPath(doc, '//b/text()') as Text; + textNode.splitText(0); + + const scope = doc.createRange(); + scope.selectNodeContents(evaluateXPath(doc, '//b')); + + await testMatcher(doc, scope, selector, [ + { + startContainerXPath: '//b/text()[2]', + startOffset: 0, + endContainerXPath: '//b/text()[2]', + endOffset: 12, + }, + ]); + }); + + it('works when scope has both ends within one text node', async () => { + const { html, expected } = testCases['simple']; + + const doc = domParser.parseFromString(html, 'text/html'); + + // Use the substring ‘ipsum dolor amet’ as scope. 
+ const scope = doc.createRange(); + scope.setStart(evaluateXPath(doc, '//b/text()'), 7); + scope.setEnd(evaluateXPath(doc, '//b/text()'), 23); + + const selector: TextPositionSelector = { + type: 'TextPositionSelector', + start: 6, + end: 14, + }; + + await testMatcher(doc, scope, selector, expected); + }); + + it('works when scope has both ends inside text nodes', async () => { + const { html, expected } = testCases['across elements']; + const doc = domParser.parseFromString(html, 'text/html'); + + // Use the substring ‘sum dolor am’ as scope. + const scope = doc.createRange(); + scope.setStart(evaluateXPath(doc, '//i/text()'), 2); + scope.setEnd(evaluateXPath(doc, '//u/text()'), 2); + + const selector: TextPositionSelector = { + type: 'TextPositionSelector', + start: 4, + end: 12, + }; + + await testMatcher(doc, scope, selector, expected); + }); + + it('works when scope has both ends inside an element', async () => { + const { html, expected } = testCases['across elements']; + const doc = domParser.parseFromString(html, 'text/html'); + + const scope = doc.createRange(); + scope.setStart(evaluateXPath(doc, '//b'), 1); // before the <i> + scope.setEnd(evaluateXPath(doc, '//b'), 4); // before the " yada yada" + const selector: TextPositionSelector = { + type: 'TextPositionSelector', + start: 6, + end: 14, + }; + await testMatcher(doc, scope, selector, expected); + }); +}); + +async function testMatcher( + doc: Document, + scope: Range, + selector: TextPositionSelector, + expected: RangeInfo[], +) { + const matcher = createTextPositionSelectorMatcher(selector); + const matches = []; + for await (const value of matcher(scope)) matches.push(value); + assert.equal(matches.length, expected.length); + matches.forEach((match, i) => { + const expectedRange = expected[i]; + const expectedStartContainer = evaluateXPath( + doc, + expectedRange.startContainerXPath, + ); + const expectedEndContainer = evaluateXPath( + doc, + expectedRange.endContainerXPath, + ); + assert( + 
match.startContainer === expectedStartContainer, + `unexpected start container: ${prettyNodeName(match.startContainer)}; ` + + `expected ${prettyNodeName(expectedStartContainer)}`, + ); + assert.equal(match.startOffset, expectedRange.startOffset); + assert( + match.endContainer === + evaluateXPath(doc, expectedRange.endContainerXPath), + `unexpected end container: ${prettyNodeName(match.endContainer)}; ` + + `expected ${prettyNodeName(expectedEndContainer)}`, + ); + assert.equal(match.endOffset, expectedRange.endOffset); + }); +} + +function prettyNodeName(node: Node) { + switch (node.nodeType) { + case Node.TEXT_NODE: { + const text = (node as Text).nodeValue || ''; + return `#text "${text.length > 50 ? text.substring(0, 50) + '…' : text}"`; + } + case Node.ELEMENT_NODE: + return `<${(node as Element).tagName.toLowerCase()}>`; + default: + return node.nodeName.toLowerCase(); + } +} diff --git a/packages/selector/src/index.ts b/packages/selector/src/index.ts index c66bd94..ffab70b 100644 --- a/packages/selector/src/index.ts +++ b/packages/selector/src/index.ts @@ -21,7 +21,7 @@ import type { Matcher, Selector } from './types'; export type { Matcher, Selector } from './types'; -export type { CssSelector, RangeSelector, TextQuoteSelector } from './types'; +export type { CssSelector, RangeSelector, TextPositionSelector, TextQuoteSelector } from './types'; export function makeRefinable< // Any subtype of Selector can be made refinable; but note we limit the value diff --git a/packages/selector/src/types.ts b/packages/selector/src/types.ts index fc4f64b..e57fed0 100644 --- a/packages/selector/src/types.ts +++ b/packages/selector/src/types.ts @@ -34,6 +34,12 @@ export interface TextQuoteSelector extends Selector { suffix?: string; } +export interface TextPositionSelector extends Selector { + type: 'TextPositionSelector'; + start: number; // more precisely: non-negative integer + end: number; // more precisely: non-negative integer +} + export interface RangeSelector 
extends Selector { type: 'RangeSelector'; startSelector: Selector;
