This is an automated email from the ASF dual-hosted git repository. gerben pushed a commit to branch import-dom-seek in repository https://gitbox.apache.org/repos/asf/incubator-annotator.git
commit 79dd711f45a3ffc7f6db003d23b02c68777c0bac Author: Gerben <[email protected]> AuthorDate: Sat Oct 17 16:41:11 2020 +0200 Handle code points that cross chunks Not actually needed in our scenario, but fun to implement correctly. --- packages/dom/src/seek.ts | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/packages/dom/src/seek.ts b/packages/dom/src/seek.ts index fc2af9a..732ab8e 100644 --- a/packages/dom/src/seek.ts +++ b/packages/dom/src/seek.ts @@ -202,22 +202,40 @@ class _CharSeeker implements Seeker<string[]> { let result: string[] = []; if (this.position < target) { + let unpairedSurrogate = ''; while (this.position < target) { - characters = [...this.raw.read(1, true)]; + let s = unpairedSurrogate + this.raw.read(1, true); + if (endsWithinCharacter(s)) { + unpairedSurrogate = s.slice(-1); // consider this half-character part of the next string. + s = s.slice(0,-1); + } else { + unpairedSurrogate = ''; + } + characters = [...s]; this.position += characters.length; if (read) result = result.concat(characters); } + if (unpairedSurrogate) this.raw.seekBy(-1); // align with the last complete character. if (!roundUp) { const overshootInCodePoints = this.position - target; const overshootInCodeUnits = characters.slice(overshootInCodePoints).join('').length; this.raw.seekBy(-overshootInCodeUnits); } } else { + let unpairedSurrogate = ''; while (this.position > target) { - characters = [...this.raw.read(-1, true)]; + let s = this.raw.read(-1, true) + unpairedSurrogate; + if (startsWithinCharacter(s)) { + unpairedSurrogate = s[0]; + s = s.slice(1); + } else { + unpairedSurrogate = ''; + } + characters = [...s]; this.position -= characters.length; if (read) result = characters.concat(result); } + if (unpairedSurrogate) this.raw.seekBy(1); if (!roundUp) { const overshootInCodePoints = target - this.position; const overshootInCodeUnits = characters.slice(0, overshootInCodePoints).join('').length; @@ -244,3 +262,13 @@ export class CharSeeker extends _CharSeeker implements Seeker<string[]>, Boundar function isText(node: Node): node is Text { return node.nodeType === Node.TEXT_NODE; } + +function endsWithinCharacter(s: string) { + const codeUnit = s.charCodeAt(s.length - 1); + return (0xD800 <= codeUnit && codeUnit <= 0xDBFF) +} + +function startsWithinCharacter(s: string) { + const codeUnit = s.charCodeAt(0); + return (0xDC00 <= codeUnit && codeUnit <= 0xDFFF) +}
