Make Rikaichamp better recognize text on asahi.com and nikkei.com etc.

birchill · May 22, 2021 · aa8209b · aa8209b
1 parent 35a4a0c
commit aa8209b
Show file tree

Hide file tree

Showing 3 changed files with 162 additions and 47 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 - Fixed handling of the first character of a text box
   ([#605](https://github.com/birtles/rikaichamp/issues/605))
+- Made Rikaichamp better able to recognize covered-up text, such as that used on
+  [asahi.com](https://asahi.com) and [nikkei.com](https://nikkei.com)
 
 ## 0.5.5 (2021-05-11)
 

diff --git a/src/get-text.ts b/src/get-text.ts
@@ -47,18 +47,7 @@ export function getTextAtPoint(
   point: Point,
   maxLength?: number
 ): GetTextAtPointResult | null {
-  let position: CursorPosition | null;
-  if (document.caretPositionFromPoint) {
-    position = document.caretPositionFromPoint(point.x, point.y);
-  } else {
-    const range = document.caretRangeFromPoint(point.x, point.y);
-    position = range
-      ? {
-          offsetNode: range.startContainer,
-          offset: range.startOffset,
-        }
-      : null;
-  }
+  let position = carentPositionFromPoint(point);
 
   // Chrome not only doesn't support caretPositionFromPoint, but also
   // caretRangeFromPoint doesn't return text input elements. Instead it returns
@@ -114,47 +103,65 @@ export function getTextAtPoint(
       point
     );
 
+    let closeEnough = true;
     if (distanceResult) {
       // If we're more than about three characters away, don't show the
       // pop-up.
       const { distance, glyphExtent } = distanceResult;
       if (distance > glyphExtent * 3) {
-        previousResult = undefined;
-        return null;
+        closeEnough = false;
       }
     }
 
-    const result = getTextFromTextNode(
-      startNode,
-      position!.offset,
-      point,
-      maxLength
-    );
-
-    if (result) {
-      console.assert(
-        !!result.rangeStart,
-        'The range start should be set when getting text from a text node'
-      );
+    if (closeEnough) {
+      const result = getTextFromTextNode({
+        startNode,
+        startOffset: position!.offset,
+        point,
+        maxLength,
+      });
 
-      // If we synthesized a text node, substitute the original node back in.
-      if (startNode !== position!.offsetNode) {
-        console.assert(
-          result.rangeStart!.container === startNode,
-          'When using a synthesized text node the range should start' +
-            ' from that node'
-        );
+      if (result) {
         console.assert(
-          result.rangeEnds.length === 1 &&
-            result.rangeEnds[0].container === startNode,
-          'When using a synthesized text node there should be a single' +
-            ' range end using the synthesized node'
+          !!result.rangeStart,
+          'The range start should be set when getting text from a text node'
         );
-        result.rangeStart!.container = position!.offsetNode;
-        result.rangeEnds[0].container = position!.offsetNode;
+
+        // If we synthesized a text node, substitute the original node back in.
+        if (startNode !== position!.offsetNode) {
+          console.assert(
+            result.rangeStart!.container === startNode,
+            'When using a synthesized text node the range should start' +
+              ' from that node'
+          );
+          console.assert(
+            result.rangeEnds.length === 1 &&
+              result.rangeEnds[0].container === startNode,
+            'When using a synthesized text node there should be a single' +
+              ' range end using the synthesized node'
+          );
+          result.rangeStart!.container = position!.offsetNode;
+          result.rangeEnds[0].container = position!.offsetNode;
+        }
+
+        previousResult = { point, position: position!, result };
+        return result;
       }
+    }
+  }
 
-      previousResult = { point, position: position!, result };
+  // See if we are dealing with a covering link
+  const parentLink = getParentLink(startNode);
+  if (parentLink) {
+    const result = getTextFromCoveringLink({
+      linkElem: parentLink,
+      originalElem: startNode,
+      point,
+      maxLength,
+    });
+    if (result) {
+      // Don't cache `position` since it's not the position we actually used.
+      previousResult = { point, position: undefined, result };
       return result;
     }
   }
@@ -171,7 +178,6 @@ export function getTextAtPoint(
         rangeEnds: [],
       };
       previousResult = { point, position: undefined, result };
-
       return result;
     }
   }
@@ -193,6 +199,20 @@ export function getTextAtPoint(
   return null;
 }
 
+function carentPositionFromPoint(point: Point): CursorPosition | null { // Returns the caret position (node + offset) under `point`, or null. NOTE(review): the name looks like a typo for "caretPositionFromPoint"; renaming means updating every call site too.
+  if (document.caretPositionFromPoint) { // Feature test: use this API whenever the document exposes it
+    return document.caretPositionFromPoint(point.x, point.y);
+  }
+
+  const range = document.caretRangeFromPoint(point.x, point.y); // Fallback for browsers without caretPositionFromPoint
+  return range
+    ? { // Repackage the start of the range in the offsetNode/offset shape used by the other branch
+        offsetNode: range.startContainer,
+        offset: range.startOffset,
+      }
+    : null; // No range was found under the point
+}
+
 function getOffsetFromTextInputNode({
   node,
   point,
@@ -289,15 +309,20 @@ function getDistanceFromTextNode(
   return { distance, glyphExtent };
 }
 
-function getTextFromTextNode(
-  startNode: CharacterData,
-  startOffset: number,
+function getTextFromTextNode({
+  startNode,
+  startOffset,
+  point,
+  maxLength,
+}: {
+  startNode: CharacterData;
+  startOffset: number;
   point: {
     x: number;
     y: number;
-  },
-  maxLength?: number
-): GetTextAtPointResult | null {
+  };
+  maxLength?: number;
+}): GetTextAtPointResult | null {
   const isRubyAnnotationElement = (element: Element | null) => {
     if (!element) {
       return false;
@@ -529,6 +554,77 @@ function getTextFromTextNode(
   return result;
 }
 
+function getParentLink(node: Node | null): HTMLAnchorElement | null { // Finds the nearest <a> that is `node` itself or one of its ancestors; null if there is none
+  if (node && node.nodeType === Node.ELEMENT_NODE) {
+    return (node as Element).closest('a'); // closest() also considers the element itself, not just its ancestors
+  }
+
+  if (isTextNode(node)) { // presumably a type guard for Text nodes (defined elsewhere) — confirm
+    return node.parentElement ? node.parentElement.closest('a') : null; // closest() lives on Element, so the search starts from the parent element
+  }
+
+  return null; // null input, or a node that is neither an element nor a text node
+}
+
+// Take care of "covering links". "Covering links" is the name we give to the
+// approach used by at least asahi.com and nikkei.com on their homepages where
+// they create a big <a> element and a tiny (1px x 1px) span with the link text
+// and then render the actual link content in a separate layer.
+//
+// Roughly it looks something like the following:
+//
+// <div>
+//   <a> <-- Link to article with abs-pos left/right/top/bottom: 0
+//     <span/> <-- Link text as a 1x1 span
+//   </a>
+//   <div> <-- Actual link content
+//     <figure/>
+//     <h2><a>Link text again</a></h2>
+//     etc.
+//   </div>
+// </div>
+//
+// If we fail to find any text but are pointing at a link, we should try digging
+// for content underneath the link
+function getTextFromCoveringLink({
+  linkElem,
+  originalElem,
+  point,
+  maxLength,
+}: {
+  linkElem: HTMLAnchorElement; // The covering <a>; its inline pointer-events style is toggled during the lookup
+  originalElem: Node | null; // Node the caller already tried; a second lookup that lands on it again counts as a failure
+  point: { // Used for the repeated caret lookup and passed on to getTextFromTextNode
+    x: number;
+    y: number;
+  };
+  maxLength?: number; // Passed through unchanged to getTextFromTextNode
+}): GetTextAtPointResult | null { // null when no different text node is found beneath the link
+  // Temporarily turn off pointer-events for the covering link. This relies on
+  const previousPointEvents = linkElem.style.pointerEvents; // the browser skipping pointer-events: none elements when resolving the caret position.
+  linkElem.style.pointerEvents = 'none';
+
+  const position = carentPositionFromPoint(point); // Repeat the lookup at the same point with the link out of the way
+
+  linkElem.style.pointerEvents = previousPointEvents; // Put the inline style back to whatever it was before
+
+  // See if we successfully found a different text node; give up if there is no position, it is the node we started with, or it is not a text node
+  if (
+    !position ||
+    position.offsetNode === originalElem ||
+    !isTextNode(position.offsetNode)
+  ) {
+    return null;
+  }
+
+  return getTextFromTextNode({ // Read the text starting from the newly found node and offset
+    startNode: position.offsetNode,
+    startOffset: position.offset,
+    point,
+    maxLength,
+  });
+}
+
 // This is a bit complicated because for a numeric year we don't require the
 // 年 but for 元年 we do. i.e. '令和2' is valid but '令和元' is not.
 const yearRegex = /(?:([0-9０-９一二三四五六七八九十百]+)\s*年?|(?:元\s*年))/;

diff --git a/tests/get-text.test.ts b/tests/get-text.test.ts
@@ -171,6 +171,23 @@ describe('getTextAtPoint', () => {
     assertTextResultEqual(result, 'うえお', lastTextNode, 0, lastTextNode, 3);
   });
 
+  it('should dig into the content behind covering links', () => {
+    // The following is based very heavily on the structure of article previews
+    // in asahi.com as of 2021-05-22 although nikkei.com is similar
+    testDiv.innerHTML =
+      '<div><a href="/articles/" style="position: absolute; top: 0; bottom: 0; left: 0; right: 0; z-index: 1"><span aria-hidden="true" style="display: block; width: 1px; height: 1px; overflow: hidden">あいうえお</span></a><div><div style="position: relative; width: 100%"><h2 style="z-index: auto"><a href="/articles/" id="innerLink">あいうえお</a></h2></div></div>';
+
+    const textNode = testDiv.querySelector('#innerLink')!.firstChild as Text; // Text of the visible inner link, not the 1px span inside the covering link
+    const bbox = getBboxForOffset(textNode, 0); // presumably the box of the character at offset 0 — confirm against getBboxForOffset
+
+    const result = getTextAtPoint({
+      x: bbox.right, // Right edge of that box
+      y: bbox.top + bbox.height / 2, // Vertical midpoint of that box
+    });
+
+    assertTextResultEqual(result, 'いうえお', textNode, 1, textNode, 5); // presumably (text, start node, start offset, end node, end offset): the match must lie in the inner link's text node
+  });
+
   it('should ignore non-Japanese characters', () => {
     testDiv.append('あいabc');
     const textNode = testDiv.firstChild as Text;