diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d8bcfbc24..6666a7bf95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ app. ([#1314](https://github.com/birchill/10ten-ja-reader/issues/1314)). - Make the `x` key close the popup if it is configured to _both_ close it and expand it. +- Fixed recognition of words that end in half-width numerals like 小1. - Made the options page show up in a new tab on Edge. - Added a workaround for a [Firefox bug](https://bugzilla.mozilla.org/show_bug.cgi?id=1860486) that would diff --git a/src/content/currency.ts b/src/content/currency.ts index fc834a7c65..9522fc8bae 100644 --- a/src/content/currency.ts +++ b/src/content/currency.ts @@ -1,7 +1,7 @@ import { getCombinedCharRange, getNegatedCharRange, - halfWidthNumbers, + startsWithNumeral, } from '../utils/char-range'; import { parseNumber } from './numbers'; @@ -13,25 +13,30 @@ export type CurrencyMeta = { }; export function lookForCurrency({ + currentText, nodeText, - textDelimiter: originalTextDelimeter, + textDelimiter: originalTextDelimiter, }: { + currentText: string; nodeText: string; textDelimiter: RegExp; }): { textDelimiter: RegExp; textEnd: number; } | null { - // We only need to expand the search range if it starts with a currency - // symbol. For the 8千円 case, the regular text lookup will find the necessary - // text. - if (nodeText.length && nodeText[0] !== '¥' && nodeText[0] !== '¥') { + // If the source text might be a currency, expand our text delimiter to allow + // extra symbols that would normally be ignored. + const sourceText = currentText + nodeText; + const mightBeCurrency = + sourceText[0] === '¥' || + sourceText[0] === '¥' || + (startsWithNumeral(sourceText) && sourceText.indexOf('円') > 0); + if (!mightBeCurrency) { return null; } const japaneseOrPrice = getCombinedCharRange([ - getNegatedCharRange(originalTextDelimeter), - halfWidthNumbers, + getNegatedCharRange(originalTextDelimiter), /[¥¥\s,、..。]/, ]); const textDelimiter = getNegatedCharRange(japaneseOrPrice); diff --git a/src/content/measure.ts b/src/content/measure.ts index 9cc242ca73..3a586ff273 100644 --- a/src/content/measure.ts +++ b/src/content/measure.ts @@ -1,14 +1,15 @@ import { getCombinedCharRange, getNegatedCharRange, - startsWithNumber, + startsWithDigit, + startsWithNumeral, } from '../utils/char-range'; import { parseNumber } from './numbers'; export function lookForMeasure({ nodeText, - textDelimiter: originalTextDelimeter, + textDelimiter: originalTextDelimiter, }: { nodeText: string; textDelimiter: RegExp; @@ -16,16 +17,14 @@ export function lookForMeasure({ textDelimiter: RegExp; textEnd: number; } | null { - if (!startsWithNumber(nodeText)) { + if (!startsWithNumeral(nodeText)) { return null; } - // getTextFromTextNode should already have expanded this range to include - // half-width numbers and serparators so we just need to add the units and - // space characters. + const includeSeparators = startsWithDigit(nodeText); const japaneseOrUnit = getCombinedCharRange([ - getNegatedCharRange(originalTextDelimeter), - /[\sm2㎡²]/, + getNegatedCharRange(originalTextDelimiter), + includeSeparators ? /[\sm2㎡²,、..]/ : /[\sm2㎡²]/, ]); const textDelimiter = getNegatedCharRange(japaneseOrUnit); diff --git a/src/content/meta.ts b/src/content/meta.ts index a6e575af77..e210b6a5ff 100644 --- a/src/content/meta.ts +++ b/src/content/meta.ts @@ -33,9 +33,9 @@ export function lookForMetadata({ } { return ( (matchCurrency - ? lookForCurrency({ nodeText, textDelimiter }) + ? lookForCurrency({ currentText, nodeText, textDelimiter }) : undefined) || - lookForEra({ currentText, nodeText, textEnd }) || + lookForEra({ currentText, nodeText, textEnd, textDelimiter }) || lookForShogi({ nodeText, textDelimiter }) || lookForMeasure({ nodeText, textDelimiter }) || { textDelimiter, diff --git a/src/content/scan-text.ts b/src/content/scan-text.ts index 4d98a0f924..cffffa5ece 100644 --- a/src/content/scan-text.ts +++ b/src/content/scan-text.ts @@ -1,8 +1,4 @@ -import { - nonJapaneseChar, - nonJapaneseCharOrNumber, - startsWithNumber, -} from '../utils/char-range'; +import { nonJapaneseChar } from '../utils/char-range'; import { CursorPosition } from './get-cursor-position'; import { GetTextAtPointResult } from './get-text'; import { extractGetTextMetadata, lookForMetadata } from './meta'; @@ -92,12 +88,6 @@ export function scanText({ result.text + nodeText.substring(0, textEnd === -1 ? undefined : textEnd); - // If the source starts with a number, expand our text delimeter to allow - // reading the rest of the number since it might be something like 5つ. - if (!currentText.length && startsWithNumber(nodeText)) { - textDelimiter = nonJapaneseCharOrNumber; - } - // Check if we should further expand the set of allowed characters in // order to recognize certain types of metadata-type strings (e.g. years // or floor space measurements). diff --git a/src/content/shogi.ts b/src/content/shogi.ts index c166747776..1a302d6d92 100644 --- a/src/content/shogi.ts +++ b/src/content/shogi.ts @@ -66,7 +66,7 @@ export type ShogiMeta = { export function lookForShogi({ nodeText, - textDelimiter: originalTextDelimeter, + textDelimiter: originalTextDelimiter, }: { nodeText: string; textDelimiter: RegExp; @@ -79,28 +79,28 @@ export function lookForShogi({ } // If the test starts with one of the shogi side indicators, then we assume - // that the text is a shogi move and we can use the shogi delimeter. + // that the text is a shogi move and we can use the shogi delimiter. if (['▲', '△', '☗', '☖'].includes(nodeText[0])) { return { - textDelimiter: shogiDelimeter, - textEnd: nodeText.search(shogiDelimeter), + textDelimiter: shogiDelimiter, + textEnd: nodeText.search(shogiDelimiter), }; } // Otherwise, if it starts with an Arabic number followed by a kanji number // OR it starts with one of the characters meaning "same position" then - // expand the delimeter range to include all the shogi characters. + // expand the delimiter range to include all the shogi characters. if (!unprefixedShogiStart.test(nodeText)) { return null; } - const expandedDelimeter = getCombinedCharRange([ - getNegatedCharRange(originalTextDelimeter), + const expandedDelimiter = getCombinedCharRange([ + getNegatedCharRange(originalTextDelimiter), /[↑]/, // All the other characters such as 𠔼丶フゝ・○ etc. should already be // covered by `japaneseChar` so we don't need to add them here. ]); - const textDelimiter = getNegatedCharRange(expandedDelimeter); + const textDelimiter = getNegatedCharRange(expandedDelimiter); return { textDelimiter, @@ -109,7 +109,7 @@ export function lookForShogi({ } // This needs to be kept in sync with the regexes below. -const shogiDelimeter = +const shogiDelimiter = /[^▲△☗☖1-91-9一二三四五六七八九同仝-𠔼ド歩兵丶フゝ・香禾キ↑桂土銀ヨ角ク飛ヒ乙金人と成ナ馬マウ龍竜立リ玉王○打引寄上行入右左直行入不生]/u; const unprefixedShogiStart = /^[1-91-9][一二三四五六七八九]|[同仝-𠔼ド]/u; diff --git a/src/content/years.ts b/src/content/years.ts index 2d1edfb44d..128ead4e88 100644 --- a/src/content/years.ts +++ b/src/content/years.ts @@ -1,27 +1,39 @@ +import { getCombinedCharRange, getNegatedCharRange } from '../utils/char-range'; import { parseNumber } from './numbers'; -const nonEraCharacter = /[^\s0-90-9一二三四五六七八九十百元年]/; - export function lookForEra({ currentText, nodeText, + textDelimiter: originalTextDelimiter, textEnd, }: { currentText: string; nodeText: string; + textDelimiter: RegExp; textEnd: number; }): { textDelimiter: RegExp; textEnd: number; } | null { + // We only want to _extend_ the current range so if `textEnd` is already -1 + // (i.e. end of the text) then we don't need to do anything. if (textEnd < 0 || !startsWithEraName(currentText)) { return null; } - const endOfEra = nodeText.substring(textEnd).search(nonEraCharacter); + // The original text delimiter should include all the characters needed to + // match Japanese years except spaces between the era and the year, and + // spaces between the year and the final 年 character, if any. + const japaneseOrSpace = getCombinedCharRange([ + getNegatedCharRange(originalTextDelimiter), + /[\s]/, + ]); + const textDelimiter = getNegatedCharRange(japaneseOrSpace); + + const endOfEra = nodeText.substring(textEnd).search(textDelimiter); return { - textDelimiter: nonEraCharacter, + textDelimiter, textEnd: endOfEra === -1 ? -1 : textEnd + endOfEra, }; } diff --git a/src/utils/char-range.ts b/src/utils/char-range.ts index aa83b2530e..90b66a553b 100644 --- a/src/utils/char-range.ts +++ b/src/utils/char-range.ts @@ -138,6 +138,8 @@ function isCharacterClassRange(re: RegExp): boolean { // typically delimit words. /** @public */ export const japaneseChar = getCombinedCharRange([ + // We include half-width numbers so we can recognize things like 小1 + halfWidthNumbers, fullWidthAlphanumerics, zeroWidthNonJoiner, whiteCircle, @@ -174,18 +176,40 @@ export function getNegatedCharRange(range: RegExp): RegExp { export const nonJapaneseChar = getNegatedCharRange(japaneseChar); -export const nonJapaneseCharOrNumber = getNegatedCharRange( - getCombinedCharRange([japaneseChar, halfWidthNumbers, /[,、..]/]) -); - export function hasKatakana(text: string): boolean { return katakana.test(text); } -const numberStartRegex = /^[0-90-9一二三四五六七八九十百]/; +export function startsWithDigit(input: string): boolean { + const c = input.length ? input.charCodeAt(0) : 0; + return (c >= 48 && c <= 57) || (c >= 65296 && c <= 65305); +} -export function startsWithNumber(input: string): boolean { - return !!input.length && numberStartRegex.test(input); +const kanjiNumerals = [ + '〇', + '一', + '二', + '三', + '四', + '五', + '六', + '七', + '八', + '九', + '十', + '百', + '千', + '万', + '億', + '兆', + '京', +]; + +export function startsWithNumeral(input: string): boolean { + return ( + startsWithDigit(input) || + (!!input.length && kanjiNumerals.includes(input[0])) + ); } const onlyDigits = /^[0-90-9,,、..]+$/; diff --git a/tests/get-text.test.ts b/tests/get-text.test.ts index 4860e8c223..d84db97aca 100644 --- a/tests/get-text.test.ts +++ b/tests/get-text.test.ts @@ -419,6 +419,21 @@ describe('getTextAtPoint', () => { assertTextResultEqual(result, 'あ\u200cい\u200cう\u200c', [textNode, 0, 6]); }); + it('should include trailing half-width numerals', () => { + testDiv.append('小1。'); + const textNode = testDiv.firstChild as Text; + const bbox = getBboxForOffset(textNode, 0); + + const result = getTextAtPoint({ + point: { + x: bbox.left, + y: bbox.top + bbox.height / 2, + }, + }); + + assertTextResultEqual(result, '小1', [textNode, 0, 2]); + }); + it('should include the year when recognizing years', () => { testDiv.append('昭和56年に'); const textNode = testDiv.firstChild as Text; @@ -431,7 +446,7 @@ describe('getTextAtPoint', () => { }, }); - assertTextResultEqual(result, '昭和56年', [textNode, 0, 5]); + assertTextResultEqual(result, '昭和56年に', [textNode, 0, 6]); assert.deepEqual(result!.meta, { type: 'era', era: '昭和', @@ -473,7 +488,7 @@ describe('getTextAtPoint', () => { }, }); - assertTextResultEqual(result, '昭和56年', [textNode, 0, 5]); + assertTextResultEqual(result, '昭和56年に', [textNode, 0, 6]); assert.deepEqual(result!.meta, { type: 'era', era: '昭和', @@ -495,7 +510,7 @@ describe('getTextAtPoint', () => { }, }); - assertTextResultEqual(result, '昭和 56 年', [textNode, 0, 7]); + assertTextResultEqual(result, '昭和 56 年に', [textNode, 0, 8]); assert.deepEqual(result!.meta, { type: 'era', era: '昭和', @@ -517,7 +532,7 @@ describe('getTextAtPoint', () => { }, }); - assertTextResultEqual(result, '昭和56', [textNode, 0, 4]); + assertTextResultEqual(result, '昭和56に', [textNode, 0, 5]); assert.deepEqual(result!.meta, { type: 'era', era: '昭和', @@ -542,10 +557,10 @@ describe('getTextAtPoint', () => { assertTextResultEqual( result, - '昭和56年', + '昭和56年に', [firstTextNode, 0, 2], [middleTextNode, 0, 2], - [lastTextNode, 0, 1] + [lastTextNode, 0, 2] ); assert.deepEqual(result!.meta, { type: 'era', @@ -570,9 +585,9 @@ describe('getTextAtPoint', () => { assertTextResultEqual( result, - '昭和 56年', + '昭和 56年に', [firstTextNode, 0, 3], - [middleTextNode, 0, 4] + [middleTextNode, 0, 5] ); assert.deepEqual(result!.meta, { type: 'era', @@ -586,6 +601,7 @@ describe('getTextAtPoint', () => { testDiv.innerHTML = '昭和56年に'; const firstTextNode = testDiv.firstChild as Text; const middleTextNode = testDiv.childNodes[1].firstChild as Text; + const finalTextNode = testDiv.lastChild as Text; const bbox = getBboxForOffset(firstTextNode, 0); const result = getTextAtPoint({ @@ -597,9 +613,10 @@ describe('getTextAtPoint', () => { assertTextResultEqual( result, - '昭和56年', + '昭和56年に', [firstTextNode, 0, 2], - [middleTextNode, 0, 3] + [middleTextNode, 0, 3], + [finalTextNode, 0, 1] ); assert.deepEqual(result!.meta, { type: 'era', @@ -678,7 +695,7 @@ describe('getTextAtPoint', () => { }); }); - it('should stop at delimeters (even when matching years)', () => { + it('should stop at delimiters (even when matching years)', () => { testDiv.append('昭和三大馬鹿査定」発言に'); const textNode = testDiv.firstChild as Text; const bbox = getBboxForOffset(textNode, 0); @@ -719,7 +736,7 @@ describe('getTextAtPoint', () => { }); }); - it('should recognize Japanese yen values that start with ¥', () => { + it('should recognize Japanese yen values that start with ¥ (full-width)', () => { testDiv.append('価格¥8万8千です'); const textNode = testDiv.firstChild as Text; const bbox = getBboxForOffset(textNode, 2); @@ -739,7 +756,7 @@ describe('getTextAtPoint', () => { }); }); - it('should recognize Japanese yen values that start with ¥', () => { + it('should recognize Japanese yen values that start with ¥ (half-width)', () => { testDiv.append('価格¥ 8万8千です'); const textNode = testDiv.firstChild as Text; const bbox = getBboxForOffset(textNode, 2); @@ -759,6 +776,32 @@ describe('getTextAtPoint', () => { }); }); + it('should recognize Japanese yen values that start with ¥ in a separate span', () => { + testDiv.innerHTML = '¥ 88,000です'; + const firstTextNode = testDiv.childNodes[0].firstChild as Text; + const secondTextNode = testDiv.childNodes[1] as Text; + const bbox = getBboxForOffset(firstTextNode, 0); + + const result = getTextAtPoint({ + point: { + x: bbox.left + bbox.width / 2, + y: bbox.top + bbox.height / 2, + }, + }); + + assertTextResultEqual( + result, + '¥ 88,000です', + [firstTextNode, 0, 1], + [secondTextNode, 0, 9] + ); + assert.deepEqual(result!.meta, { + type: 'currency', + value: 88000, + matchLen: 8, + }); + }); + it('should recognize Japanese yen values that include commas', () => { testDiv.append('価格8,800円です'); const textNode = testDiv.firstChild as Text; diff --git a/tests/playground.html b/tests/playground.html index d53b85f46e..d211022598 100644 --- a/tests/playground.html +++ b/tests/playground.html @@ -127,10 +127,11 @@