Skip to content

Commit

Permalink
fix: recognize words that end in half-width numerals like 小1
Browse files Browse the repository at this point in the history
As reported here:
#709 (comment)
  • Loading branch information
birtles committed Oct 24, 2023
1 parent cd77a9e commit 4538ebf
Show file tree
Hide file tree
Showing 10 changed files with 138 additions and 63 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ app.
([#1314](https://github.com/birchill/10ten-ja-reader/issues/1314)).
- Make the `x` key close the popup if it is configured to _both_ close it and
expand it.
- Fixed recognition of words that end in half-width numerals like 小1.
- Made the options page show up in a new tab on Edge.
- Added a workaround for a
[Firefox bug](https://bugzilla.mozilla.org/show_bug.cgi?id=1860486) that would
Expand Down
21 changes: 13 additions & 8 deletions src/content/currency.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import {
getCombinedCharRange,
getNegatedCharRange,
halfWidthNumbers,
startsWithNumeral,
} from '../utils/char-range';

import { parseNumber } from './numbers';
Expand All @@ -13,25 +13,30 @@ export type CurrencyMeta = {
};

export function lookForCurrency({
currentText,
nodeText,
textDelimiter: originalTextDelimeter,
textDelimiter: originalTextDelimiter,
}: {
currentText: string;
nodeText: string;
textDelimiter: RegExp;
}): {
textDelimiter: RegExp;
textEnd: number;
} | null {
// We only need to expand the search range if it starts with a currency
// symbol. For the 8千円 case, the regular text lookup will find the necessary
// text.
if (nodeText.length && nodeText[0] !== '¥' && nodeText[0] !== '¥') {
// If the source text might be a currency, expand our text delimiter to allow
// extra symbols that would normally be ignored.
const sourceText = currentText + nodeText;
const mightBeCurrency =
sourceText[0] === '¥' ||
sourceText[0] === '¥' ||
(startsWithNumeral(sourceText) && sourceText.indexOf('円') > 0);
if (!mightBeCurrency) {
return null;
}

const japaneseOrPrice = getCombinedCharRange([
getNegatedCharRange(originalTextDelimeter),
halfWidthNumbers,
getNegatedCharRange(originalTextDelimiter),
/[¥\s,.]/,
]);
const textDelimiter = getNegatedCharRange(japaneseOrPrice);
Expand Down
15 changes: 7 additions & 8 deletions src/content/measure.ts
Original file line number Diff line number Diff line change
@@ -1,31 +1,30 @@
import {
getCombinedCharRange,
getNegatedCharRange,
startsWithNumber,
startsWithDigit,
startsWithNumeral,
} from '../utils/char-range';

import { parseNumber } from './numbers';

export function lookForMeasure({
nodeText,
textDelimiter: originalTextDelimeter,
textDelimiter: originalTextDelimiter,
}: {
nodeText: string;
textDelimiter: RegExp;
}): {
textDelimiter: RegExp;
textEnd: number;
} | null {
if (!startsWithNumber(nodeText)) {
if (!startsWithNumeral(nodeText)) {
return null;
}

// getTextFromTextNode should already have expanded this range to include
// half-width numbers and serparators so we just need to add the units and
// space characters.
const includeSeparators = startsWithDigit(nodeText);
const japaneseOrUnit = getCombinedCharRange([
getNegatedCharRange(originalTextDelimeter),
/[\sm2²]/,
getNegatedCharRange(originalTextDelimiter),
includeSeparators ? /[\sm2²,.]/ : /[\sm2²]/,
]);
const textDelimiter = getNegatedCharRange(japaneseOrUnit);

Expand Down
4 changes: 2 additions & 2 deletions src/content/meta.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ export function lookForMetadata({
} {
return (
(matchCurrency
? lookForCurrency({ nodeText, textDelimiter })
? lookForCurrency({ currentText, nodeText, textDelimiter })
: undefined) ||
lookForEra({ currentText, nodeText, textEnd }) ||
lookForEra({ currentText, nodeText, textEnd, textDelimiter }) ||
lookForShogi({ nodeText, textDelimiter }) ||
lookForMeasure({ nodeText, textDelimiter }) || {
textDelimiter,
Expand Down
12 changes: 1 addition & 11 deletions src/content/scan-text.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
import {
nonJapaneseChar,
nonJapaneseCharOrNumber,
startsWithNumber,
} from '../utils/char-range';
import { nonJapaneseChar } from '../utils/char-range';
import { CursorPosition } from './get-cursor-position';
import { GetTextAtPointResult } from './get-text';
import { extractGetTextMetadata, lookForMetadata } from './meta';
Expand Down Expand Up @@ -92,12 +88,6 @@ export function scanText({
result.text +
nodeText.substring(0, textEnd === -1 ? undefined : textEnd);

// If the source starts with a number, expand our text delimeter to allow
// reading the rest of the number since it might be something like 5つ.
if (!currentText.length && startsWithNumber(nodeText)) {
textDelimiter = nonJapaneseCharOrNumber;
}

// Check if we should further expand the set of allowed characters in
// order to recognize certain types of metadata-type strings (e.g. years
// or floor space measurements).
Expand Down
18 changes: 9 additions & 9 deletions src/content/shogi.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ export type ShogiMeta = {

export function lookForShogi({
nodeText,
textDelimiter: originalTextDelimeter,
textDelimiter: originalTextDelimiter,
}: {
nodeText: string;
textDelimiter: RegExp;
Expand All @@ -79,28 +79,28 @@ export function lookForShogi({
}

// If the text starts with one of the shogi side indicators, then we assume
// that the text is a shogi move and we can use the shogi delimeter.
// that the text is a shogi move and we can use the shogi delimiter.
if (['▲', '△', '☗', '☖'].includes(nodeText[0])) {
return {
textDelimiter: shogiDelimeter,
textEnd: nodeText.search(shogiDelimeter),
textDelimiter: shogiDelimiter,
textEnd: nodeText.search(shogiDelimiter),
};
}

// Otherwise, if it starts with an Arabic number followed by a kanji number
// OR it starts with one of the characters meaning "same position" then
// expand the delimeter range to include all the shogi characters.
// expand the delimiter range to include all the shogi characters.
if (!unprefixedShogiStart.test(nodeText)) {
return null;
}

const expandedDelimeter = getCombinedCharRange([
getNegatedCharRange(originalTextDelimeter),
const expandedDelimiter = getCombinedCharRange([
getNegatedCharRange(originalTextDelimiter),
/[]/,
// All the other characters such as 𠔼丶フゝ・○ etc. should already be
// covered by `japaneseChar` so we don't need to add them here.
]);
const textDelimiter = getNegatedCharRange(expandedDelimeter);
const textDelimiter = getNegatedCharRange(expandedDelimiter);

return {
textDelimiter,
Expand All @@ -109,7 +109,7 @@ export function lookForShogi({
}

// This needs to be kept in sync with the regexes below.
const shogiDelimeter =
const shogiDelimiter =
/[^1-9-𠔼]/u;
const unprefixedShogiStart = /^[1-9-][]|[𠔼]/u;

Expand Down
20 changes: 16 additions & 4 deletions src/content/years.ts
Original file line number Diff line number Diff line change
@@ -1,27 +1,39 @@
import { getCombinedCharRange, getNegatedCharRange } from '../utils/char-range';
import { parseNumber } from './numbers';

const nonEraCharacter = /[^\s0-9-]/;

export function lookForEra({
currentText,
nodeText,
textDelimiter: originalTextDelimiter,
textEnd,
}: {
currentText: string;
nodeText: string;
textDelimiter: RegExp;
textEnd: number;
}): {
textDelimiter: RegExp;
textEnd: number;
} | null {
// We only want to _extend_ the current range so if `textEnd` is already -1
// (i.e. end of the text) then we don't need to do anything.
if (textEnd < 0 || !startsWithEraName(currentText)) {
return null;
}

const endOfEra = nodeText.substring(textEnd).search(nonEraCharacter);
// The original text delimiter should include all the characters needed to
// match Japanese years except spaces between the era and the year, and
// spaces between the year and the final 年 character, if any.
const japaneseOrSpace = getCombinedCharRange([
getNegatedCharRange(originalTextDelimiter),
/[\s]/,
]);
const textDelimiter = getNegatedCharRange(japaneseOrSpace);

const endOfEra = nodeText.substring(textEnd).search(textDelimiter);

return {
textDelimiter: nonEraCharacter,
textDelimiter,
textEnd: endOfEra === -1 ? -1 : textEnd + endOfEra,
};
}
Expand Down
38 changes: 31 additions & 7 deletions src/utils/char-range.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ function isCharacterClassRange(re: RegExp): boolean {
// typically delimit words.
/** @public */
export const japaneseChar = getCombinedCharRange([
// We include half-width numbers so we can recognize things like 小1
halfWidthNumbers,
fullWidthAlphanumerics,
zeroWidthNonJoiner,
whiteCircle,
Expand Down Expand Up @@ -174,18 +176,40 @@ export function getNegatedCharRange(range: RegExp): RegExp {

export const nonJapaneseChar = getNegatedCharRange(japaneseChar);

export const nonJapaneseCharOrNumber = getNegatedCharRange(
getCombinedCharRange([japaneseChar, halfWidthNumbers, /[,.]/])
);

export function hasKatakana(text: string): boolean {
return katakana.test(text);
}

const numberStartRegex = /^[0-9-]/;
/**
 * Reports whether the first UTF-16 code unit of `input` is a decimal digit,
 * either half-width (U+0030–U+0039) or full-width (U+FF10–U+FF19).
 *
 * @param input - The text to inspect. An empty string yields `false`.
 * @returns `true` if the text begins with a half-width or full-width digit.
 */
export function startsWithDigit(input: string): boolean {
  if (!input.length) {
    return false;
  }

  const first = input.charCodeAt(0);
  const isHalfWidthDigit = first >= 0x30 && first <= 0x39;
  const isFullWidthDigit = first >= 0xff10 && first <= 0xff19;

  return isHalfWidthDigit || isFullWidthDigit;
}

export function startsWithNumber(input: string): boolean {
return !!input.length && numberStartRegex.test(input);
// Kanji characters treated as numerals when checking how a string begins:
// the digits zero through nine followed by the multipliers for ten, hundred,
// thousand, ten thousand, and the larger powers (億, 兆, 京).
//
// Each character is a single UTF-16 code unit so splitting the string yields
// one array entry per numeral.
const kanjiNumerals = Array.from('〇一二三四五六七八九十百千万億兆京');

/**
 * Reports whether `input` begins with a numeral: either a digit accepted by
 * `startsWithDigit` or one of the characters listed in `kanjiNumerals`.
 *
 * @param input - The text to inspect. An empty string yields `false`.
 * @returns `true` if the first character is a digit or a kanji numeral.
 */
export function startsWithNumeral(input: string): boolean {
  if (startsWithDigit(input)) {
    return true;
  }

  return input.length > 0 && kanjiNumerals.includes(input[0]);
}

// Matches text made up solely of half-width digits (0-9), full-width digits
// (U+FF10–U+FF19), commas, and periods.
//
// The full-width range is spelled with \u escapes so that it stays intact
// even if the file passes through tooling that drops or rewrites non-ASCII
// characters; without it the class degrades to `[0-9-,.]`, which accepts a
// literal hyphen and rejects the full-width digits that `startsWithDigit`
// accepts.
const onlyDigits = /^[0-9\uff10-\uff19,.]+$/;
Expand Down
Loading

0 comments on commit 4538ebf

Please sign in to comment.