Skip to content

Commit eedb297

Browse files
committed
Enforce order for results with same confidence
1 parent b7413ea commit eedb297

9 files changed

+56
-19
lines changed

2022.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@ type recognizer2022 struct {
99
escapes [][]byte
1010
}
1111

12-
func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
12+
func (r *recognizer2022) Match(input *recognizerInput, order int) (output recognizerOutput) {
1313
return recognizerOutput{
1414
Charset: r.charset,
1515
Confidence: r.matchConfidence(input.input),
16+
order: order,
1617
}
1718
}
1819

detector.go

+15-10
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ type Result struct {
1414
Language string
1515
// Confidence of the Result, on a scale from 1 to 100. Higher values mean greater confidence.
1616
Confidence int
17+
18+
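// order is the index of the recognizer that produced this Result; used internally to break ties when sorting results with equal Confidence (lower order wins).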
19+
order int
1720
}
1821

1922
// Detector implements charset detection.
@@ -87,13 +90,13 @@ var (
8790
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
8891
input := newRecognizerInput(b, d.stripTag)
8992
outputChan := make(chan recognizerOutput)
90-
for _, r := range d.recognizers {
91-
go matchHelper(r, input, outputChan)
93+
for i, r := range d.recognizers {
94+
go matchHelper(r, input, outputChan, i)
9295
}
9396
var output Result
9497
for i := 0; i < len(d.recognizers); i++ {
9598
o := <-outputChan
96-
if output.Confidence < o.Confidence {
99+
if output.Confidence < o.Confidence || (output.Confidence == o.Confidence && o.order < output.order) {
97100
output = Result(o)
98101
}
99102
}
@@ -107,8 +110,8 @@ func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
107110
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
108111
input := newRecognizerInput(b, d.stripTag)
109112
outputChan := make(chan recognizerOutput)
110-
for _, r := range d.recognizers {
111-
go matchHelper(r, input, outputChan)
113+
for i, r := range d.recognizers {
114+
go matchHelper(r, input, outputChan, i)
112115
}
113116
outputs := make(recognizerOutputs, 0, len(d.recognizers))
114117
for i := 0; i < len(d.recognizers); i++ {
@@ -136,12 +139,14 @@ func (d *Detector) DetectAll(b []byte) ([]Result, error) {
136139
return dedupOutputs, nil
137140
}
138141

139-
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
140-
outputChan <- r.Match(input)
142+
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput, order int) {
143+
outputChan <- r.Match(input, order)
141144
}
142145

143146
type recognizerOutputs []recognizerOutput
144147

145-
func (r recognizerOutputs) Len() int { return len(r) }
146-
func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
147-
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
148+
func (r recognizerOutputs) Len() int { return len(r) }
149+
func (r recognizerOutputs) Less(i, j int) bool {
150+
return r[i].Confidence > r[j].Confidence || (r[i].Confidence == r[j].Confidence && r[i].order < r[j].order)
151+
}
152+
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }

detector_test.go

+24-1
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@ package chardet_test
22

33
import (
44
"bytes"
5-
"github.com/gogs/chardet"
65
"io"
76
"os"
87
"path/filepath"
98
"testing"
9+
10+
"github.com/gogs/chardet"
1011
)
1112

1213
func TestDetector(t *testing.T) {
@@ -58,6 +59,28 @@ func TestDetector(t *testing.T) {
5859
t.Errorf("Expected language %s, actual %s", d.Language, result.Language)
5960
}
6061
}
62+
63+
// "ノエル" encoded as Shift_JIS.
64+
test := []byte("\x83m\x83G\x83\x8b")
65+
66+
result, err := textDetector.DetectAll(test)
67+
if err != nil {
68+
t.Fatal(err)
69+
}
70+
if len(result) != 3 {
71+
t.Errorf("Expected 3 results, actual %d", len(result))
72+
}
73+
if result[0].Charset != "Shift_JIS" || result[1].Charset != "GB18030" || result[2].Charset != "Big5" {
74+
t.Errorf("DetectAll order is wrong: %v", result)
75+
}
76+
77+
singleResult, err := textDetector.DetectBest(test)
78+
if err != nil {
79+
t.Fatal(err)
80+
}
81+
if singleResult.Charset != "Shift_JIS" {
82+
t.Errorf("DetectBest result is wrong: %v", singleResult)
83+
}
6184
}
6285

6386
func BenchmarkDetectBest(b *testing.B) {

go.mod

+2
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
module github.com/gogs/chardet
2+
3+
go 1.19

multi_byte.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@ type charDecoder interface {
1616
DecodeOneChar([]byte) (c uint16, remain []byte, err error)
1717
}
1818

19-
func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
19+
func (r *recognizerMultiByte) Match(input *recognizerInput, order int) (output recognizerOutput) {
2020
return recognizerOutput{
2121
Charset: r.charset,
2222
Language: r.language,
2323
Confidence: r.matchConfidence(input),
24+
order: order,
2425
}
2526
}
2627

recognizer.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package chardet
22

33
type recognizer interface {
4-
Match(*recognizerInput) recognizerOutput
4+
Match(*recognizerInput, int) recognizerOutput
55
}
66

77
type recognizerOutput Result

single_byte.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ type recognizerSingleByte struct {
99
ngram *[64]uint32
1010
}
1111

12-
func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
12+
func (r *recognizerSingleByte) Match(input *recognizerInput, order int) recognizerOutput {
1313
var charset string = r.charset
1414
if input.hasC1Bytes && len(r.hasC1ByteCharset) > 0 {
1515
charset = r.hasC1ByteCharset
@@ -18,6 +18,7 @@ func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
1818
Charset: charset,
1919
Language: r.language,
2020
Confidence: r.parseNgram(input.input),
21+
order: order,
2122
}
2223
}
2324

unicode.go

+6-3
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@ func newRecognizer_utf16be() *recognizerUtf16be {
1818
return &recognizerUtf16be{}
1919
}
2020

21-
func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
21+
func (*recognizerUtf16be) Match(input *recognizerInput, order int) (output recognizerOutput) {
2222
output = recognizerOutput{
2323
Charset: "UTF-16BE",
24+
order: order,
2425
}
2526
if bytes.HasPrefix(input.raw, utf16beBom) {
2627
output.Confidence = 100
@@ -35,9 +36,10 @@ func newRecognizer_utf16le() *recognizerUtf16le {
3536
return &recognizerUtf16le{}
3637
}
3738

38-
func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
39+
func (*recognizerUtf16le) Match(input *recognizerInput, order int) (output recognizerOutput) {
3940
output = recognizerOutput{
4041
Charset: "UTF-16LE",
42+
order: order,
4143
}
4244
if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
4345
output.Confidence = 100
@@ -75,9 +77,10 @@ func newRecognizer_utf32le() *recognizerUtf32 {
7577
}
7678
}
7779

78-
func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
80+
func (r *recognizerUtf32) Match(input *recognizerInput, order int) (output recognizerOutput) {
7981
output = recognizerOutput{
8082
Charset: r.name,
83+
order: order,
8184
}
8285
hasBom := bytes.HasPrefix(input.raw, r.bom)
8386
var numValid, numInvalid uint32

utf8.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@ func newRecognizer_utf8() *recognizerUtf8 {
1313
return &recognizerUtf8{}
1414
}
1515

16-
func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
16+
func (*recognizerUtf8) Match(input *recognizerInput, order int) (output recognizerOutput) {
1717
output = recognizerOutput{
1818
Charset: "UTF-8",
19+
order: order,
1920
}
2021
hasBom := bytes.HasPrefix(input.raw, utf8Bom)
2122
inputLen := len(input.raw)

0 commit comments

Comments
 (0)