Enforce order for results with same confidence

xWTF · xWTF · commit eedb297150c7 · 2023-01-30T17:56:20.000+08:00
diff --git a/2022.go b/2022.go
@@ -9,10 +9,11 @@ type recognizer2022 struct {
 	escapes [][]byte
 }
 
-func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
+func (r *recognizer2022) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	return recognizerOutput{
 		Charset:    r.charset,
 		Confidence: r.matchConfidence(input.input),
+		order:      order,
 	}
 }
 
diff --git a/detector.go b/detector.go
@@ -14,6 +14,9 @@ type Result struct {
 	Language string
 	// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
 	Confidence int
+
+	// used for sorting internally
+	order int
 }
 
 // Detector implements charset detection.
@@ -87,13 +90,13 @@ var (
 func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
 	input := newRecognizerInput(b, d.stripTag)
 	outputChan := make(chan recognizerOutput)
-	for _, r := range d.recognizers {
-		go matchHelper(r, input, outputChan)
+	for i, r := range d.recognizers {
+		go matchHelper(r, input, outputChan, i)
 	}
 	var output Result
 	for i := 0; i < len(d.recognizers); i++ {
 		o := <-outputChan
-		if output.Confidence < o.Confidence {
+		if output.Confidence < o.Confidence || (output.Confidence == o.Confidence && o.order < output.order) {
 			output = Result(o)
 		}
 	}
@@ -107,8 +110,8 @@ func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
 func (d *Detector) DetectAll(b []byte) ([]Result, error) {
 	input := newRecognizerInput(b, d.stripTag)
 	outputChan := make(chan recognizerOutput)
-	for _, r := range d.recognizers {
-		go matchHelper(r, input, outputChan)
+	for i, r := range d.recognizers {
+		go matchHelper(r, input, outputChan, i)
 	}
 	outputs := make(recognizerOutputs, 0, len(d.recognizers))
 	for i := 0; i < len(d.recognizers); i++ {
@@ -136,12 +139,14 @@ func (d *Detector) DetectAll(b []byte) ([]Result, error) {
 	return dedupOutputs, nil
 }
 
-func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
-	outputChan <- r.Match(input)
+func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput, order int) {
+	outputChan <- r.Match(input, order)
 }
 
 type recognizerOutputs []recognizerOutput
 
-func (r recognizerOutputs) Len() int           { return len(r) }
-func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
-func (r recognizerOutputs) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
+func (r recognizerOutputs) Len() int { return len(r) }
+func (r recognizerOutputs) Less(i, j int) bool {
+	return r[i].Confidence > r[j].Confidence || (r[i].Confidence == r[j].Confidence && r[i].order < r[j].order)
+}
+func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
diff --git a/detector_test.go b/detector_test.go
@@ -2,11 +2,12 @@ package chardet_test
 
 import (
 	"bytes"
-	"github.com/gogs/chardet"
 	"io"
 	"os"
 	"path/filepath"
 	"testing"
+
+	"github.com/gogs/chardet"
 )
 
 func TestDetector(t *testing.T) {
@@ -58,6 +59,28 @@ func TestDetector(t *testing.T) {
 			t.Errorf("Expected language %s, actual %s", d.Language, result.Language)
 		}
 	}
+
+	// "ノエル" Shift JIS encoded
+	test := []byte("\x83m\x83G\x83\x8b")
+
+	result, err := textDetector.DetectAll(test)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(result) != 3 {
+		t.Errorf("Expected 3 results, actual %d", len(result))
+	}
+	if result[0].Charset != "Shift_JIS" || result[1].Charset != "GB18030" || result[2].Charset != "Big5" {
+		t.Errorf("DetectAll order is wrong: %v", result)
+	}
+
+	singleResult, err := textDetector.DetectBest(test)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if singleResult.Charset != "Shift_JIS" {
+		t.Errorf("DetectBest result is wrong: %v", singleResult)
+	}
 }
 
 func BenchmarkDetectBest(b *testing.B) {
diff --git a/go.mod b/go.mod
@@ -1 +1,3 @@
 module github.com/gogs/chardet
+
+go 1.19
diff --git a/multi_byte.go b/multi_byte.go
@@ -16,11 +16,12 @@ type charDecoder interface {
 	DecodeOneChar([]byte) (c uint16, remain []byte, err error)
 }
 
-func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
+func (r *recognizerMultiByte) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	return recognizerOutput{
 		Charset:    r.charset,
 		Language:   r.language,
 		Confidence: r.matchConfidence(input),
+		order:      order,
 	}
 }
 
diff --git a/recognizer.go b/recognizer.go
@@ -1,7 +1,7 @@
 package chardet
 
 type recognizer interface {
-	Match(*recognizerInput) recognizerOutput
+	Match(*recognizerInput, int) recognizerOutput
 }
 
 type recognizerOutput Result
diff --git a/single_byte.go b/single_byte.go
@@ -9,7 +9,7 @@ type recognizerSingleByte struct {
 	ngram            *[64]uint32
 }
 
-func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
+func (r *recognizerSingleByte) Match(input *recognizerInput, order int) recognizerOutput {
 	var charset string = r.charset
 	if input.hasC1Bytes && len(r.hasC1ByteCharset) > 0 {
 		charset = r.hasC1ByteCharset
@@ -18,6 +18,7 @@ func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
 		Charset:    charset,
 		Language:   r.language,
 		Confidence: r.parseNgram(input.input),
+		order:      order,
 	}
 }
 
diff --git a/unicode.go b/unicode.go
@@ -18,9 +18,10 @@ func newRecognizer_utf16be() *recognizerUtf16be {
 	return &recognizerUtf16be{}
 }
 
-func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
+func (*recognizerUtf16be) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: "UTF-16BE",
+		order:   order,
 	}
 	if bytes.HasPrefix(input.raw, utf16beBom) {
 		output.Confidence = 100
@@ -35,9 +36,10 @@ func newRecognizer_utf16le() *recognizerUtf16le {
 	return &recognizerUtf16le{}
 }
 
-func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
+func (*recognizerUtf16le) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: "UTF-16LE",
+		order:   order,
 	}
 	if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
 		output.Confidence = 100
@@ -75,9 +77,10 @@ func newRecognizer_utf32le() *recognizerUtf32 {
 	}
 }
 
-func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
+func (r *recognizerUtf32) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: r.name,
+		order:   order,
 	}
 	hasBom := bytes.HasPrefix(input.raw, r.bom)
 	var numValid, numInvalid uint32
diff --git a/utf8.go b/utf8.go
@@ -13,9 +13,10 @@ func newRecognizer_utf8() *recognizerUtf8 {
 	return &recognizerUtf8{}
 }
 
-func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
+func (*recognizerUtf8) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: "UTF-8",
+		order:   order,
 	}
 	hasBom := bytes.HasPrefix(input.raw, utf8Bom)
 	inputLen := len(input.raw)

Original file line number	Diff line number	Diff line change
`@@ -9,10 +9,11 @@ type recognizer2022 struct {`
`9`	`9`	`escapes [][]byte`
`10`	`10`	`}`
`11`	`11`
`12`		`-func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {`
	`12`	`+func (r *recognizer2022) Match(input *recognizerInput, order int) (output recognizerOutput) {`
`13`	`13`	`return recognizerOutput{`
`14`	`14`	`Charset: r.charset,`
`15`	`15`	`Confidence: r.matchConfidence(input.input),`
	`16`	`+ order: order,`
`16`	`17`	`}`
`17`	`18`	`}`
`18`	`19`
Original file line number	Diff line number	Diff line change
`@@ -1 +1,3 @@`
`1`	`1`	`module github.com/gogs/chardet`
	`2`	`+`
	`3`	`+go 1.19`
Original file line number	Diff line number	Diff line change
`@@ -16,11 +16,12 @@ type charDecoder interface {`
`16`	`16`	`DecodeOneChar([]byte) (c uint16, remain []byte, err error)`
`17`	`17`	`}`
`18`	`18`
`19`		`-func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {`
	`19`	`+func (r *recognizerMultiByte) Match(input *recognizerInput, order int) (output recognizerOutput) {`
`20`	`20`	`return recognizerOutput{`
`21`	`21`	`Charset: r.charset,`
`22`	`22`	`Language: r.language,`
`23`	`23`	`Confidence: r.matchConfidence(input),`
	`24`	`+ order: order,`
`24`	`25`	`}`
`25`	`26`	`}`
`26`	`27`
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`package chardet`
`2`	`2`
`3`	`3`	`type recognizer interface {`
`4`		`- Match(*recognizerInput) recognizerOutput`
	`4`	`+ Match(*recognizerInput, int) recognizerOutput`
`5`	`5`	`}`
`6`	`6`
`7`	`7`	`type recognizerOutput Result`
Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@ type recognizerSingleByte struct {`
`9`	`9`	`ngram *[64]uint32`
`10`	`10`	`}`
`11`	`11`
`12`		`-func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {`
	`12`	`+func (r *recognizerSingleByte) Match(input *recognizerInput, order int) recognizerOutput {`
`13`	`13`	`var charset string = r.charset`
`14`	`14`	`if input.hasC1Bytes && len(r.hasC1ByteCharset) > 0 {`
`15`	`15`	`charset = r.hasC1ByteCharset`
`@@ -18,6 +18,7 @@ func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {`
`18`	`18`	`Charset: charset,`
`19`	`19`	`Language: r.language,`
`20`	`20`	`Confidence: r.parseNgram(input.input),`
	`21`	`+ order: order,`
`21`	`22`	`}`
`22`	`23`	`}`
`23`	`24`
Original file line number	Diff line number	Diff line change
`@@ -18,9 +18,10 @@ func newRecognizer_utf16be() *recognizerUtf16be {`
`18`	`18`	`return &recognizerUtf16be{}`
`19`	`19`	`}`
`20`	`20`
`21`		`-func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {`
	`21`	`+func (*recognizerUtf16be) Match(input *recognizerInput, order int) (output recognizerOutput) {`
`22`	`22`	`output = recognizerOutput{`
`23`	`23`	`Charset: "UTF-16BE",`
	`24`	`+ order: order,`
`24`	`25`	`}`
`25`	`26`	`if bytes.HasPrefix(input.raw, utf16beBom) {`
`26`	`27`	`output.Confidence = 100`
`@@ -35,9 +36,10 @@ func newRecognizer_utf16le() *recognizerUtf16le {`
`35`	`36`	`return &recognizerUtf16le{}`
`36`	`37`	`}`
`37`	`38`
`38`		`-func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {`
	`39`	`+func (*recognizerUtf16le) Match(input *recognizerInput, order int) (output recognizerOutput) {`
`39`	`40`	`output = recognizerOutput{`
`40`	`41`	`Charset: "UTF-16LE",`
	`42`	`+ order: order,`
`41`	`43`	`}`
`42`	`44`	`if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {`
`43`	`45`	`output.Confidence = 100`
`@@ -75,9 +77,10 @@ func newRecognizer_utf32le() *recognizerUtf32 {`
`75`	`77`	`}`
`76`	`78`	`}`
`77`	`79`
`78`		`-func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {`
	`80`	`+func (r *recognizerUtf32) Match(input *recognizerInput, order int) (output recognizerOutput) {`
`79`	`81`	`output = recognizerOutput{`
`80`	`82`	`Charset: r.name,`
	`83`	`+ order: order,`
`81`	`84`	`}`
`82`	`85`	`hasBom := bytes.HasPrefix(input.raw, r.bom)`
`83`	`86`	`var numValid, numInvalid uint32`
Original file line number	Diff line number	Diff line change
`@@ -13,9 +13,10 @@ func newRecognizer_utf8() *recognizerUtf8 {`
`13`	`13`	`return &recognizerUtf8{}`
`14`	`14`	`}`
`15`	`15`
`16`		`-func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {`
	`16`	`+func (*recognizerUtf8) Match(input *recognizerInput, order int) (output recognizerOutput) {`
`17`	`17`	`output = recognizerOutput{`
`18`	`18`	`Charset: "UTF-8",`
	`19`	`+ order: order,`
`19`	`20`	`}`
`20`	`21`	`hasBom := bytes.HasPrefix(input.raw, utf8Bom)`
`21`	`22`	`inputLen := len(input.raw)`