gogs · xWTF · Jan 30, 2023
diff --git a/2022.go b/2022.go
@@ -9,10 +9,11 @@ type recognizer2022 struct {
 	escapes [][]byte
 }
 
-func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
+func (r *recognizer2022) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	return recognizerOutput{
 		Charset:    r.charset,
 		Confidence: r.matchConfidence(input.input),
+		order:      order,
 	}
 }
 

diff --git a/detector.go b/detector.go
@@ -14,6 +14,9 @@ type Result struct {
 	Language string
 	// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
 	Confidence int
+
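+	// order is the index of the recognizer that produced this result; it is used internally to break ties between results with equal Confidence.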
+	order int
 }
 
 // Detector implements charset detection.
@@ -87,13 +90,13 @@ var (
 func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
 	input := newRecognizerInput(b, d.stripTag)
 	outputChan := make(chan recognizerOutput)
-	for _, r := range d.recognizers {
-		go matchHelper(r, input, outputChan)
+	for i, r := range d.recognizers {
+		go matchHelper(r, input, outputChan, i)
 	}
 	var output Result
 	for i := 0; i < len(d.recognizers); i++ {
 		o := <-outputChan
-		if output.Confidence < o.Confidence {
+		if output.Confidence < o.Confidence || (output.Confidence == o.Confidence && o.order < output.order) {
 			output = Result(o)
 		}
 	}
@@ -107,8 +110,8 @@ func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
 func (d *Detector) DetectAll(b []byte) ([]Result, error) {
 	input := newRecognizerInput(b, d.stripTag)
 	outputChan := make(chan recognizerOutput)
-	for _, r := range d.recognizers {
-		go matchHelper(r, input, outputChan)
+	for i, r := range d.recognizers {
+		go matchHelper(r, input, outputChan, i)
 	}
 	outputs := make(recognizerOutputs, 0, len(d.recognizers))
 	for i := 0; i < len(d.recognizers); i++ {
@@ -136,12 +139,14 @@ func (d *Detector) DetectAll(b []byte) ([]Result, error) {
 	return dedupOutputs, nil
 }
 
-func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
-	outputChan <- r.Match(input)
+func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput, order int) {
+	outputChan <- r.Match(input, order)
 }
 
 type recognizerOutputs []recognizerOutput
 
-func (r recognizerOutputs) Len() int           { return len(r) }
-func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
-func (r recognizerOutputs) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
+func (r recognizerOutputs) Len() int { return len(r) }
+func (r recognizerOutputs) Less(i, j int) bool {
+	return r[i].Confidence > r[j].Confidence || (r[i].Confidence == r[j].Confidence && r[i].order < r[j].order)
+}
+func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
diff --git a/detector_test.go b/detector_test.go
@@ -2,11 +2,12 @@ package chardet_test
 
 import (
 	"bytes"
-	"github.com/gogs/chardet"
 	"io"
 	"os"
 	"path/filepath"
 	"testing"
+
+	"github.com/gogs/chardet"
 )
 
 func TestDetector(t *testing.T) {
@@ -58,6 +59,28 @@ func TestDetector(t *testing.T) {
 			t.Errorf("Expected language %s, actual %s", d.Language, result.Language)
 		}
 	}
+
+	// "ノエル" encoded as Shift_JIS; used below to check the result ordering of DetectAll and DetectBest.
+	test := []byte("\x83m\x83G\x83\x8b")
+
+	result, err := textDetector.DetectAll(test)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(result) != 3 { // result[0], result[1] and result[2] are indexed right after this check
+		t.Fatalf("Expected 3 results, actual %d", len(result)) // abort: continuing would panic on a short slice
+	}
+	if result[0].Charset != "Shift_JIS" || result[1].Charset != "GB18030" || result[2].Charset != "Big5" {
+		t.Errorf("DetectAll order is wrong: %v", result)
+	}
+
+	singleResult, err := textDetector.DetectBest(test)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if singleResult.Charset != "Shift_JIS" {
+		t.Errorf("DetectBest result is wrong: %v", singleResult)
+	}
 }
 
 func BenchmarkDetectBest(b *testing.B) {

diff --git a/go.mod b/go.mod
@@ -1 +1,3 @@
 module github.com/gogs/chardet
+
+go 1.19
diff --git a/multi_byte.go b/multi_byte.go
@@ -16,11 +16,12 @@ type charDecoder interface {
 	DecodeOneChar([]byte) (c uint16, remain []byte, err error)
 }
 
-func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
+func (r *recognizerMultiByte) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	return recognizerOutput{
 		Charset:    r.charset,
 		Language:   r.language,
 		Confidence: r.matchConfidence(input),
+		order:      order,
 	}
 }
 

diff --git a/recognizer.go b/recognizer.go
@@ -1,7 +1,7 @@
 package chardet
 
 type recognizer interface {
-	Match(*recognizerInput) recognizerOutput
+	Match(*recognizerInput, int) recognizerOutput
 }
 
 type recognizerOutput Result

diff --git a/single_byte.go b/single_byte.go
@@ -9,7 +9,7 @@ type recognizerSingleByte struct {
 	ngram            *[64]uint32
 }
 
-func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
+func (r *recognizerSingleByte) Match(input *recognizerInput, order int) recognizerOutput {
 	var charset string = r.charset
 	if input.hasC1Bytes && len(r.hasC1ByteCharset) > 0 {
 		charset = r.hasC1ByteCharset
@@ -18,6 +18,7 @@ func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
 		Charset:    charset,
 		Language:   r.language,
 		Confidence: r.parseNgram(input.input),
+		order:      order,
 	}
 }
 

diff --git a/unicode.go b/unicode.go
@@ -18,9 +18,10 @@ func newRecognizer_utf16be() *recognizerUtf16be {
 	return &recognizerUtf16be{}
 }
 
-func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
+func (*recognizerUtf16be) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: "UTF-16BE",
+		order:   order,
 	}
 	if bytes.HasPrefix(input.raw, utf16beBom) {
 		output.Confidence = 100
@@ -35,9 +36,10 @@ func newRecognizer_utf16le() *recognizerUtf16le {
 	return &recognizerUtf16le{}
 }
 
-func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
+func (*recognizerUtf16le) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: "UTF-16LE",
+		order:   order,
 	}
 	if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
 		output.Confidence = 100
@@ -75,9 +77,10 @@ func newRecognizer_utf32le() *recognizerUtf32 {
 	}
 }
 
-func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
+func (r *recognizerUtf32) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: r.name,
+		order:   order,
 	}
 	hasBom := bytes.HasPrefix(input.raw, r.bom)
 	var numValid, numInvalid uint32

diff --git a/utf8.go b/utf8.go
@@ -13,9 +13,10 @@ func newRecognizer_utf8() *recognizerUtf8 {
 	return &recognizerUtf8{}
 }
 
-func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
+func (*recognizerUtf8) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: "UTF-8",
+		order:   order,
 	}
 	hasBom := bytes.HasPrefix(input.raw, utf8Bom)
 	inputLen := len(input.raw)
Original file line number	Diff line number	Diff line change
		@@ -1 +1,3 @@
		module github.com/gogs/chardet

		go 1.19