From eedb297150c7f681216611c50ed96a122ac18053 Mon Sep 17 00:00:00 2001
From: xWTF <e@x.wtf>
Date: Mon, 30 Jan 2023 16:50:17 +0800
Subject: [PATCH] Enforce order for results with same confidence

---
 2022.go          |  3 ++-
 detector.go      | 25 +++++++++++++++----------
 detector_test.go | 25 ++++++++++++++++++++++++-
 go.mod           |  2 ++
 multi_byte.go    |  3 ++-
 recognizer.go    |  2 +-
 single_byte.go   |  3 ++-
 unicode.go       |  9 ++++++---
 utf8.go          |  3 ++-
 9 files changed, 56 insertions(+), 19 deletions(-)

diff --git a/2022.go b/2022.go
index e667225..e1186f2 100644
--- a/2022.go
+++ b/2022.go
@@ -9,10 +9,11 @@ type recognizer2022 struct {
 	escapes [][]byte
 }
 
-func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
+func (r *recognizer2022) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	return recognizerOutput{
 		Charset:    r.charset,
 		Confidence: r.matchConfidence(input.input),
+		order:      order,
 	}
 }
 
diff --git a/detector.go b/detector.go
index 027e1c3..e0a9e19 100644
--- a/detector.go
+++ b/detector.go
@@ -14,6 +14,9 @@ type Result struct {
 	Language string
 	// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
 	Confidence int
+
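+	// order is the index of the producing recognizer in Detector.recognizers; it breaks ties between equal Confidence values (lower index first).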
+	order int
 }
 
 // Detector implements charset detection.
@@ -87,13 +90,13 @@ var (
 func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
 	input := newRecognizerInput(b, d.stripTag)
 	outputChan := make(chan recognizerOutput)
-	for _, r := range d.recognizers {
-		go matchHelper(r, input, outputChan)
+	for i, r := range d.recognizers {
+		go matchHelper(r, input, outputChan, i)
 	}
 	var output Result
 	for i := 0; i < len(d.recognizers); i++ {
 		o := <-outputChan
-		if output.Confidence < o.Confidence {
+		if output.Confidence < o.Confidence || (output.Confidence == o.Confidence && o.order < output.order) {
 			output = Result(o)
 		}
 	}
@@ -107,8 +110,8 @@ func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
 func (d *Detector) DetectAll(b []byte) ([]Result, error) {
 	input := newRecognizerInput(b, d.stripTag)
 	outputChan := make(chan recognizerOutput)
-	for _, r := range d.recognizers {
-		go matchHelper(r, input, outputChan)
+	for i, r := range d.recognizers {
+		go matchHelper(r, input, outputChan, i)
 	}
 	outputs := make(recognizerOutputs, 0, len(d.recognizers))
 	for i := 0; i < len(d.recognizers); i++ {
@@ -136,12 +139,14 @@ func (d *Detector) DetectAll(b []byte) ([]Result, error) {
 	return dedupOutputs, nil
 }
 
-func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
-	outputChan <- r.Match(input)
+func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput, order int) {
+	outputChan <- r.Match(input, order)
 }
 
 type recognizerOutputs []recognizerOutput
 
-func (r recognizerOutputs) Len() int           { return len(r) }
-func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
-func (r recognizerOutputs) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
+func (r recognizerOutputs) Len() int { return len(r) }
+func (r recognizerOutputs) Less(i, j int) bool {
+	return r[i].Confidence > r[j].Confidence || (r[i].Confidence == r[j].Confidence && r[i].order < r[j].order)
+}
+func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
diff --git a/detector_test.go b/detector_test.go
index d085ff1..157fa89 100644
--- a/detector_test.go
+++ b/detector_test.go
@@ -2,11 +2,12 @@ package chardet_test
 
 import (
 	"bytes"
-	"github.com/gogs/chardet"
 	"io"
 	"os"
 	"path/filepath"
 	"testing"
+
+	"github.com/gogs/chardet"
 )
 
 func TestDetector(t *testing.T) {
@@ -58,6 +59,28 @@ func TestDetector(t *testing.T) {
 			t.Errorf("Expected language %s, actual %s", d.Language, result.Language)
 		}
 	}
+
+	// "ノエル" Shift JIS encoded; candidates presumably tie on confidence, so the order asserted below must be stable.
+	test := []byte("\x83m\x83G\x83\x8b")
+
+	result, err := textDetector.DetectAll(test)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(result) != 3 {
+		t.Fatalf("Expected 3 results, actual %d", len(result)) // fatal: result[0..2] is indexed right below
+	}
+	if result[0].Charset != "Shift_JIS" || result[1].Charset != "GB18030" || result[2].Charset != "Big5" {
+		t.Errorf("DetectAll order is wrong: %v", result)
+	}
+
+	singleResult, err := textDetector.DetectBest(test) // must agree with the first DetectAll entry
+	if err != nil {
+		t.Fatal(err)
+	}
+	if singleResult.Charset != "Shift_JIS" {
+		t.Errorf("DetectBest result is wrong: %v", singleResult)
+	}
 }
 
 func BenchmarkDetectBest(b *testing.B) {
diff --git a/go.mod b/go.mod
index d8653b8..95b5896 100644
--- a/go.mod
+++ b/go.mod
@@ -1 +1,3 @@
 module github.com/gogs/chardet
+
+go 1.19
diff --git a/multi_byte.go b/multi_byte.go
index b5cdf3d..6b7e7e2 100644
--- a/multi_byte.go
+++ b/multi_byte.go
@@ -16,11 +16,12 @@ type charDecoder interface {
 	DecodeOneChar([]byte) (c uint16, remain []byte, err error)
 }
 
-func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
+func (r *recognizerMultiByte) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	return recognizerOutput{
 		Charset:    r.charset,
 		Language:   r.language,
 		Confidence: r.matchConfidence(input),
+		order:      order,
 	}
 }
 
diff --git a/recognizer.go b/recognizer.go
index 1bf8461..70ebcf3 100644
--- a/recognizer.go
+++ b/recognizer.go
@@ -1,7 +1,7 @@
 package chardet
 
 type recognizer interface {
-	Match(*recognizerInput) recognizerOutput
+	Match(*recognizerInput, int) recognizerOutput
 }
 
 type recognizerOutput Result
diff --git a/single_byte.go b/single_byte.go
index a7ce39b..3aa323e 100644
--- a/single_byte.go
+++ b/single_byte.go
@@ -9,7 +9,7 @@ type recognizerSingleByte struct {
 	ngram            *[64]uint32
 }
 
-func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
+func (r *recognizerSingleByte) Match(input *recognizerInput, order int) recognizerOutput {
 	var charset string = r.charset
 	if input.hasC1Bytes && len(r.hasC1ByteCharset) > 0 {
 		charset = r.hasC1ByteCharset
@@ -18,6 +18,7 @@ func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
 		Charset:    charset,
 		Language:   r.language,
 		Confidence: r.parseNgram(input.input),
+		order:      order,
 	}
 }
 
diff --git a/unicode.go b/unicode.go
index 6f9fa9e..478d7cf 100644
--- a/unicode.go
+++ b/unicode.go
@@ -18,9 +18,10 @@ func newRecognizer_utf16be() *recognizerUtf16be {
 	return &recognizerUtf16be{}
 }
 
-func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
+func (*recognizerUtf16be) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: "UTF-16BE",
+		order:   order,
 	}
 	if bytes.HasPrefix(input.raw, utf16beBom) {
 		output.Confidence = 100
@@ -35,9 +36,10 @@ func newRecognizer_utf16le() *recognizerUtf16le {
 	return &recognizerUtf16le{}
 }
 
-func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
+func (*recognizerUtf16le) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: "UTF-16LE",
+		order:   order,
 	}
 	if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
 		output.Confidence = 100
@@ -75,9 +77,10 @@ func newRecognizer_utf32le() *recognizerUtf32 {
 	}
 }
 
-func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
+func (r *recognizerUtf32) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: r.name,
+		order:   order,
 	}
 	hasBom := bytes.HasPrefix(input.raw, r.bom)
 	var numValid, numInvalid uint32
diff --git a/utf8.go b/utf8.go
index ae036ad..db83f4f 100644
--- a/utf8.go
+++ b/utf8.go
@@ -13,9 +13,10 @@ func newRecognizer_utf8() *recognizerUtf8 {
 	return &recognizerUtf8{}
 }
 
-func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
+func (*recognizerUtf8) Match(input *recognizerInput, order int) (output recognizerOutput) {
 	output = recognizerOutput{
 		Charset: "UTF-8",
+		order:   order,
 	}
 	hasBom := bytes.HasPrefix(input.raw, utf8Bom)
 	inputLen := len(input.raw)