diff --git a/2022.go b/2022.go index e667225..e1186f2 100644 --- a/2022.go +++ b/2022.go @@ -9,10 +9,11 @@ type recognizer2022 struct { escapes [][]byte } -func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) { +func (r *recognizer2022) Match(input *recognizerInput, order int) (output recognizerOutput) { return recognizerOutput{ Charset: r.charset, Confidence: r.matchConfidence(input.input), + order: order, } } diff --git a/detector.go b/detector.go index 027e1c3..e0a9e19 100644 --- a/detector.go +++ b/detector.go @@ -14,6 +14,9 @@ type Result struct { Language string // Confidence of the Result. Scale from 1 to 100. The bigger, the more confident. Confidence int + + // used for sorting internally + order int } // Detector implements charset detection. @@ -87,13 +90,13 @@ var ( func (d *Detector) DetectBest(b []byte) (r *Result, err error) { input := newRecognizerInput(b, d.stripTag) outputChan := make(chan recognizerOutput) - for _, r := range d.recognizers { - go matchHelper(r, input, outputChan) + for i, r := range d.recognizers { + go matchHelper(r, input, outputChan, i) } var output Result for i := 0; i < len(d.recognizers); i++ { o := <-outputChan - if output.Confidence < o.Confidence { + if output.Confidence < o.Confidence || (output.Confidence == o.Confidence && o.order < output.order) { output = Result(o) } } @@ -107,8 +110,8 @@ func (d *Detector) DetectBest(b []byte) (r *Result, err error) { func (d *Detector) DetectAll(b []byte) ([]Result, error) { input := newRecognizerInput(b, d.stripTag) outputChan := make(chan recognizerOutput) - for _, r := range d.recognizers { - go matchHelper(r, input, outputChan) + for i, r := range d.recognizers { + go matchHelper(r, input, outputChan, i) } outputs := make(recognizerOutputs, 0, len(d.recognizers)) for i := 0; i < len(d.recognizers); i++ { @@ -136,12 +139,14 @@ func (d *Detector) DetectAll(b []byte) ([]Result, error) { return dedupOutputs, nil } -func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) { - outputChan <- r.Match(input) +func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput, order int) { + outputChan <- r.Match(input, order) } type recognizerOutputs []recognizerOutput -func (r recognizerOutputs) Len() int { return len(r) } -func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence } -func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] } +func (r recognizerOutputs) Len() int { return len(r) } +func (r recognizerOutputs) Less(i, j int) bool { + return r[i].Confidence > r[j].Confidence || (r[i].Confidence == r[j].Confidence && r[i].order < r[j].order) +} +func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] } diff --git a/detector_test.go b/detector_test.go index d085ff1..157fa89 100644 --- a/detector_test.go +++ b/detector_test.go @@ -2,11 +2,12 @@ package chardet_test import ( "bytes" - "github.com/gogs/chardet" "io" "os" "path/filepath" "testing" + + "github.com/gogs/chardet" ) func TestDetector(t *testing.T) { @@ -58,6 +59,28 @@ func TestDetector(t *testing.T) { t.Errorf("Expected language %s, actual %s", d.Language, result.Language) } } + + // "ノエル" Shift JIS encoded + test := []byte("\x83m\x83G\x83\x8b") + + result, err := textDetector.DetectAll(test) + if err != nil { + t.Fatal(err) + } + if len(result) != 3 { + t.Errorf("Expected 3 results, actual %d", len(result)) + } + if result[0].Charset != "Shift_JIS" || result[1].Charset != "GB18030" || result[2].Charset != "Big5" { + t.Errorf("DetectAll order is wrong: %v", result) + } + + singleResult, err := textDetector.DetectBest(test) + if err != nil { + t.Fatal(err) + } + if singleResult.Charset != "Shift_JIS" { + t.Errorf("DetectBest result is wrong: %v", singleResult) + } } func BenchmarkDetectBest(b *testing.B) { diff --git a/go.mod b/go.mod index d8653b8..95b5896 100644 --- a/go.mod +++ b/go.mod @@ -1 +1,3 @@ module github.com/gogs/chardet + +go 1.19 diff --git a/multi_byte.go b/multi_byte.go index b5cdf3d..6b7e7e2 100644 --- a/multi_byte.go +++ b/multi_byte.go @@ -16,11 +16,12 @@ type charDecoder interface { DecodeOneChar([]byte) (c uint16, remain []byte, err error) } -func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) { +func (r *recognizerMultiByte) Match(input *recognizerInput, order int) (output recognizerOutput) { return recognizerOutput{ Charset: r.charset, Language: r.language, Confidence: r.matchConfidence(input), + order: order, } } diff --git a/recognizer.go b/recognizer.go index 1bf8461..70ebcf3 100644 --- a/recognizer.go +++ b/recognizer.go @@ -1,7 +1,7 @@ package chardet type recognizer interface { - Match(*recognizerInput) recognizerOutput + Match(*recognizerInput, int) recognizerOutput } type recognizerOutput Result diff --git a/single_byte.go b/single_byte.go index a7ce39b..3aa323e 100644 --- a/single_byte.go +++ b/single_byte.go @@ -9,7 +9,7 @@ type recognizerSingleByte struct { ngram *[64]uint32 } -func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput { +func (r *recognizerSingleByte) Match(input *recognizerInput, order int) recognizerOutput { var charset string = r.charset if input.hasC1Bytes && len(r.hasC1ByteCharset) > 0 { charset = r.hasC1ByteCharset @@ -18,6 +18,7 @@ func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput { Charset: charset, Language: r.language, Confidence: r.parseNgram(input.input), + order: order, } } diff --git a/unicode.go b/unicode.go index 6f9fa9e..478d7cf 100644 --- a/unicode.go +++ b/unicode.go @@ -18,9 +18,10 @@ func newRecognizer_utf16be() *recognizerUtf16be { return &recognizerUtf16be{} } -func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) { +func (*recognizerUtf16be) Match(input *recognizerInput, order int) (output recognizerOutput) { output = recognizerOutput{ Charset: "UTF-16BE", + order: order, } if bytes.HasPrefix(input.raw, utf16beBom) { output.Confidence = 100 @@ -35,9 +36,10 @@ func newRecognizer_utf16le() *recognizerUtf16le { return &recognizerUtf16le{} } -func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) { +func (*recognizerUtf16le) Match(input *recognizerInput, order int) (output recognizerOutput) { output = recognizerOutput{ Charset: "UTF-16LE", + order: order, } if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) { output.Confidence = 100 @@ -75,9 +77,10 @@ func newRecognizer_utf32le() *recognizerUtf32 { } } -func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) { +func (r *recognizerUtf32) Match(input *recognizerInput, order int) (output recognizerOutput) { output = recognizerOutput{ Charset: r.name, + order: order, } hasBom := bytes.HasPrefix(input.raw, r.bom) var numValid, numInvalid uint32 diff --git a/utf8.go b/utf8.go index ae036ad..db83f4f 100644 --- a/utf8.go +++ b/utf8.go @@ -13,9 +13,10 @@ func newRecognizer_utf8() *recognizerUtf8 { return &recognizerUtf8{} } -func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) { +func (*recognizerUtf8) Match(input *recognizerInput, order int) (output recognizerOutput) { output = recognizerOutput{ Charset: "UTF-8", + order: order, } hasBom := bytes.HasPrefix(input.raw, utf8Bom) inputLen := len(input.raw)