Skip to content

Enforce order for results with same confidence #10

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion 2022.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ type recognizer2022 struct {
escapes [][]byte
}

// Match builds the recognizerOutput for this recognizer: Charset is r.charset
// and Confidence is whatever r.matchConfidence returns for input.input.
//
// NOTE(review): this is a diff hunk with the +/- markers stripped. The first
// signature line below looks like the removed version and the second like its
// replacement (the body's use of order only compiles with the second).
func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
func (r *recognizer2022) Match(input *recognizerInput, order int) (output recognizerOutput) {
return recognizerOutput{
Charset: r.charset,
Confidence: r.matchConfidence(input.input),
// order is copied through unchanged from the caller-supplied value.
order: order,
}
}

Expand Down
25 changes: 15 additions & 10 deletions detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ type Result struct {
Language string
// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
Confidence int

// used for sorting internally
order int
}

// Detector implements charset detection.
Expand Down Expand Up @@ -87,13 +90,13 @@ var (
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
input := newRecognizerInput(b, d.stripTag)
outputChan := make(chan recognizerOutput)
for _, r := range d.recognizers {
go matchHelper(r, input, outputChan)
for i, r := range d.recognizers {
go matchHelper(r, input, outputChan, i)
}
var output Result
for i := 0; i < len(d.recognizers); i++ {
o := <-outputChan
if output.Confidence < o.Confidence {
if output.Confidence < o.Confidence || (output.Confidence == o.Confidence && o.order < output.order) {
output = Result(o)
}
}
Expand All @@ -107,8 +110,8 @@ func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
input := newRecognizerInput(b, d.stripTag)
outputChan := make(chan recognizerOutput)
for _, r := range d.recognizers {
go matchHelper(r, input, outputChan)
for i, r := range d.recognizers {
go matchHelper(r, input, outputChan, i)
}
outputs := make(recognizerOutputs, 0, len(d.recognizers))
for i := 0; i < len(d.recognizers); i++ {
Expand Down Expand Up @@ -136,12 +139,14 @@ func (d *Detector) DetectAll(b []byte) ([]Result, error) {
return dedupOutputs, nil
}

// matchHelper runs r.Match on input and sends the resulting recognizerOutput
// on outputChan. The callers visible in this diff start it with `go`, one
// call per entry of d.recognizers, and pass that entry's slice index as order.
//
// NOTE(review): this is a diff hunk with the +/- markers stripped. The first
// signature/body pair looks like the removed version; the second pair, which
// threads order through to Match, looks like its replacement.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
outputChan <- r.Match(input)
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput, order int) {
outputChan <- r.Match(input, order)
}

// recognizerOutputs is a slice of recognizerOutput with Len, Less and Swap
// methods (the method set required by sort.Interface; presumably it is
// sorted with package sort — the call site is not visible in this view).
//
// NOTE(review): this is a diff hunk with the +/- markers stripped. The first
// Len/Less/Swap trio looks like the removed version and the second trio like
// its replacement; only Less differs in behavior.
type recognizerOutputs []recognizerOutput

// Pre-change methods: Less compares Confidence only, higher first.
func (r recognizerOutputs) Len() int { return len(r) }
func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
// Post-change methods: Less still puts higher Confidence first, and breaks
// ties on equal Confidence by putting the smaller order value first.
func (r recognizerOutputs) Len() int { return len(r) }
func (r recognizerOutputs) Less(i, j int) bool {
return r[i].Confidence > r[j].Confidence || (r[i].Confidence == r[j].Confidence && r[i].order < r[j].order)
}
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
25 changes: 24 additions & 1 deletion detector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@ package chardet_test

import (
"bytes"
"github.com/gogs/chardet"
"io"
"os"
"path/filepath"
"testing"

"github.com/gogs/chardet"
)

func TestDetector(t *testing.T) {
Expand Down Expand Up @@ -58,6 +59,28 @@ func TestDetector(t *testing.T) {
t.Errorf("Expected language %s, actual %s", d.Language, result.Language)
}
}

// "ノエル" Shift JIS encoded
test := []byte("\x83m\x83G\x83\x8b")

result, err := textDetector.DetectAll(test)
if err != nil {
t.Fatal(err)
}
if len(result) != 3 {
t.Errorf("Expected 3 results, actual %d", len(result))
}
if result[0].Charset != "Shift_JIS" || result[1].Charset != "GB18030" || result[2].Charset != "Big5" {
t.Errorf("DetectAll order is wrong: %v", result)
}

singleResult, err := textDetector.DetectBest(test)
if err != nil {
t.Fatal(err)
}
if singleResult.Charset != "Shift_JIS" {
t.Errorf("DetectBest result is wrong: %v", singleResult)
}
}

func BenchmarkDetectBest(b *testing.B) {
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
module github.com/gogs/chardet

go 1.19
3 changes: 2 additions & 1 deletion multi_byte.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ type charDecoder interface {
DecodeOneChar([]byte) (c uint16, remain []byte, err error)
}

// Match builds the recognizerOutput for this recognizer: Charset and Language
// come from r.charset and r.language, and Confidence is whatever
// r.matchConfidence returns for input.
//
// NOTE(review): this is a diff hunk with the +/- markers stripped. The first
// signature line below looks like the removed version and the second like its
// replacement (the body's use of order only compiles with the second).
func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
func (r *recognizerMultiByte) Match(input *recognizerInput, order int) (output recognizerOutput) {
return recognizerOutput{
Charset: r.charset,
Language: r.language,
Confidence: r.matchConfidence(input),
// order is copied through unchanged from the caller-supplied value.
order: order,
}
}

Expand Down
2 changes: 1 addition & 1 deletion recognizer.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package chardet

// recognizer is the interface satisfied by every charset recognizer: Match
// inspects a recognizerInput and returns a recognizerOutput.
//
// NOTE(review): this is a diff hunk with the +/- markers stripped. The first
// Match line looks like the removed method and the second like its
// replacement; the added int is the order value that the implementations in
// this diff store in the output's order field.
type recognizer interface {
Match(*recognizerInput) recognizerOutput
Match(*recognizerInput, int) recognizerOutput
}

// recognizerOutput is a defined type whose underlying type is Result, so a
// value converts directly with Result(o), as DetectBest does.
type recognizerOutput Result
Expand Down
3 changes: 2 additions & 1 deletion single_byte.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ type recognizerSingleByte struct {
ngram *[64]uint32
}

func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
func (r *recognizerSingleByte) Match(input *recognizerInput, order int) recognizerOutput {
var charset string = r.charset
if input.hasC1Bytes && len(r.hasC1ByteCharset) > 0 {
charset = r.hasC1ByteCharset
Expand All @@ -18,6 +18,7 @@ func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
Charset: charset,
Language: r.language,
Confidence: r.parseNgram(input.input),
order: order,
}
}

Expand Down
9 changes: 6 additions & 3 deletions unicode.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ func newRecognizer_utf16be() *recognizerUtf16be {
return &recognizerUtf16be{}
}

func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
func (*recognizerUtf16be) Match(input *recognizerInput, order int) (output recognizerOutput) {
output = recognizerOutput{
Charset: "UTF-16BE",
order: order,
}
if bytes.HasPrefix(input.raw, utf16beBom) {
output.Confidence = 100
Expand All @@ -35,9 +36,10 @@ func newRecognizer_utf16le() *recognizerUtf16le {
return &recognizerUtf16le{}
}

func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
func (*recognizerUtf16le) Match(input *recognizerInput, order int) (output recognizerOutput) {
output = recognizerOutput{
Charset: "UTF-16LE",
order: order,
}
if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
output.Confidence = 100
Expand Down Expand Up @@ -75,9 +77,10 @@ func newRecognizer_utf32le() *recognizerUtf32 {
}
}

func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
func (r *recognizerUtf32) Match(input *recognizerInput, order int) (output recognizerOutput) {
output = recognizerOutput{
Charset: r.name,
order: order,
}
hasBom := bytes.HasPrefix(input.raw, r.bom)
var numValid, numInvalid uint32
Expand Down
3 changes: 2 additions & 1 deletion utf8.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ func newRecognizer_utf8() *recognizerUtf8 {
return &recognizerUtf8{}
}

func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
func (*recognizerUtf8) Match(input *recognizerInput, order int) (output recognizerOutput) {
output = recognizerOutput{
Charset: "UTF-8",
order: order,
}
hasBom := bytes.HasPrefix(input.raw, utf8Bom)
inputLen := len(input.raw)
Expand Down