Skip to content

Commit

Permalink
Add SetLimit function
Browse files Browse the repository at this point in the history
It allows setting the maximum size of the buffer used when detecting
MIME type.
  • Loading branch information
gabriel-vasile committed Feb 26, 2021
1 parent 96a50b9 commit b1ae8bf
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 28 deletions.
4 changes: 0 additions & 4 deletions internal/matchers/matchers.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
// Package matchers holds the matching functions used to find MIME types.
package matchers

// ReadLimit is the maximum number of bytes read from the input when detecting
// from a reader or from a file.
const ReadLimit = 3072

// trimLWS trims whitespace from beginning of the input.
func trimLWS(in []byte) []byte {
firstNonWS := 0
Expand Down
11 changes: 4 additions & 7 deletions internal/matchers/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,7 @@ func Php(in []byte) bool {

// Json matches a JavaScript Object Notation file.
func Json(in []byte) bool {
parsed, err := json.Scan(in)
if len(in) < ReadLimit {
return err == nil
}

parsed, _ := json.Scan(in)
return parsed == len(in)
}

Expand Down Expand Up @@ -270,6 +266,7 @@ func NdJson(in []byte) bool {

// Total bytes scanned.
parsed := 0
lenin := len(in)

// Split by `srn`.
for rni, insrn := range bytes.Split(in, srn) {
Expand All @@ -291,14 +288,14 @@ func NdJson(in []byte) bool {
}
p, err := json.Scan(insn)
parsed += p
if parsed < ReadLimit && err != nil {
if parsed < lenin && err != nil {
return false
}
}
}

// Empty inputs should not pass as valid NDJSON with 0 lines.
return parsed > 0 && parsed == len(in)
return parsed > 0 && parsed == lenin
}

// Js matches a Javascript file.
Expand Down
2 changes: 1 addition & 1 deletion internal/matchers/text_csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ func Tsv(in []byte) bool {
}

func sv(in []byte, comma rune) bool {
r := csv.NewReader(butLastLineReader(in, ReadLimit))
r := csv.NewReader(butLastLineReader(in, len(in)))
r.Comma = comma
r.TrimLeadingSpace = true
r.LazyQuotes = true
Expand Down
3 changes: 2 additions & 1 deletion mime.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func (m *MIME) Extension() string {
// For example, the application/json and text/html MIME types have text/plain as
// their parent because they are text files that happen to contain JSON or HTML.
// Another example is the ZIP format, which is used as container
// for Microsoft Office files, EPUB files, JAR files and others.
// for Microsoft Office files, EPUB files, JAR files, and others.
func (m *MIME) Parent() *MIME {
return m.parent
}
Expand Down Expand Up @@ -92,6 +92,7 @@ func (m *MIME) match(in []byte) *MIME {
return m
}

// flatten transforms a hierarchy of MIMEs into a slice of MIMEs.
func (m *MIME) flatten() []*MIME {
out := []*MIME{m}
for _, c := range m.children {
Expand Down
50 changes: 35 additions & 15 deletions mimetype.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,23 @@ package mimetype

import (
"io"
"io/ioutil"
"mime"
"os"

"github.com/gabriel-vasile/mimetype/internal/matchers"
"sync/atomic"
)

// readLimit is the maximum number of bytes from the input used when detecting.
var readLimit uint64 = 3072

// Detect returns the MIME type found from the provided byte slice.
//
// The result is always a valid MIME type, with application/octet-stream
// returned when identification failed.
func Detect(in []byte) *MIME {
if len(in) > matchers.ReadLimit {
in = in[:matchers.ReadLimit]
l := atomic.LoadUint64(&readLimit)
if l > 0 && len(in) > int(l) {
in = in[:l]
}
return root.match(in)
}
Expand All @@ -35,16 +39,27 @@ func Detect(in []byte) *MIME {
// DetectReader assumes the reader offset is at the start. If the input
// is a ReadSeeker you read from before, it should be rewound before detection:
// reader.Seek(0, io.SeekStart)
//
// To prevent loading entire files into memory, DetectReader reads at most
// matchers.ReadLimit bytes from the reader.
func DetectReader(r io.Reader) (*MIME, error) {
in := make([]byte, matchers.ReadLimit)
n, err := io.ReadFull(r, in)
if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
return root, err
l := atomic.LoadUint64(&readLimit)
var in []byte
var err error

if l == 0 {
in, err = ioutil.ReadAll(r)
if err != nil {
return root, err
}
} else {
// io.ErrUnexpectedEOF means len(r) < len(in). It is not an error in this case,
// it just means the input file is smaller than the allocated bytes slice.
n := 0
in = make([]byte, l)
n, err = io.ReadFull(r, in)
if err != nil && err != io.ErrUnexpectedEOF {
return root, err
}
in = in[:n]
}
in = in[:n]

return Detect(in), nil
}
Expand All @@ -54,9 +69,6 @@ func DetectReader(r io.Reader) (*MIME, error) {
// The result is always a valid MIME type, with application/octet-stream
// returned when identification failed with or without an error.
// Any error returned is related to the opening and reading from the input file.
//
// To prevent loading entire files into memory, DetectFile reads at most
// matchers.ReadLimit bytes from the input file.
func DetectFile(file string) (*MIME, error) {
f, err := os.Open(file)
if err != nil {
Expand All @@ -82,3 +94,11 @@ func EqualsAny(s string, mimes ...string) bool {

return false
}

// SetLimit configures how many bytes of the input, at most, are examined
// when detecting the MIME type.
//
// A larger limit improves detection of file formats that keep their magic
// numbers towards the end of the file. A limit of 0 removes the cap, so the
// whole input is used.
//
// The new value is stored atomically.
func SetLimit(limit uint64) {
	atomic.StoreUint64(&readLimit, limit)
}

0 comments on commit b1ae8bf

Please sign in to comment.