Add SetLimit function

It allows setting the maximum size of the buffer used when detecting MIME type.
gabriel-vasile · Feb 26, 2021 · b1ae8bf · b1ae8bf
1 parent 96a50b9
commit b1ae8bf
Show file tree

Hide file tree

Showing 5 changed files with 42 additions and 28 deletions.
diff --git a/internal/matchers/matchers.go b/internal/matchers/matchers.go
@@ -1,10 +1,6 @@
 // Package matchers holds the matching functions used to find MIME types.
 package matchers
 
-// ReadLimit is the maximum number of bytes read from the input when detecting
-// from a reader or from a file.
-const ReadLimit = 3072
-
 // trimLWS trims whitespace from beginning of the input.
 func trimLWS(in []byte) []byte {
 	firstNonWS := 0

diff --git a/internal/matchers/text.go b/internal/matchers/text.go
@@ -194,11 +194,7 @@ func Php(in []byte) bool {
 
 // Json matches a JavaScript Object Notation file.
 func Json(in []byte) bool {
-	parsed, err := json.Scan(in)
-	if len(in) < ReadLimit {
-		return err == nil
-	}
-
+	parsed, _ := json.Scan(in)
 	return parsed == len(in)
 }
 
@@ -270,6 +266,7 @@ func NdJson(in []byte) bool {
 
 	// Total bytes scanned.
 	parsed := 0
+	lenin := len(in)
 
 	// Split by `srn`.
 	for rni, insrn := range bytes.Split(in, srn) {
@@ -291,14 +288,14 @@ func NdJson(in []byte) bool {
 			}
 			p, err := json.Scan(insn)
 			parsed += p
-			if parsed < ReadLimit && err != nil {
+			if parsed < lenin && err != nil {
 				return false
 			}
 		}
 	}
 
 	// Empty inputs should not pass as valid NDJSON with 0 lines.
-	return parsed > 0 && parsed == len(in)
+	return parsed > 0 && parsed == lenin
 }
 
 // Js matches a Javascript file.

diff --git a/internal/matchers/text_csv.go b/internal/matchers/text_csv.go
@@ -17,7 +17,7 @@ func Tsv(in []byte) bool {
 }
 
 func sv(in []byte, comma rune) bool {
-	r := csv.NewReader(butLastLineReader(in, ReadLimit))
+	r := csv.NewReader(butLastLineReader(in, len(in)))
 	r.Comma = comma
 	r.TrimLeadingSpace = true
 	r.LazyQuotes = true

diff --git a/mime.go b/mime.go
@@ -33,7 +33,7 @@ func (m *MIME) Extension() string {
 // For example, the application/json and text/html MIME types have text/plain as
 // their parent because they are text files who happen to contain JSON or HTML.
 // Another example is the ZIP format, which is used as container
-// for Microsoft Office files, EPUB files, JAR files and others.
+// for Microsoft Office files, EPUB files, JAR files, and others.
 func (m *MIME) Parent() *MIME {
 	return m.parent
 }
@@ -92,6 +92,7 @@ func (m *MIME) match(in []byte) *MIME {
 	return m
 }
 
+// flatten transforms an hierarchy of MIMEs into a slice of MIMEs.
 func (m *MIME) flatten() []*MIME {
 	out := []*MIME{m}
 	for _, c := range m.children {

diff --git a/mimetype.go b/mimetype.go
@@ -9,19 +9,23 @@ package mimetype
 
 import (
 	"io"
+	"io/ioutil"
 	"mime"
 	"os"
-
-	"github.com/gabriel-vasile/mimetype/internal/matchers"
+	"sync/atomic"
 )
 
+// readLimit is the maximum number of bytes from the input used when detecting.
+var readLimit uint64 = 3072
+
 // Detect returns the MIME type found from the provided byte slice.
 //
 // The result is always a valid MIME type, with application/octet-stream
 // returned when identification failed.
 func Detect(in []byte) *MIME {
-	if len(in) > matchers.ReadLimit {
-		in = in[:matchers.ReadLimit]
+	l := atomic.LoadUint64(&readLimit)
+	if l > 0 && len(in) > int(l) {
+		in = in[:l]
 	}
 	return root.match(in)
 }
@@ -35,16 +39,27 @@ func Detect(in []byte) *MIME {
 // DetectReader assumes the reader offset is at the start. If the input
 // is a ReadSeeker you read from before, it should be rewinded before detection:
 //  reader.Seek(0, io.SeekStart)
-//
-// To prevent loading entire files into memory, DetectReader reads at most
-// matchers.ReadLimit bytes from the reader.
 func DetectReader(r io.Reader) (*MIME, error) {
-	in := make([]byte, matchers.ReadLimit)
-	n, err := io.ReadFull(r, in)
-	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
-		return root, err
+	l := atomic.LoadUint64(&readLimit)
+	var in []byte
+	var err error
+
+	if l == 0 {
+		in, err = ioutil.ReadAll(r)
+		if err != nil {
+			return root, err
+		}
+	} else {
+		// io.UnexpectedEOF means len(r) < len(in). It is not an error in this case,
+		// it just means the input file is smaller than the allocated bytes slice.
+		n := 0
+		in = make([]byte, l)
+		n, err = io.ReadFull(r, in)
+		if err != nil && err != io.ErrUnexpectedEOF {
+			return root, err
+		}
+		in = in[:n]
 	}
-	in = in[:n]
 
 	return Detect(in), nil
 }
@@ -54,9 +69,6 @@ func DetectReader(r io.Reader) (*MIME, error) {
 // The result is always a valid MIME type, with application/octet-stream
 // returned when identification failed with or without an error.
 // Any error returned is related to the opening and reading from the input file.
-//
-// To prevent loading entire files into memory, DetectFile reads at most
-// matchers.ReadLimit bytes from the input file.
 func DetectFile(file string) (*MIME, error) {
 	f, err := os.Open(file)
 	if err != nil {
@@ -82,3 +94,11 @@ func EqualsAny(s string, mimes ...string) bool {
 
 	return false
 }
+
+// SetLimit sets the maximum number of bytes read from input when detecting the MIME type.
+// Increasing the limit provides better detection for file formats which store
+// their magical numbers towards the end of the file.
+// A limit of 0 means the whole input file will be used.
+func SetLimit(limit uint64) {
+	atomic.StoreUint64(&readLimit, limit)
+}