From 745df843e104e782ef1da99270370c438cd238b9 Mon Sep 17 00:00:00 2001 From: Tim Scheuermann Date: Fri, 7 Oct 2022 19:11:19 +0200 Subject: [PATCH 1/7] Implemented resumable downloads --- services/slack/download.go | 138 +++++++++++++++++++++++++++++++++ services/slack/intermediate.go | 35 +-------- 2 files changed, 142 insertions(+), 31 deletions(-) create mode 100644 services/slack/download.go diff --git a/services/slack/download.go b/services/slack/download.go new file mode 100644 index 00000000..78f95c2c --- /dev/null +++ b/services/slack/download.go @@ -0,0 +1,138 @@ +package slack + +import ( + "bytes" + "errors" + "fmt" + "io" + "log" + "net/http" + "os" +) + +const defaultOverlap int64 = 512 + +var ErrOverlapNotEqual = errors.New("the downloaded file doesn't match the one on disk") + +func downloadInto(name, url string, size int64) error { + file, err := os.OpenFile(name, os.O_RDWR|os.O_CREATE, 0660) + if err != nil { + return err + } + defer file.Close() + + return resumeDownload(file, size, url) +} + +func resumeDownload(existing *os.File, size int64, downloadURL string) error { + existingSize, overlap, err := calculateSize(existing, size) + if err != nil || existingSize == size { + return err + } + + start := existingSize - overlap + req, err := createRequest(downloadURL, start) + if err != nil { + return err + } + + if start != 0 { + log.Printf("Resuming download from %s\n", humanSize(start)) + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + switch resp.StatusCode { + case http.StatusPartialContent: + // do nothing, everything is fine + case http.StatusOK: + // server doesn't support Range + overlap = 0 + existing.Truncate(0) + default: + return fmt.Errorf("download failed with status %q", resp.Status) + } + + if overlap != 0 { + err = checkOverlap(existing, resp.Body, overlap) + if err != nil { + return err + } + } + + _, err = existing.Seek(0, io.SeekEnd) + if err != nil { + return err + } + + _, err = io.Copy(existing, resp.Body) + return err +} + +func checkOverlap(existing io.ReadSeeker, download io.Reader, overlap int64) error { + bufW := make([]byte, overlap) + bufL := make([]byte, overlap) + + _, err := io.ReadFull(download, bufW) + if err != nil { + return err + } + + _, err = existing.Seek(-overlap, io.SeekEnd) + if err != nil { + return err + } + + _, err = io.ReadFull(existing, bufL) + if err != nil { + return err + } + + if !bytes.Equal(bufW, bufL) { + return ErrOverlapNotEqual + } + + return nil +} + +func calculateSize(existing *os.File, size int64) (existingSize, overlap int64, err error) { + info, err := existing.Stat() + if err != nil { + return 0, 0, err + } + + existingSize = info.Size() + if existingSize == size { + return existingSize, 0, nil + } + if existingSize > size { + err = existing.Truncate(0) + if err != nil { + return 0, 0, err + } + existingSize = 0 + } + + overlap = defaultOverlap + if overlap > existingSize { + overlap = existingSize + } + + return existingSize, overlap, nil +} + +func createRequest(url string, start int64) (*http.Request, error) { + req, err := http.NewRequest(http.MethodGet, url, nil) + if err != nil { + return nil, err + } + + req.Header.Set("User-Agent", "mmetl/1.0") + req.Header.Set("Range", fmt.Sprintf("bytes=%d-", start)) + + return req, nil +} diff --git a/services/slack/intermediate.go b/services/slack/intermediate.go index d426b454..3876e2c9 100644 --- a/services/slack/intermediate.go +++ b/services/slack/intermediate.go @@ -5,7 +5,6 @@ import ( "encoding/json" "fmt" "io" - "net/http" "os" "path" "sort" @@ -342,39 +341,13 @@ func addFileToPost(file *SlackFile, uploads map[string]*zip.File, post *Intermed func addDownloadToPost(file *SlackFile, post *IntermediatePost, attachmentsDir string) error { destFilePath := getNormalisedFilePath(file, attachmentsInternal) - log.Printf("Downloading %q to %q...\n", file.DownloadURL, destFilePath) - fullFilePath := path.Join(attachmentsDir, destFilePath) - // TODO(noxer): Use the file info to resume incomplete downloads - if info, err := os.Stat(fullFilePath); err == nil { - if info.Size() == file.Size { - log.Println("File has already been downloaded, skipping...") - return nil - } - } - - destFile, err := os.Create(fullFilePath) - if err != nil { - return errors.Wrapf(err, "failed to create file %s in the attachments directory", file.Id) - } - defer destFile.Close() + log.Printf("Downloading %q into %q...\n", file.DownloadURL, destFilePath) - resp, err := http.Get(file.DownloadURL) + err := downloadInto(fullFilePath, file.DownloadURL, file.Size) if err != nil { - return errors.Wrap(err, "unable to request file") - } - defer resp.Body.Close() - - log.Printf("Download size is %s\n", humanSize(resp.ContentLength)) - - if resp.StatusCode != http.StatusOK { - return errors.Errorf("error requesting %q: %q", file.DownloadURL, resp.Status) - } - - _, err = io.Copy(destFile, resp.Body) - if err != nil { - return errors.Wrap(err, "error downloading file") + return err } log.Println("Download successful!") @@ -493,7 +466,7 @@ func (t *Transformer) AddAttachmentsToPost(post *SlackPost, newPost *Intermediat return props, propsByteArray } -//func (t *Transformer) TransformPosts(slackExport *SlackExport, attachmentsDir string, skipAttachments, discardInvalidProps bool) error { +// func (t *Transformer) TransformPosts(slackExport *SlackExport, attachmentsDir string, skipAttachments, discardInvalidProps bool) error { func (t *Transformer) TransformPosts(slackExport *SlackExport, attachmentsDir string, skipAttachments, discardInvalidProps, allowDownload bool) error { t.Logger.Info("Transforming posts") From 828c995598dd4e8121e0653c7e1689e7309bd8f5 Mon Sep 17 00:00:00 2001 From: Tim Scheuermann Date: Fri, 7 Oct 2022 19:25:03 +0200 Subject: [PATCH 2/7] Fix unchecked error --- services/slack/download.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/slack/download.go b/services/slack/download.go index 78f95c2c..ca63b221 100644 --- a/services/slack/download.go +++ b/services/slack/download.go @@ -52,7 +52,9 @@ func resumeDownload(existing *os.File, size int64, downloadURL string) error { case http.StatusOK: // server doesn't support Range overlap = 0 - existing.Truncate(0) + if err = existing.Truncate(0); err != nil { + return err + } default: return fmt.Errorf("download failed with status %q", resp.Status) } From 8c0ff4b587c8637627a512352648e51b6629e9ef Mon Sep 17 00:00:00 2001 From: Tim Scheuermann Date: Tue, 8 Nov 2022 13:46:51 +0100 Subject: [PATCH 3/7] Addressed comments --- services/slack/download.go | 23 +++++++++++++++++++---- services/slack/intermediate.go | 1 - 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/services/slack/download.go b/services/slack/download.go index ca63b221..93ac99bf 100644 --- a/services/slack/download.go +++ b/services/slack/download.go @@ -14,8 +14,19 @@ const defaultOverlap int64 = 512 var ErrOverlapNotEqual = errors.New("the downloaded file doesn't match the one on disk") -func downloadInto(name, url string, size int64) error { - file, err := os.OpenFile(name, os.O_RDWR|os.O_CREATE, 0660) +// downloadInto downloads the contents of a URL into a file. If the file already exists it +// will resume the download. To prevent corrupting the files it downloads a tiny bit of +// overlapping data (512 byte) and compares it to the existing file: +// +// [-----existing local file-----] +// [-------resumed download-------] +// [overlap] +// +// When the check fails, the function returns an error and doesn't silently re-download +// the whole file. If the server doesn't support resumable downloads, the existing file will +// be truncated and re-downloaded. +func downloadInto(filename, url string, size int64) error { + file, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE, 0660) if err != nil { return err } @@ -26,11 +37,15 @@ func downloadInto(name, url string, size int64) error { func resumeDownload(existing *os.File, size int64, downloadURL string) error { existingSize, overlap, err := calculateSize(existing, size) - if err != nil || existingSize == size { + if err != nil { return err } + if existingSize == size { + // the file has already been downloaded + return nil + } - start := existingSize - overlap + start := existingSize - overlap // calculateSize makes sure this can't be negative req, err := createRequest(downloadURL, start) if err != nil { return err diff --git a/services/slack/intermediate.go b/services/slack/intermediate.go index 3876e2c9..4e3f3a54 100644 --- a/services/slack/intermediate.go +++ b/services/slack/intermediate.go @@ -466,7 +466,6 @@ func (t *Transformer) AddAttachmentsToPost(post *SlackPost, newPost *Intermediat return props, propsByteArray } -// func (t *Transformer) TransformPosts(slackExport *SlackExport, attachmentsDir string, skipAttachments, discardInvalidProps bool) error { func (t *Transformer) TransformPosts(slackExport *SlackExport, attachmentsDir string, skipAttachments, discardInvalidProps, allowDownload bool) error { t.Logger.Info("Transforming posts") From ea6e8ed6b8f56ce5dfefb2c6f9cb906d071f87a7 Mon Sep 17 00:00:00 2001 From: Tim Scheuermann Date: Tue, 8 Nov 2022 14:33:25 +0100 Subject: [PATCH 4/7] Added individual error messages for the different errors to simplify tracking problems --- services/slack/download.go | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/services/slack/download.go b/services/slack/download.go index 93ac99bf..a4f3db4c 100644 --- a/services/slack/download.go +++ b/services/slack/download.go @@ -12,7 +12,7 @@ import ( const defaultOverlap int64 = 512 -var ErrOverlapNotEqual = errors.New("the downloaded file doesn't match the one on disk") +var ErrOverlapNotEqual = errors.New("download: the downloaded file doesn't match the one on disk") // downloadInto downloads the contents of a URL into a file. If the file already exists it // will resume the download. To prevent corrupting the files it downloads a tiny bit of @@ -28,7 +28,7 @@ var ErrOverlapNotEqual = errors.New("the downloaded file doesn't match the one o func downloadInto(filename, url string, size int64) error { file, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE, 0660) if err != nil { - return err + return fmt.Errorf("download: error opening the destination file: %w", err) } defer file.Close() @@ -57,7 +57,7 @@ func resumeDownload(existing *os.File, size int64, downloadURL string) error { resp, err := http.DefaultClient.Do(req) if err != nil { - return err + return fmt.Errorf("download: error during HTTP request: %w", err) } defer resp.Body.Close() @@ -68,10 +68,10 @@ func resumeDownload(existing *os.File, size int64, downloadURL string) error { // server doesn't support Range overlap = 0 if err = existing.Truncate(0); err != nil { - return err + return fmt.Errorf("download: error emptying file for re-download: %w", err) } default: - return fmt.Errorf("download failed with status %q", resp.Status) + return fmt.Errorf("download: HTTP request failed with status %q", resp.Status) } if overlap != 0 { @@ -83,11 +83,11 @@ func resumeDownload(existing *os.File, size int64, downloadURL string) error { _, err = existing.Seek(0, io.SeekEnd) if err != nil { - return err + return fmt.Errorf("download: error seeking to the end of the existing file: %w", err) } _, err = io.Copy(existing, resp.Body) - return err + return fmt.Errorf("download: error during download: %w", err) } func checkOverlap(existing io.ReadSeeker, download io.Reader, overlap int64) error { @@ -96,17 +96,17 @@ func checkOverlap(existing io.ReadSeeker, download io.Reader, overlap int64) err _, err := io.ReadFull(download, bufW) if err != nil { - return err + return fmt.Errorf("download: error downloading the overlapping data: %w", err) } _, err = existing.Seek(-overlap, io.SeekEnd) if err != nil { - return err + return fmt.Errorf("download: error seeking to the start of the existing overlap: %w", err) } _, err = io.ReadFull(existing, bufL) if err != nil { - return err + return fmt.Errorf("download: error reading the local overlapping data: %w", err) } if !bytes.Equal(bufW, bufL) { @@ -119,7 +119,7 @@ func checkOverlap(existing io.ReadSeeker, download io.Reader, overlap int64) err func calculateSize(existing *os.File, size int64) (existingSize, overlap int64, err error) { info, err := existing.Stat() if err != nil { - return 0, 0, err + return 0, 0, fmt.Errorf("download: error reading file info: %w", err) } existingSize = info.Size() @@ -129,7 +129,7 @@ func calculateSize(existing *os.File, size int64) (existingSize, overlap int64, if existingSize > size { err = existing.Truncate(0) if err != nil { - return 0, 0, err + return 0, 0, fmt.Errorf("download: error emptying file: %w", err) } existingSize = 0 } @@ -145,7 +145,7 @@ func calculateSize(existing *os.File, size int64) (existingSize, overlap int64, func createRequest(url string, start int64) (*http.Request, error) { req, err := http.NewRequest(http.MethodGet, url, nil) if err != nil { - return nil, err + return nil, fmt.Errorf("download: error creating HTTP request: %w", err) } req.Header.Set("User-Agent", "mmetl/1.0") From ac543d6cfcaa3758784408f5d2d2b0629c22b684 Mon Sep 17 00:00:00 2001 From: Tim Scheuermann Date: Tue, 8 Nov 2022 15:39:07 +0100 Subject: [PATCH 5/7] Small bugfix --- services/slack/download.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/services/slack/download.go b/services/slack/download.go index a4f3db4c..e732bcd2 100644 --- a/services/slack/download.go +++ b/services/slack/download.go @@ -87,7 +87,11 @@ func resumeDownload(existing *os.File, size int64, downloadURL string) error { } _, err = io.Copy(existing, resp.Body) - return fmt.Errorf("download: error during download: %w", err) + if err != nil { + return fmt.Errorf("download: error during download: %w", err) + } + + return nil } func checkOverlap(existing io.ReadSeeker, download io.Reader, overlap int64) error { From cc074d33ff54269b51c216bd759332aff1fa997f Mon Sep 17 00:00:00 2001 From: Tim Scheuermann Date: Tue, 8 Nov 2022 15:39:20 +0100 Subject: [PATCH 6/7] Added tests for resumable downloads --- services/slack/download_test.go | 198 ++++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 services/slack/download_test.go diff --git a/services/slack/download_test.go b/services/slack/download_test.go new file mode 100644 index 00000000..effac6a4 --- /dev/null +++ b/services/slack/download_test.go @@ -0,0 +1,198 @@ +package slack + +import ( + "math/rand" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strconv" + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +var mockData []byte + +func TestDownload(t *testing.T) { + // set up the test + initializeMockData() + srv, old := mockDefaultHTTPClient() + defer func() { + srv.Close() + http.DefaultClient = old + }() + + // run the idividual tests + t.Run("successful download", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + + require.NoError(t, downloadInto(fileName, srv.URL+"/no_resume", int64(len(mockData)))) + tempFile, _ := os.ReadFile(fileName) + require.Equal(t, mockData, tempFile) + }) + + t.Run("successful resume, empty file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, []byte{}, 0660)) + + require.NoError(t, downloadInto(fileName, srv.URL+"/resume", int64(len(mockData)))) + tempFile, _ := os.ReadFile(fileName) + require.Equal(t, mockData, tempFile) + }) + + t.Run("successful resume, tiny file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, mockData[:8], 0660)) + + require.NoError(t, downloadInto(fileName, srv.URL+"/resume", int64(len(mockData)))) + tempFile, _ := os.ReadFile(fileName) + require.Equal(t, mockData, tempFile) + }) + + t.Run("successful resume, half file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, mockData[:1024*512], 0660)) + + require.NoError(t, downloadInto(fileName, srv.URL+"/resume", int64(len(mockData)))) + tempFile, _ := os.ReadFile(fileName) + require.Equal(t, mockData, tempFile) + }) + + t.Run("successful resume, full file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, mockData, 0660)) + + require.NoError(t, downloadInto(fileName, srv.URL+"/resume", int64(len(mockData)))) + tempFile, _ := os.ReadFile(fileName) + require.Equal(t, mockData, tempFile) + }) + + t.Run("successful re-download, empty file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, []byte{}, 0660)) + + require.NoError(t, downloadInto(fileName, srv.URL+"/no_resume", int64(len(mockData)))) + tempFile, _ := os.ReadFile(fileName) + require.Equal(t, mockData, tempFile) + }) + + t.Run("successful re-download, tiny file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, mockData[:8], 0660)) + + require.NoError(t, downloadInto(fileName, srv.URL+"/no_resume", int64(len(mockData)))) + tempFile, _ := os.ReadFile(fileName) + require.Equal(t, mockData, tempFile) + }) + + t.Run("successful re-download, half file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, mockData[:1024*512], 0660)) + + require.NoError(t, downloadInto(fileName, srv.URL+"/no_resume", int64(len(mockData)))) + tempFile, _ := os.ReadFile(fileName) + require.Equal(t, mockData, tempFile) + }) + + t.Run("successful re-download, full file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, mockData, 0660)) + + require.NoError(t, downloadInto(fileName, srv.URL+"/no_resume", int64(len(mockData)))) + tempFile, _ := os.ReadFile(fileName) + require.Equal(t, mockData, tempFile) + }) + + t.Run("unsuccessful resume, tiny file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, mockData[:8], 0660)) + + require.Error(t, downloadInto(fileName, srv.URL+"/wrong_resume", int64(len(mockData)))) + }) + + t.Run("unsuccessful resume, half file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, mockData[:1024*512], 0660)) + + require.Error(t, downloadInto(fileName, srv.URL+"/wrong_resume", int64(len(mockData)))) + }) + + t.Run("successful resume from wrong file with an already downloaded file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, mockData, 0660)) + + require.NoError(t, downloadInto(fileName, srv.URL+"/wrong_resume", int64(len(mockData)))) + tempFile, _ := os.ReadFile(fileName) + require.Equal(t, mockData, tempFile) + }) + + t.Run("unknown file", func(t *testing.T) { + fileName := filepath.Join(os.TempDir(), "download-test") + defer os.Remove(fileName) + require.NoError(t, os.WriteFile(fileName, mockData[:1024*512], 0660)) + + require.Error(t, downloadInto(fileName, srv.URL+"/wrong_path", int64(len(mockData)))) + }) +} + +func mockDefaultHTTPClient() (newServer *httptest.Server, oldClient *http.Client) { + mux := http.NewServeMux() + + mux.HandleFunc("/no_resume", func(w http.ResponseWriter, r *http.Request) { + w.Write(mockData) + }) + + mux.HandleFunc("/resume", func(w http.ResponseWriter, r *http.Request) { + rangeHeader := r.Header.Get("Range") + if rangeHeader == "" { + w.Write(mockData) + return + } + + from, _ := strconv.ParseInt(strings.TrimPrefix(strings.TrimRight(rangeHeader, "-"), "bytes="), 10, 64) + + w.WriteHeader(http.StatusPartialContent) + w.Write(mockData[from:]) + }) + + mux.HandleFunc("/wrong_resume", func(w http.ResponseWriter, r *http.Request) { + wrongData := make([]byte, 1024*1024) + rand.Read(wrongData) // read different "random" data + + rangeHeader := r.Header.Get("Range") + if rangeHeader == "" { + w.Write(wrongData) + return + } + + from, _ := strconv.ParseInt(strings.TrimPrefix(strings.TrimRight(rangeHeader, "-"), "bytes="), 10, 64) + + w.WriteHeader(http.StatusPartialContent) + w.Write(wrongData[from:]) + }) + + newServer = httptest.NewServer(mux) + oldClient = http.DefaultClient + http.DefaultClient = newServer.Client() + + return +} + +func initializeMockData() { + mockData = make([]byte, 1024*1024) // 1 MiB of "random" data + rand.Read(mockData) +} From 29edf277b44951614d562b6628ecba710fbc4323 Mon Sep 17 00:00:00 2001 From: Tim Scheuermann Date: Tue, 8 Nov 2022 15:43:32 +0100 Subject: [PATCH 7/7] Make linter happy --- services/slack/download_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/services/slack/download_test.go b/services/slack/download_test.go index effac6a4..6cff27c2 100644 --- a/services/slack/download_test.go +++ b/services/slack/download_test.go @@ -153,20 +153,20 @@ func mockDefaultHTTPClient() (newServer *httptest.Server, oldClient *http.Client mux := http.NewServeMux() mux.HandleFunc("/no_resume", func(w http.ResponseWriter, r *http.Request) { - w.Write(mockData) + _, _ = w.Write(mockData) }) mux.HandleFunc("/resume", func(w http.ResponseWriter, r *http.Request) { rangeHeader := r.Header.Get("Range") if rangeHeader == "" { - w.Write(mockData) + _, _ = w.Write(mockData) return } from, _ := strconv.ParseInt(strings.TrimPrefix(strings.TrimRight(rangeHeader, "-"), "bytes="), 10, 64) w.WriteHeader(http.StatusPartialContent) - w.Write(mockData[from:]) + _, _ = w.Write(mockData[from:]) }) mux.HandleFunc("/wrong_resume", func(w http.ResponseWriter, r *http.Request) { @@ -175,14 +175,14 @@ func mockDefaultHTTPClient() (newServer *httptest.Server, oldClient *http.Client rangeHeader := r.Header.Get("Range") if rangeHeader == "" { - w.Write(wrongData) + _, _ = w.Write(wrongData) return } from, _ := strconv.ParseInt(strings.TrimPrefix(strings.TrimRight(rangeHeader, "-"), "bytes="), 10, 64) w.WriteHeader(http.StatusPartialContent) - w.Write(wrongData[from:]) + _, _ = w.Write(wrongData[from:]) }) newServer = httptest.NewServer(mux)