pauldreik · pauldreik · Feb 1, 2025 · Jan 13, 2025 · Jan 22, 2025 · Jan 23, 2025
diff --git a/Fileinfo.cc b/Fileinfo.cc
@@ -24,7 +24,8 @@
 
 int
 Fileinfo::fillwithbytes(enum readtobuffermode filltype,
-                        enum readtobuffermode lasttype)
+                        enum readtobuffermode lasttype,
+                        std::vector<char>& buffer)
 {
 
   // Decide if we are going to read from file or not.
@@ -80,11 +81,10 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype,
   if (checksumtype != Checksum::checksumtypes::NOTSET) {
     Checksum chk(checksumtype);
 
-    char buffer[4096];
     while (f1) {
-      f1.read(buffer, sizeof(buffer));
+      f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
       // gcount is never negative, the cast is safe.
-      chk.update(static_cast<std::size_t>(f1.gcount()), buffer);
+      chk.update(static_cast<std::size_t>(f1.gcount()), buffer.data());
     }
 
     // store the result of the checksum calculation in somebytes

diff --git a/Fileinfo.hh b/Fileinfo.hh
@@ -10,6 +10,7 @@
 #include <array>
 #include <cstdint>
 #include <string>
+#include <vector>
 
 // os specific headers
 #include <sys/types.h> //for off_t and others.
@@ -135,10 +136,13 @@ public:
    * is shorter than the length of the bytes field.
    * @param filltype
    * @param lasttype
+   * @param buffer will be used as a scratch buffer - provided from the outside
+   * to avoid having to reallocate it for each file
    * @return zero on success
    */
   int fillwithbytes(enum readtobuffermode filltype,
-                    enum readtobuffermode lasttype);
+                    enum readtobuffermode lasttype,
+                    std::vector<char>& buffer);
 
   /// get a pointer to the bytes read from the file
   const char* getbyteptr() const { return m_somebytes.data(); }

diff --git a/Makefile.am b/Makefile.am
@@ -18,7 +18,8 @@ TESTS=testcases/largefilesupport.sh \
       testcases/verify_deterministic_operation.sh \
       testcases/checksum_options.sh \
       testcases/md5collisions.sh \
-      testcases/sha1collisions.sh
+      testcases/sha1collisions.sh \
+      testcases/checksum_buffersize.sh
 
 AUXFILES=testcases/common_funcs.sh \
          testcases/md5collisions/letter_of_rec.ps \

diff --git a/Rdutil.cc b/Rdutil.cc
@@ -542,15 +542,18 @@ Rdutil::saveablespace(std::ostream& out) const
 int
 Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type,
                       enum Fileinfo::readtobuffermode lasttype,
-                      const long nsecsleep)
+                      const long nsecsleep,
+                      const std::size_t buffersize)
 {
   // first sort on inode (to read efficiently from the hard drive)
   sortOnDeviceAndInode();
 
   const auto duration = std::chrono::nanoseconds{ nsecsleep };
 
+  std::vector<char> buffer(buffersize, '\0');
+
   for (auto& elem : m_list) {
-    elem.fillwithbytes(type, lasttype);
+    elem.fillwithbytes(type, lasttype, buffer);
     if (nsecsleep > 0) {
       std::this_thread::sleep_for(duration);
     }

diff --git a/Rdutil.hh b/Rdutil.hh
@@ -88,9 +88,9 @@ public:
   // if there is trouble with too much disk reading, sleeping for nsecsleep
   // nanoseconds can be made between each file.
   int fillwithbytes(enum Fileinfo::readtobuffermode type,
-                    enum Fileinfo::readtobuffermode lasttype =
-                      Fileinfo::readtobuffermode::NOT_DEFINED,
-                    long nsecsleep = 0);
+                    enum Fileinfo::readtobuffermode lasttype,
+                    long nsecsleep,
+                    std::size_t buffersize);
 
   /// make symlinks of duplicates.
   std::size_t makesymlinks(bool dryrun) const;

diff --git a/rdfind.1 b/rdfind.1
@@ -80,6 +80,12 @@ is true.
 What type of checksum to be used: md5, sha1, sha256 or sha512. The default is
 sha1 since version 1.4.0.
 .TP
+.BR \-buffersize " " \fIN\fR
+Chunksize in bytes when calculating the checksum
+for files, smaller or bigger can improve performance
+dependent on filesystem and checksum algorithm.
+The default is 1 MiB, the maximum allowed is 128MiB (inclusive).
+.TP
 .BR \-deterministic " " \fItrue\fR|\fIfalse\fR
 If set (the default), sort files of equal rank in an unspecified but
 deterministic order. This makes the behaviour independent of in which

diff --git a/rdfind.cc b/rdfind.cc
@@ -39,8 +39,9 @@ int current_cmdline_index = 0;
 static void
 usage()
 {
+  const auto indent = "                                  ";
   std::cout
-    << "Usage: " << "rdfind [options] FILE ...\n"
+    << "Usage: rdfind [options] FILE ...\n"
     << '\n'
     << "Finds duplicate files recursively in the given FILEs (directories),\n"
     << "and takes appropriate action (by default, nothing).\n"
@@ -64,6 +65,9 @@ usage()
        "device and inode\n"
     << " -checksum           md5 |(sha1)| sha256 | sha512\n"
     << "                                  checksum type\n"
+    << " -buffersize N\n"
+    << indent << "chunksize in bytes when calculating the checksum.\n"
+    << indent << "The default is 1 MiB, can be up to 128 MiB.\n"
     << " -deterministic    (true)| false  makes results independent of order\n"
     << "                                  from listing the filesystem\n"
     << " -makesymlinks      true |(false) replace duplicate files with "
@@ -74,7 +78,7 @@ usage()
     << " -outputname  name  sets the results file name to \"name\" "
        "(default results.txt)\n"
     << " -deleteduplicates  true |(false) delete duplicate files\n"
-    << " -sleep              Xms          sleep for X milliseconds between "
+    << " -sleep             Xms          sleep for X milliseconds between "
        "file reads.\n"
     << "                                  Default is 0. Only a few values\n"
     << "                                  are supported; 0,1-5,10,25,50,100\n"
@@ -109,6 +113,7 @@ struct Options
   bool usesha256 = false;    // use sha256 checksum to check for similarity
   bool usesha512 = false;    // use sha512 checksum to check for similarity
   bool deterministic = true; // be independent of filesystem order
+  std::size_t buffersize = 1 << 20; // chunksize to use when reading files
   long nsecsleep = 0; // number of nanoseconds to sleep between each file read.
   std::string resultsfile = "results.txt"; // results file name.
 };
@@ -183,6 +188,19 @@ parseOptions(Parser& parser)
                   << parser.get_parsed_string() << "\"\n";
         std::exit(EXIT_FAILURE);
       }
+    } else if (parser.try_parse_string("-buffersize")) {
+      const long buffersize = std::stoll(parser.get_parsed_string());
+      constexpr long max_buffersize = 128 << 20;
+      if (buffersize <= 0) {
+        std::cerr << "a negative or zero buffersize is not allowed\n";
+        std::exit(EXIT_FAILURE);
+      } else if (buffersize > max_buffersize) {
+        std::cerr << "a maximum of " << (max_buffersize >> 20)
+                  << " MiB buffersize is allowed, got " << (buffersize >> 20)
+                  << " MiB\n";
+        std::exit(EXIT_FAILURE);
+      }
+      o.buffersize = static_cast<std::size_t>(buffersize);
     } else if (parser.try_parse_string("-sleep")) {
       const auto nextarg = std::string(parser.get_parsed_string());
       if (nextarg == "1ms") {
@@ -382,7 +400,7 @@ main(int narg, const char* argv[])
               << it->second << ": " << std::flush;
 
     // read bytes (destroys the sorting, for disk reading efficiency)
-    gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep);
+    gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep, o.buffersize);
 
     // remove non-duplicates
     std::cout << "removed " << gswd.removeUniqSizeAndBuffer()

diff --git a/testcases/checksum_buffersize.sh b/testcases/checksum_buffersize.sh
@@ -0,0 +1,45 @@
+#!/bin/sh
+# Test that selection of buffersizes works as expected.
+
+set -e
+. "$(dirname "$0")/common_funcs.sh"
+
+reset_teststate
+
+TEST_DIR=buffersizes_test
+mkdir -p "$TEST_DIR"
+
+make_test_files() {
+    dbgecho "creating test files in $TEST_DIR"
+    head -c 1000000 /dev/zero >"$TEST_DIR/a"
+    cp "$TEST_DIR/a" "$TEST_DIR/b"
+    cp "$TEST_DIR/a" "$TEST_DIR/c"
+    cp "$TEST_DIR/a" "$TEST_DIR/d"
+    cp "$TEST_DIR/a" "$TEST_DIR/e"
+}
+
+dbgecho "check so all buffersizes behave the same"
+
+# disables only run once shellscheck
+# shellcheck disable=SC2043
+for checksumtype in sha256; do
+    i=1
+    while :; do
+        if [ $i -gt 128 ]; then
+            break
+        fi
+        i="$((i*2))"
+        make_test_files
+        dbgecho "testing buffersize $((i*1024))"
+        dbgecho "testing $checksumtype"
+        # Fix this properly by making rdfind to array and use "${rdfind[@]}"
+        # this requires bash not sh
+        # shellcheck disable=SC2086
+        $rdfind -buffersize $((i*1024)) -checksum "$checksumtype" -deleteduplicates true "$TEST_DIR" >/dev/null
+        [ -e "$TEST_DIR/a" ]
+        [ ! -e "$TEST_DIR/b" ]
+        [ ! -e "$TEST_DIR/c" ]
+        [ ! -e "$TEST_DIR/d" ]
+        [ ! -e "$TEST_DIR/e" ]
+    done
+done
diff --git a/testcases/checksum_buffersize_speedtest.sh b/testcases/checksum_buffersize_speedtest.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+# Performance test for checksumming with different buffersizes. Not meant
+# to be run for regular testing.
+
+set -e
+. "$(dirname "$0")/common_funcs.sh"
+
+reset_teststate
+
+TEST_DIR=buffersizes_speedtest
+mkdir -p "$TEST_DIR"
+
+make_test_files() {
+    dbgecho "creating test files in $TEST_DIR/bigfiles"
+    mkdir -p "$TEST_DIR/bigfiles"
+    head -c $((1024*1024*500)) /dev/zero >"$TEST_DIR/bigfiles/a"
+    for f in b c d e; do
+      cp "$TEST_DIR/bigfiles/a" "$TEST_DIR/bigfiles/$f"
+    done
+    dbgecho "creating test files in $TEST_DIR/smallfiles"
+    mkdir -p "$TEST_DIR/smallfiles"
+    (cd "$TEST_DIR/smallfiles"; head -c100000000 /dev/zero |split --bytes 1000)
+}
+
+dbgecho "run speed test for all shecksums and buffersizes"
+
+make_test_files
+
+cat /dev/null >"$TEST_DIR/results.tsv"
+for filesize in big small; do
+for checksumtype in md5 sha1; do
+    i=1
+    while :; do
+        if [ $i -gt 4096 ]; then
+            break
+        fi
+        # Fix this properly by making rdfind to array and use "${rdfind[@]}"
+        # this requires bash not sh
+        # shellcheck disable=SC2086
+        dbgecho "testing $checksumtype $i kB buffersize"
+        # shellcheck disable=SC2086
+        /usr/bin/time --append --output=$TEST_DIR/results.tsv -f "$filesize\t$i\t$checksumtype\t%e\t%M\t%C" $rdfind -buffersize $((i*1024)) -checksum "$checksumtype" -dryrun true -deleteduplicates true "$TEST_DIR/${filesize}files" >/dev/null 2>&1
+        i="$((i*2))"
+done
+done
+done
+cat "$TEST_DIR/results.tsv"
+