Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add buffersize argument to rdfind (slightly modified version of trollkarlen's MR) #180

Merged
merged 3 commits into from
Feb 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Fileinfo.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@

int
Fileinfo::fillwithbytes(enum readtobuffermode filltype,
enum readtobuffermode lasttype)
enum readtobuffermode lasttype,
std::vector<char>& buffer)
{

// Decide if we are going to read from file or not.
Expand Down Expand Up @@ -80,11 +81,10 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype,
if (checksumtype != Checksum::checksumtypes::NOTSET) {
Checksum chk(checksumtype);

char buffer[4096];
while (f1) {
f1.read(buffer, sizeof(buffer));
f1.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
// gcount is never negative, the cast is safe.
chk.update(static_cast<std::size_t>(f1.gcount()), buffer);
chk.update(static_cast<std::size_t>(f1.gcount()), buffer.data());
}

// store the result of the checksum calculation in somebytes
Expand Down
6 changes: 5 additions & 1 deletion Fileinfo.hh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <array>
#include <cstdint>
#include <string>
#include <vector>

// os specific headers
#include <sys/types.h> //for off_t and others.
Expand Down Expand Up @@ -135,10 +136,13 @@ public:
* is shorter than the length of the bytes field.
* @param filltype
* @param lasttype
* @param buffer will be used as a scratch buffer - provided from the outside
* to avoid having to reallocate it for each file
* @return zero on success
*/
int fillwithbytes(enum readtobuffermode filltype,
enum readtobuffermode lasttype);
enum readtobuffermode lasttype,
std::vector<char>& buffer);

/// get a pointer to the bytes read from the file
const char* getbyteptr() const { return m_somebytes.data(); }
Expand Down
3 changes: 2 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ TESTS=testcases/largefilesupport.sh \
testcases/verify_deterministic_operation.sh \
testcases/checksum_options.sh \
testcases/md5collisions.sh \
testcases/sha1collisions.sh
testcases/sha1collisions.sh \
testcases/checksum_buffersize.sh

AUXFILES=testcases/common_funcs.sh \
testcases/md5collisions/letter_of_rec.ps \
Expand Down
7 changes: 5 additions & 2 deletions Rdutil.cc
Original file line number Diff line number Diff line change
Expand Up @@ -542,15 +542,18 @@ Rdutil::saveablespace(std::ostream& out) const
int
Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type,
enum Fileinfo::readtobuffermode lasttype,
const long nsecsleep)
const long nsecsleep,
const std::size_t buffersize)
{
// first sort on inode (to read efficiently from the hard drive)
sortOnDeviceAndInode();

const auto duration = std::chrono::nanoseconds{ nsecsleep };

std::vector<char> buffer(buffersize, '\0');

for (auto& elem : m_list) {
elem.fillwithbytes(type, lasttype);
elem.fillwithbytes(type, lasttype, buffer);
if (nsecsleep > 0) {
std::this_thread::sleep_for(duration);
}
Expand Down
6 changes: 3 additions & 3 deletions Rdutil.hh
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ public:
// if there is trouble with too much disk reading, sleeping for nsecsleep
// nanoseconds can be made between each file.
int fillwithbytes(enum Fileinfo::readtobuffermode type,
enum Fileinfo::readtobuffermode lasttype =
Fileinfo::readtobuffermode::NOT_DEFINED,
long nsecsleep = 0);
enum Fileinfo::readtobuffermode lasttype,
long nsecsleep,
std::size_t buffersize);

/// make symlinks of duplicates.
std::size_t makesymlinks(bool dryrun) const;
Expand Down
6 changes: 6 additions & 0 deletions rdfind.1
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ is true.
What type of checksum to be used: md5, sha1, sha256 or sha512. The default is
sha1 since version 1.4.0.
.TP
.BR \-buffersize " " \fIN\fR
Chunk size in bytes used when calculating the checksum
of files. A smaller or bigger value can improve performance,
depending on the filesystem and checksum algorithm.
The default is 1 MiB; the maximum allowed is 128 MiB (inclusive).
.TP
.BR \-deterministic " " \fItrue\fR|\fIfalse\fR
If set (the default), sort files of equal rank in an unspecified but
deterministic order. This makes the behaviour independent of in which
Expand Down
24 changes: 21 additions & 3 deletions rdfind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ int current_cmdline_index = 0;
static void
usage()
{
const auto indent = " ";
std::cout
<< "Usage: " << "rdfind [options] FILE ...\n"
<< "Usage: rdfind [options] FILE ...\n"
<< '\n'
<< "Finds duplicate files recursively in the given FILEs (directories),\n"
<< "and takes appropriate action (by default, nothing).\n"
Expand All @@ -64,6 +65,9 @@ usage()
"device and inode\n"
<< " -checksum md5 |(sha1)| sha256 | sha512\n"
<< " checksum type\n"
<< " -buffersize N\n"
<< indent << "chunksize in bytes when calculating the checksum.\n"
<< indent << "The default is 1 MiB, can be up to 128 MiB.\n"
<< " -deterministic (true)| false makes results independent of order\n"
<< " from listing the filesystem\n"
<< " -makesymlinks true |(false) replace duplicate files with "
Expand All @@ -74,7 +78,7 @@ usage()
<< " -outputname name sets the results file name to \"name\" "
"(default results.txt)\n"
<< " -deleteduplicates true |(false) delete duplicate files\n"
<< " -sleep Xms sleep for X milliseconds between "
<< " -sleep Xms sleep for X milliseconds between "
"file reads.\n"
<< " Default is 0. Only a few values\n"
<< " are supported; 0,1-5,10,25,50,100\n"
Expand Down Expand Up @@ -109,6 +113,7 @@ struct Options
bool usesha256 = false; // use sha256 checksum to check for similarity
bool usesha512 = false; // use sha512 checksum to check for similarity
bool deterministic = true; // be independent of filesystem order
std::size_t buffersize = 1 << 20; // chunksize to use when reading files
long nsecsleep = 0; // number of nanoseconds to sleep between each file read.
std::string resultsfile = "results.txt"; // results file name.
};
Expand Down Expand Up @@ -183,6 +188,19 @@ parseOptions(Parser& parser)
<< parser.get_parsed_string() << "\"\n";
std::exit(EXIT_FAILURE);
}
} else if (parser.try_parse_string("-buffersize")) {
const long buffersize = std::stoll(parser.get_parsed_string());
constexpr long max_buffersize = 128 << 20;
if (buffersize <= 0) {
std::cerr << "a negative or zero buffersize is not allowed\n";
std::exit(EXIT_FAILURE);
} else if (buffersize > max_buffersize) {
std::cerr << "a maximum of " << (max_buffersize >> 20)
<< " MiB buffersize is allowed, got " << (buffersize >> 20)
<< " MiB\n";
std::exit(EXIT_FAILURE);
}
o.buffersize = static_cast<std::size_t>(buffersize);
} else if (parser.try_parse_string("-sleep")) {
const auto nextarg = std::string(parser.get_parsed_string());
if (nextarg == "1ms") {
Expand Down Expand Up @@ -382,7 +400,7 @@ main(int narg, const char* argv[])
<< it->second << ": " << std::flush;

// read bytes (destroys the sorting, for disk reading efficiency)
gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep);
gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep, o.buffersize);

// remove non-duplicates
std::cout << "removed " << gswd.removeUniqSizeAndBuffer()
Expand Down
45 changes: 45 additions & 0 deletions testcases/checksum_buffersize.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/sh
# Test that selection of buffersizes works as expected.

set -e
. "$(dirname "$0")/common_funcs.sh"

reset_teststate

TEST_DIR=buffersizes_test
mkdir -p "$TEST_DIR"

# Populate $TEST_DIR with five identical files (a, b, c, d, e), each
# consisting of 1000000 zero bytes. "a" is written first and the others are
# copies of it.
make_test_files() {
  dbgecho "creating test files in $TEST_DIR"
  head -c 1000000 /dev/zero >"$TEST_DIR/a"
  for copy in b c d e; do
    cp "$TEST_DIR/a" "$TEST_DIR/$copy"
  done
}

dbgecho "check so all buffersizes behave the same"

# Only a single checksum type is listed, so the outer loop runs exactly once;
# silence shellcheck's "this loop will only ever run once" warning.
# shellcheck disable=SC2043
for checksumtype in sha256; do
  # i is the buffer size in KiB. It takes the values 1, 2, 4, ..., 128 and is
  # doubled at the END of the loop body, so that the first size (1 KiB) is
  # actually tested and the upper bound in the loop condition is respected.
  i=1
  while [ "$i" -le 128 ]; do
    # each run below deletes the duplicates, so recreate the files every round
    make_test_files
    dbgecho "testing buffersize $((i*1024))"
    dbgecho "testing $checksumtype"
    # Fix this properly by making rdfind to array and use "${rdfind[@]}"
    # this requires bash not sh
    # shellcheck disable=SC2086
    $rdfind -buffersize $((i*1024)) -checksum "$checksumtype" -deleteduplicates true "$TEST_DIR" >/dev/null
    # exactly one of the identical files must remain; with "set -e" any
    # failing check aborts the test
    [ -e "$TEST_DIR/a" ]
    [ ! -e "$TEST_DIR/b" ]
    [ ! -e "$TEST_DIR/c" ]
    [ ! -e "$TEST_DIR/d" ]
    [ ! -e "$TEST_DIR/e" ]
    i=$((i*2))
  done
done
48 changes: 48 additions & 0 deletions testcases/checksum_buffersize_speedtest.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/sh
# Performance test for checksumming with different buffersizes. Not meant
# to be run for regular testing.

set -e
. "$(dirname "$0")/common_funcs.sh"

reset_teststate

TEST_DIR=buffersizes_speedtest
mkdir -p "$TEST_DIR"

# Create the input data for the speed test below $TEST_DIR:
#  - bigfiles/:   five identical files (a-e) of 500 MiB of zero bytes each
#  - smallfiles/: 100000000 zero bytes cut into pieces of 1000 bytes by split
make_test_files() {
  dbgecho "creating test files in $TEST_DIR/bigfiles"
  mkdir -p "$TEST_DIR/bigfiles"
  head -c $((500*1024*1024)) /dev/zero >"$TEST_DIR/bigfiles/a"
  cp "$TEST_DIR/bigfiles/a" "$TEST_DIR/bigfiles/b"
  cp "$TEST_DIR/bigfiles/a" "$TEST_DIR/bigfiles/c"
  cp "$TEST_DIR/bigfiles/a" "$TEST_DIR/bigfiles/d"
  cp "$TEST_DIR/bigfiles/a" "$TEST_DIR/bigfiles/e"

  dbgecho "creating test files in $TEST_DIR/smallfiles"
  mkdir -p "$TEST_DIR/smallfiles"
  # run in a subshell so the change of directory does not leak to the caller
  (
    cd "$TEST_DIR/smallfiles"
    head -c100000000 /dev/zero | split --bytes 1000
  )
}

dbgecho "run speed test for all checksums and buffersizes"

make_test_files

# Start from an empty results file; /usr/bin/time appends one line per run.
# Columns (tab separated): file set, buffer size in KiB, checksum type,
# followed by time's %e, %M and %C fields.
: >"$TEST_DIR/results.tsv"
for filesize in big small; do
  for checksumtype in md5 sha1; do
    # i is the buffer size in KiB: 1, 2, 4, ..., 4096
    i=1
    while [ "$i" -le 4096 ]; do
      dbgecho "testing $checksumtype $i kB buffersize"
      # Fix this properly by making rdfind to array and use "${rdfind[@]}"
      # this requires bash not sh
      # shellcheck disable=SC2086
      /usr/bin/time --append --output="$TEST_DIR/results.tsv" -f "$filesize\t$i\t$checksumtype\t%e\t%M\t%C" $rdfind -buffersize $((i*1024)) -checksum "$checksumtype" -dryrun true -deleteduplicates true "$TEST_DIR/${filesize}files" >/dev/null 2>&1
      i=$((i*2))
    done
  done
done
cat "$TEST_DIR/results.tsv"

Loading