Skip to content

Commit

Permalink
pdr.utils.datamgmt.checksum_of(): provide buf size as opt. arg
Browse files Browse the repository at this point in the history
  • Loading branch information
RayPlante committed Nov 29, 2023
1 parent 9976501 commit e135e6b
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 3 deletions.
14 changes: 11 additions & 3 deletions python/nistoar/pdr/utils/datamgmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,23 @@ def build_mime_type_map(filelist):
update_mimetypes_from_file(out, file)
return out

def checksum_of(filepath, bufsize: int = 10240000):
    """
    return the SHA-256 checksum for the given file, as a hexadecimal string

    The file is read in chunks of at most ``bufsize`` bytes so that its whole
    content never has to be held in memory at once.

    :param str|Path filepath:  the path of the file to calculate the checksum for
    :param int bufsize:  the memory buffer size, in bytes, to use when reading
                         the file.  The default is 10 MB; multithreaded
                         applications should consider a smaller value.
    :return:  the hex-encoded SHA-256 digest of the file's contents
    :rtype: str
    :raises TypeError:   if bufsize is not an integer
    :raises ValueError:  if bufsize is less than 1
    """
    # Validate up front: read(0) returns b"" immediately (which would yield the
    # digest of an empty file) and a negative size makes read() slurp the
    # whole file, defeating the purpose of the buffer.
    if not isinstance(bufsize, int):
        raise TypeError("checksum_of(): bufsize arg must be an integer")
    if bufsize < 1:
        raise ValueError("checksum_of(): bufsize arg must be a positive integer")

    digest = hashlib.sha256()
    with open(filepath, mode='rb') as fd:
        # exactly one read per iteration; an empty bytes result marks EOF
        for chunk in iter(lambda: fd.read(bufsize), b""):
            digest.update(chunk)
    return digest.hexdigest()
Expand Down
17 changes: 17 additions & 0 deletions python/tests/nistoar/pdr/utils/test_datamgmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,23 @@ def test_checksum_of(self):
self.assertEqual(utils.checksum_of(dfile), self.syssum(dfile))
dfile = os.path.join(testdatadir2,"trial3/trial3a.json")
self.assertEqual(utils.checksum_of(dfile), self.syssum(dfile))
dfile = os.path.join(testdatadir3,"3A1EE2F169DD3B8CE0531A570681DB5D1491.json")
self.assertEqual(utils.checksum_of(dfile), self.syssum(dfile))

def test_bufsize(self):
    """
    checksum_of() must return the same digest whatever buffer size is used,
    and must reject buffer sizes that are non-positive or not integers.
    """
    datafile = os.path.join(testdatadir3, "3A1EE2F169DD3B8CE0531A570681DB5D1491.json")
    expected = self.syssum(datafile)

    # the result must not depend on how the file is chunked
    for size in (10240, 1024, 10, 1):
        self.assertEqual(utils.checksum_of(datafile, size), expected)

    # zero and negative sizes are rejected as bad values
    for nonpositive in (0, -20):
        with self.assertRaises(ValueError):
            utils.checksum_of(datafile, nonpositive)

    # a numeric string is not accepted in place of an integer
    with self.assertRaises(TypeError):
        utils.checksum_of(datafile, "1024")

def syssum(self, filepath):
cmd = ["sha256sum", filepath]
Expand Down

0 comments on commit e135e6b

Please sign in to comment.