From 8b5d89390faa12cf91ea9c687ba6aa3896a3b6b5 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 23 Jun 2024 18:35:55 +0900 Subject: [PATCH 001/143] gfptar: new option: --memory --- gftool/gfptar/gfptar | 47 +++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 183c39cb8..dd1e7bccb 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -40,6 +40,7 @@ import traceback import unittest import hashlib import signal +import resource from docopt import docopt from schema import Schema, Use, Or @@ -317,9 +318,8 @@ class GfURLEntry(): TYPE_SYMLINK = 'SYM' TYPE_OTHER = 'OTHER' - def __init__(self, start_url, path, mode, file_type, uname, gname, + def __init__(self, path, mode, file_type, uname, gname, size, mtime, linkname): - self.start_url = start_url # GfURL self.path = path self.mode = mode self.file_type = file_type @@ -588,7 +588,7 @@ class GfURL(metaclass=abc.ABCMeta): def tar_add(self, tar, subpath, entry): tarinfo = entry.toTarinfo(subpath) - if tarinfo is None: + if tarinfo is None: # warning, skip return # hard link is not supported if entry.is_file(): @@ -966,8 +966,7 @@ class GfURLGfarm(GfURL): else: path = dirname + '/' + name # path is Gfarm URL (gfarm:/...) - start = self - yield GfURLEntry(start, path, mode, file_type, uname, gname, + yield GfURLEntry(path, mode, file_type, uname, gname, size, mtime, linkname) else: # ex. gfarm:/home/user1/dir: -> gfarm:/home/user1/dir @@ -1259,9 +1258,8 @@ class GfURLLocal(GfURL): return file_type def _toGfURLEntry(self, entry, path_only, hardlink_warn): - start = self if path_only: - return GfURLEntry(start, entry.path, 0, None, + return GfURLEntry(entry.path, 0, None, 'root', 'root', 0, 0, '') st = entry.stat(follow_symlinks=False) @@ -1272,7 +1270,7 @@ class GfURLLocal(GfURL): 'nlink=%d, inode=%d (Local): %s', st.st_nlink, st.st_ino, entry.path) linkname = self._readlink(entry.path, entry.is_symlink()) - return GfURLEntry(start, entry.path, st.st_mode, file_type, + return GfURLEntry(entry.path, st.st_mode, file_type, self.uid2name(st.st_uid), self.gid2name(st.st_gid), st.st_size, st.st_mtime, linkname) @@ -1280,8 +1278,7 @@ class GfURLLocal(GfURL): def _scandir(self, path, path_only, recursive, first, hardlink_warn): if first: st = os.stat(path, follow_symlinks=True) - start = self - yield GfURLEntry(start, path, st.st_mode, self._toFileType(st), + yield GfURLEntry(path, st.st_mode, self._toFileType(st), self.uid2name(st.st_uid), self.gid2name(st.st_gid), st.st_size, st.st_mtime, '') @@ -1510,8 +1507,8 @@ class GfTarFile(tarfile.TarFile): if post_func: close_obj.post() - def add_entry(self, subpath, entry): - entry.start_url.tar_add(self, subpath, entry) + def add_entry(self, basedir_url, subpath, entry): + basedir_url.tar_add(self, subpath, entry) class TestGfptar(unittest.TestCase): @@ -1567,7 +1564,6 @@ class GfptarCommand(Command): self.canceled = threading.Event() self.lock_init(False) self.futures = None - self.sig_init() self.hardlink_warn = True if self.quiet: GfURL.shutup_stderr() @@ -1579,6 +1575,13 @@ class GfptarCommand(Command): self.progress_enabled = self._progress_enabled() self.use_fsync = not self.opt['--disable-fsync'] + self.memory_limit = self.opt['--memory'] + if self.memory_limit is not None: + self.set_memory_limit(self.memory_limit) + + def set_memory_limit(self, max_memory): + resource.setrlimit(resource.RLIMIT_AS, (max_memory, max_memory)) + def sig_init(self): def sig_handler(signum, frame): 
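The --memory option added here caps the process address space through resource.setrlimit(). A minimal sketch of the same call, with an illustrative 512 MiB cap:

import resource

def limit_address_space(max_bytes):
    # RLIMIT_AS bounds total virtual memory; using the same value for the
    # soft and hard limits makes an oversized allocation raise MemoryError
    # instead of letting the process grow without bound.
    resource.setrlimit(resource.RLIMIT_AS, (max_bytes, max_bytes))

limit_address_space(512 * 1024 * 1024)  # illustrative cap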
print('') # new line @@ -2198,6 +2201,9 @@ class GfptarCommand(Command): self.outdir_url.create_new_dir() has_error = None + + self.sig_init() + try: if self.MT_enabled(): self.create_tars_MT() @@ -2403,7 +2409,7 @@ class GfptarCommand(Command): break subpath = entry.subpath(self.basedir_url) try: - tar.add_entry(subpath, entry) + tar.add_entry(self.basedir_url, subpath, entry) except Exception as e: with self.lock(): self.cannot_be_archived += 1 @@ -2483,6 +2489,8 @@ class GfptarCommand(Command): self.start_time = time.time() self.next_time = self.start_time + 1 + self.sig_init() + for ent in indir_url.listdir(recursive=False): if ent.path.endswith(self.LIST_SUFFIX): # ignored continue @@ -2937,6 +2945,8 @@ Options: (https://docs.python.org/3/library/codecs.html#standard-encodings) [default: utf-8] --bufsize=BYTES buffer size to copy [default: 1M] + --memory=NUM upper limit of memory size (bytes) + (default: no limit) --test test mode (-q option is recommended) --test-workdir-local=DIR local directory for test [default: /tmp] --test-workdir-gfarm=DIR Gfarm directory for test [default: gfarm:/tmp] @@ -2963,13 +2973,13 @@ Example of --create (Gfarm to Gfarm): ... F dir/test0999.data -Example of --extract (Gfarm to Gfarm): +Example of --extract (Gfarm to Local): Command line: - gfptar -x gfarm:/home/user1/out2 gfarm:/home/user1/out + gfptar -x /home/user1/out2 gfarm:/home/user1/out Output files: - gfarm:/home/user1/out2/dir/test0000.data + /home/user1/out2/dir/test0000.data ... - gfarm:/home/user1/out2/dir/test9999.data + /home/user1/out2/dir/test9999.data Limitations: - Hard links are not preserved. @@ -2995,6 +3005,7 @@ _schema = Schema({ '--gfsched-interval': Use(int), '--same-owner': bool, '--use-tqdm': bool, + '--memory': Or(Use(unhumanize_number), None), '--test': bool, '--test-workdir-local': Or(str, None), '--test-workdir-gfarm': Or(str, None), From 5e7501c13d99b43a04942ddaea055e904e880369 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 26 Jun 2024 16:50:49 +0900 Subject: [PATCH 002/143] gfptar: new option: --dummy-input --- gftool/gfptar/gfptar | 275 +++++++++++++++++++++++++++++++------------ 1 file changed, 199 insertions(+), 76 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index dd1e7bccb..fc0e4a311 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -41,6 +41,8 @@ import unittest import hashlib import signal import resource +import string +import random from docopt import docopt from schema import Schema, Use, Or @@ -477,7 +479,7 @@ class GfURL(metaclass=abc.ABCMeta): base = self.url_str if not fullpath.startswith(base): logger.error('subpath: %s, %s', base, fullpath) - raise AssertionError + raise AssertionError(f'base={base}, fullpath={fullpath}') logger.debug('subpath: %s, %s', base, fullpath) return fullpath[len(base):].lstrip('/') # relative path @@ -586,19 +588,6 @@ class GfURL(metaclass=abc.ABCMeta): user=None, group=None, use_fsync=True, hostname=None): raise NotImplementedError - def tar_add(self, tar, subpath, entry): - tarinfo = entry.toTarinfo(subpath) - if tarinfo is None: # warning, skip - return - # hard link is not supported - if entry.is_file(): - url = GfURL.init(entry.path) - url.check_readable() - with url.readopen() as f: - tar.addfile(tarinfo, fileobj=f) - else: - tar.addfile(tarinfo) - def copy_from(self, inf, bufsize, mode=None, mtime=0o600, user=None, group=None, use_fsync=True, hostname=None): readlen = 0 @@ -1312,18 +1301,6 @@ class GfURLLocal(GfURL): yield from self._scandir(self.url_str, path_only, 
recursive, first, hardlink_warn) - # NOTE: This is not expected behavior. - # - This can copy a hard link, - # but a hard link cannot be extracted from gfexport (stream open) - # - When the specified for --create is a symlink, - # the entry will be archived as symlink. - # def tar_add(self, tar, subpath, entry): - # if entry.path: - # path = os.path.join(self.url_str, entry.path) - # else: - # path = self.url_str - # tar.add(path, arcname=subpath, recursive=False) - @contextmanager def readopen(self, textmode=False): if textmode: @@ -1365,7 +1342,8 @@ class GfURLLocal(GfURL): class GfTarFile(tarfile.TarFile): COMPRESS_TYPE_NO = 'no' ATTR_PROC_LIST = '_gfptar_proc_list' # [(proc, fileobj, fileobj), ...] - USE_FSYNC = 'use_fsync' + ATTR_USE_FSYNC = 'use_fsync' + METHOD_add_entry = 'add_entry' @classmethod def extract_open(cls, gfurl, copybufsize, compress_prog=None): @@ -1416,7 +1394,7 @@ class GfTarFile(tarfile.TarFile): @classmethod def create_open(cls, gfurl, compress_type, copybufsize, compress_prog=None, - use_fsync=True, target_host=None): + use_fsync=True, target_host=None, dummy_input=False): # use Stream (not seekable) openmode = 'w|' if compress_prog is None \ @@ -1461,7 +1439,11 @@ class GfTarFile(tarfile.TarFile): gfurl.chmod(0o600) logger.debug('GfTarFile.create_open: %s', gfurl.url_str) setattr(tar, cls.ATTR_PROC_LIST, proc_list) - setattr(tar, cls.USE_FSYNC, use_fsync) + setattr(tar, cls.ATTR_USE_FSYNC, use_fsync) + if dummy_input: + setattr(tar, cls.METHOD_add_entry, tar._add_entry_dummy) + else: + setattr(tar, cls.METHOD_add_entry, tar._add_entry) return tar @classmethod @@ -1485,7 +1467,7 @@ class GfTarFile(tarfile.TarFile): # override def close(self): super().close() - use_fsync = getattr(self, self.USE_FSYNC, True) + use_fsync = getattr(self, self.ATTR_USE_FSYNC, True) proc_list = getattr(self, self.ATTR_PROC_LIST, None) if proc_list: for proc_tuple in proc_list: @@ -1507,8 +1489,66 @@ class GfTarFile(tarfile.TarFile): if post_func: close_obj.post() - def add_entry(self, basedir_url, subpath, entry): - basedir_url.tar_add(self, subpath, entry) + def _add_entry_dummy(self, subpath, entry): + tarinfo = entry.toTarinfo(subpath) + if tarinfo is None: # warning, skip + return + # hard link is not supported + if entry.is_file(): + with RandomStream(entry.size) as f: + self.addfile(tarinfo, fileobj=f) + else: + self.addfile(tarinfo) + + def _add_entry(self, subpath, entry): + tarinfo = entry.toTarinfo(subpath) + if tarinfo is None: # warning, skip + return + # hard link is not supported + if entry.is_file(): + url = GfURL.init(entry.path) + url.check_readable() + with url.readopen() as f: + self.addfile(tarinfo, fileobj=f) + else: + self.addfile(tarinfo) + + # NOTE: This is not expected behavior for local file. + # - This can copy a hard link, + # but a hard link cannot be extracted from gfexport (stream open) + # - When the specified for --create is a symlink, + # the entry will be archived as symlink. 
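Both _add_entry() and _add_entry_dummy() rely on tarfile.addfile(), which accepts any object with a read() method as the member's data source (a readopen() stream or a RandomStream here). A minimal standalone sketch of that call, with an in-memory payload standing in for the real source:

import io
import tarfile
import time

payload = b'example data\n'
info = tarfile.TarInfo(name='dir/example.txt')
info.size = len(payload)        # must match what the file object will yield
info.mtime = int(time.time())
info.mode = 0o600
with tarfile.open('example.tar', mode='w|') as tar:  # stream mode, as in create_open()
    tar.addfile(info, fileobj=io.BytesIO(payload))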
+ # def tar_add(self, subpath, entry): + # if entry.path: + # path = os.path.join(self.url_str, entry.path) + # else: + # path = self.url_str + # self.add(path, arcname=subpath, recursive=False) + + +class RandomStream(): + chunk_size = 1024 * 16 + + def __init__(self, size): + self.size = size + self.written = 0 + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + pass + + def read(self, size=-1): + if size == -1 or size > self.size - self.written: + size = self.size - self.written + if self.written >= self.size: + return b'' + remaining_size = self.size - self.written + read_size = min(self.chunk_size, remaining_size, size) + random_data = os.urandom(read_size) + self.written += read_size + return random_data class TestGfptar(unittest.TestCase): @@ -1574,6 +1614,7 @@ class GfptarCommand(Command): self.bufsize = self.opt['--bufsize'] self.progress_enabled = self._progress_enabled() self.use_fsync = not self.opt['--disable-fsync'] + self.dummy_input_num = self.opt['--dummy-input'] self.memory_limit = self.opt['--memory'] if self.memory_limit is not None: @@ -2099,6 +2140,64 @@ class GfptarCommand(Command): if self.verbose: print(fmt.format(*args)) + def list_dummy_files(self, base_dir, num, files_per_dir=1000): + choices = string.ascii_letters + string.digits + '漢あア()[]-' + + def generate_random_dirname(min_depth=5, max_depth=5, + min_length=30, max_length=30): + depth = random.randint(min_depth, max_depth) + directories = [''.join( + random.choices(choices, + k=random.randint(min_length, max_length))) + for _ in range(depth)] + return os.path.join(*directories) + + def generate_random_filename(min_length=30, max_length=30): + return ''.join( + random.choices(choices, k=random.randint( + min_length, max_length))) \ + + '.' 
+ ''.join(random.choices(string.ascii_lowercase, + k=random.randint(2, 4))) + + def rand_dir(): + dir_path = generate_random_dirname() + path = os.path.join(base_dir, dir_path) + mode = 0o700 + file_type = GfURLEntry.TYPE_DIR + uname = 'user1' + gname = 'group1' + size = 0 + mtime = 0 + linkname = '' + return GfURLEntry(path, mode, file_type, uname, gname, + size, mtime, linkname) + + def rand_file(dir_path): + f = generate_random_filename() + path = os.path.join(dir_path, f) + mode = 0o600 + file_type = GfURLEntry.TYPE_FILE + uname = 'user1' + gname = 'group1' + size = random.randint(0, 1024) + mtime = 0 + linkname = '' + return GfURLEntry(path, mode, file_type, uname, gname, + size, mtime, linkname) + + dir_num = int(num / files_per_dir) + remainder = num % files_per_dir + for i in range(dir_num): + dir_ent = rand_dir() + yield dir_ent + for j in range(files_per_dir - 1): + yield rand_file(dir_ent.path) + if remainder > 0: + dir_ent = rand_dir() + yield dir_ent + for j in range(remainder - 1): + yield rand_file(dir_ent.path) + def create(self, outdir, basedir, infiles): self.options_init() self.outdir = outdir @@ -2128,6 +2227,14 @@ class GfptarCommand(Command): self.start_time = time.time() self.next_time = self.start_time + 1 + def listdir_switch(gfurl): + if self.dummy_input_num is not None: + return self.list_dummy_files(gfurl.url_str, + self.dummy_input_num) + else: + return gfurl.listdir(recursive=True, first=True, + hardlink_warn=self.hardlink_warn) + self.cannot_be_archived = 0 cannot_read_dir = 0 filelist = [] @@ -2150,8 +2257,7 @@ class GfptarCommand(Command): gfurl = GfURL.init(url_str) logger.debug('listdir: %s', gfurl.url_str) try: - for entry in gfurl.listdir(recursive=True, first=True, - hardlink_warn=self.hardlink_warn): + for entry in listdir_switch(gfurl): logger.debug('listdir: entry.path=%s', entry.path) filelist.append(entry) self.total_size += entry.size @@ -2389,19 +2495,31 @@ class GfptarCommand(Command): outname = '%s..%s%s' % (firstpath, lastpath, self.suffix) serial_str = '%04d_' % serial - outname_max = GfURL.MAXNAMLEN - len(serial_str) - len(self.LIST_SUFFIX) - if len(outname) > outname_max: + outname_max = self.outdir_url.MAXNAMLEN \ + - len(serial_str) - len(self.LIST_SUFFIX) + outname_len = len(outname.encode()) + offset = 0 + while outname_len > outname_max: # use last half of name - outname = outname[-outname_max:] + outname = outname[-(outname_max-offset):] + outname_len = len(outname.encode()) + logger.debug(f'modified outname_len={outname_len}') + # loop for multibyte charactors + offset += 1 # ex.: home/user1/dir -> home_user1_dir outname = serial_str + outname.replace('/', '_') outurl = GfURL.init(self.outdir_url.url_join(outname), use_gfarm_command=self.use_gfarm_command) target_host = self.select_a_target_host(outurl, serial) + if self.dummy_input_num > 0: + dummy_input = True + else: + dummy_input = False tar = GfTarFile.create_open(outurl, self.compress_type, self.bufsize, compress_prog=self.compress_prog, use_fsync=self.use_fsync, - target_host=target_host) + target_host=target_host, + dummy_input=dummy_input) filelist_ok = [] for entry in filelist: if self.is_canceled(): @@ -2409,7 +2527,7 @@ class GfptarCommand(Command): break subpath = entry.subpath(self.basedir_url) try: - tar.add_entry(self.basedir_url, subpath, entry) + tar.add_entry(subpath, entry) except Exception as e: with self.lock(): self.cannot_be_archived += 1 @@ -2908,13 +3026,38 @@ progname = os.path.basename(__file__) __doc__ = """ -Usage: - {f} [options] -c [-C ] [--] ... 
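The loop above trims the archive name by encoded byte length rather than character count, because a UTF-8 name full of multibyte characters can exceed MAXNAMLEN even when its character count looks safe. A simplified sketch of the idea, assuming a 255-byte limit for illustration:

def fit_name_bytes(name, max_bytes=255):
    # drop leading characters until the UTF-8 encoding fits, keeping the
    # tail of the name as the "use last half of name" comment describes
    while len(name.encode('utf-8')) > max_bytes:
        name = name[1:]
    return name

long_name = '漢' * 200 + '.tar.gz'   # 3 bytes per character in UTF-8
assert len(fit_name_bytes(long_name).encode('utf-8')) <= 255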
- {f} [options] -x [--] [...] - {f} [options] -t - {f} [options] --test - {f} [options] --test -C ... - {f} -h | --help +gfptar - archive files in parallel + +Example of --create (Gfarm to Gfarm): + Command line: + gfptar -c gfarm:/home/user1/out -C gfarm:/home/user1 ./dir + Input files: + gfarm:/home/user1/dir/test0000.data + ... + gfarm:/home/user1/dir/test9999.data + Output files: + gfarm:/home/user1/out/0001_dir_test0000.data..dir_test0999.data.tar.gz + gfarm:/home/user1/out/0001_dir_test0000.data..dir_test0999.data.tar.gz.lst + ... + gfarm:/home/user1/out/0010_dir_test9000.data..dir_test9999.data.tar.gz + gfarm:/home/user1/out/0010_dir_test9000.data..di1_test9999.data.tar.gz.lst + Contents of list file (*.lst): + F dir/test0000.data + ... + F dir/test0999.data + +Example of --extract (Gfarm to Local): + Command line: + gfptar -x /home/user1/out2 gfarm:/home/user1/out + Output files: + /home/user1/out2/dir/test0000.data + ... + /home/user1/out2/dir/test9999.data + +Limitations: + - Hard links are not preserved. + - File names cannot include newline characters. + - Subsecond (less than a second) for mtime is not preserved. Options: -t, --list=DIR list mode, @@ -2950,41 +3093,20 @@ Options: --test test mode (-q option is recommended) --test-workdir-local=DIR local directory for test [default: /tmp] --test-workdir-gfarm=DIR Gfarm directory for test [default: gfarm:/tmp] + --dummy-input=NUM the number of dummy files for input + (for -c) (ignore ) (1000 files per dir) -q, --quiet quiet messages -v, --verbose verbose output -d, --debug debug mode -?, -h, --help show this help and exit -Example of --create (Gfarm to Gfarm): - Command line: - gfptar -c gfarm:/home/user1/out -C gfarm:/home/user1 ./dir - Input files: - gfarm:/home/user1/dir/test0000.data - ... - gfarm:/home/user1/dir/test9999.data - Output files: - gfarm:/home/user1/out/0001_dir_test0000.data..dir_test0999.data.tar.gz - gfarm:/home/user1/out/0001_dir_test0000.data..dir_test0999.data.tar.gz.lst - ... - gfarm:/home/user1/out/0010_dir_test9000.data..dir_test9999.data.tar.gz - gfarm:/home/user1/out/0010_dir_test9000.data..di1_test9999.data.tar.gz.lst - Contents of list file (*.lst): - F dir/test0000.data - ... - F dir/test0999.data - -Example of --extract (Gfarm to Local): - Command line: - gfptar -x /home/user1/out2 gfarm:/home/user1/out - Output files: - /home/user1/out2/dir/test0000.data - ... - /home/user1/out2/dir/test9999.data - -Limitations: - - Hard links are not preserved. - - File names cannot include newline characters. - - Subsecond (less than a second) for mtime is not preserved. +Usage: + {f} [options] -c [-C ] [--] ... + {f} [options] -x [--] [...] + {f} [options] -t + {f} [options] --test + {f} [options] --test -C ... 
+ {f} -h | --help """.format(f=progname) @@ -3009,6 +3131,7 @@ _schema = Schema({ '--test': bool, '--test-workdir-local': Or(str, None), '--test-workdir-gfarm': Or(str, None), + '--dummy-input': Or(Use(unhumanize_number), None), '--quiet': bool, '--verbose': bool, '--debug': bool, From 93539bc9c5e3ad6a605c82fdff66ed46deb333e0 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 26 Jun 2024 17:08:51 +0900 Subject: [PATCH 003/143] gfptar: fix "TypeError: '>' not supported between instances of 'NoneType' and 'int'" --- gftool/gfptar/gfptar | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index fc0e4a311..67e47b172 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2511,7 +2511,7 @@ class GfptarCommand(Command): outurl = GfURL.init(self.outdir_url.url_join(outname), use_gfarm_command=self.use_gfarm_command) target_host = self.select_a_target_host(outurl, serial) - if self.dummy_input_num > 0: + if self.dummy_input_num is not None and self.dummy_input_num > 0: dummy_input = True else: dummy_input = False From 7804e2a35adf8309d4edc4b11fd67a90b60fccb3 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 26 Jun 2024 17:47:08 +0900 Subject: [PATCH 004/143] gfptar: include file name length in --size --- gftool/gfptar/gfptar | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 67e47b172..022528be3 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2383,12 +2383,15 @@ class GfptarCommand(Command): newlist = [] total = 0 for entry in filelist: - total += entry.size + this_size = entry.size + len(entry.path.encode()) + if entry.is_symlink(): + this_size += len(entry.linkname.encode()) + total += this_size if total >= self.split_size: if len(newlist) > 0: filelistlist.append(newlist) newlist = [] - total = entry.size + total = this_size newlist.append(entry) if len(newlist) > 0: filelistlist.append(newlist) From 6627e652b8faef8e6da300bd705b27deb2389267 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 1 Jul 2024 23:13:38 +0900 Subject: [PATCH 005/143] gfptar (reduce memory usage): do not sort file names when using --create gfptar: new options: --dummy-size-min, --dummy-size-max, --dummy-sleep gfptar: change option: --dummy-input -> --dummy-num gfptar: remove --tqdm option --- gftool/gfptar/gfptar | 421 +++++++++++++++++++++++-------------------- 1 file changed, 226 insertions(+), 195 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 022528be3..31dce4add 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3,16 +3,14 @@ # Requirements: # --- for Debian,Ubuntu series --- # apt-get install python3 python3-docopt python3-schema -# (optional) python3-tqdm # # --- for RHEL,CentOS series --- # yum install epel-release # yum install python3 python3-docopt python3-schema -# (optional) python3-tqdm # # --- Or, install to ~/.local just for user's environment --- # (required: python3-pip) -# pip3 install --user docopt schema tqdm +# pip3 install --user docopt schema # Coding style check: # flake8 ./gfptar @@ -47,12 +45,6 @@ import random from docopt import docopt from schema import Schema, Use, Or -try: - from tqdm import tqdm - have_tqdm = True -except Exception: - have_tqdm = False - # library def unhumanize_number(numstr): @@ -343,6 +335,13 @@ class GfURLEntry(): def subpath(self, baseurl): return baseurl.subpath(self.path) + def size_all(self): + # include length of path + this_size = self.size + 
len(self.path.encode()) + if self.is_symlink(): + this_size += len(self.linkname.encode()) + return this_size + def is_file(self): return self.file_type == self.TYPE_FILE @@ -1343,6 +1342,7 @@ class GfTarFile(tarfile.TarFile): COMPRESS_TYPE_NO = 'no' ATTR_PROC_LIST = '_gfptar_proc_list' # [(proc, fileobj, fileobj), ...] ATTR_USE_FSYNC = 'use_fsync' + ATTR_DUMMY_SLEEP = 'dummy_sleep' METHOD_add_entry = 'add_entry' @classmethod @@ -1394,7 +1394,8 @@ class GfTarFile(tarfile.TarFile): @classmethod def create_open(cls, gfurl, compress_type, copybufsize, compress_prog=None, - use_fsync=True, target_host=None, dummy_input=False): + use_fsync=True, target_host=None, + dummy_input=False, dummy_sleep=0): # use Stream (not seekable) openmode = 'w|' if compress_prog is None \ @@ -1442,6 +1443,7 @@ class GfTarFile(tarfile.TarFile): setattr(tar, cls.ATTR_USE_FSYNC, use_fsync) if dummy_input: setattr(tar, cls.METHOD_add_entry, tar._add_entry_dummy) + setattr(tar, cls.ATTR_DUMMY_SLEEP, dummy_sleep) else: setattr(tar, cls.METHOD_add_entry, tar._add_entry) return tar @@ -1495,6 +1497,7 @@ class GfTarFile(tarfile.TarFile): return # hard link is not supported if entry.is_file(): + time.sleep(getattr(self, self.ATTR_DUMMY_SLEEP)) with RandomStream(entry.size) as f: self.addfile(tarinfo, fileobj=f) else: @@ -1527,11 +1530,11 @@ class GfTarFile(tarfile.TarFile): class RandomStream(): - chunk_size = 1024 * 16 - def __init__(self, size): self.size = size self.written = 0 + self.rand_block_size = 1024 + self.rand_data = os.urandom(self.rand_block_size) def __enter__(self): return self @@ -1540,15 +1543,18 @@ class RandomStream(): pass def read(self, size=-1): - if size == -1 or size > self.size - self.written: - size = self.size - self.written if self.written >= self.size: return b'' + if size == -1: + size = self.size remaining_size = self.size - self.written - read_size = min(self.chunk_size, remaining_size, size) - random_data = os.urandom(read_size) + read_size = min(remaining_size, size) + repeat_count = read_size / self.rand_block_size + repeat_remainder = read_size % self.rand_block_size + data = self.rand_data * int(repeat_count) \ + + self.rand_data[:repeat_remainder] self.written += read_size - return random_data + return data class TestGfptar(unittest.TestCase): @@ -1614,7 +1620,6 @@ class GfptarCommand(Command): self.bufsize = self.opt['--bufsize'] self.progress_enabled = self._progress_enabled() self.use_fsync = not self.opt['--disable-fsync'] - self.dummy_input_num = self.opt['--dummy-input'] self.memory_limit = self.opt['--memory'] if self.memory_limit is not None: @@ -1672,7 +1677,8 @@ class GfptarCommand(Command): if self.debug: raise else: - if issubclass(type(e), GfptarError): # already reported + if issubclass(type(e), CannotBeArchivedError): + # already reported pass else: logger.error(convert_message(e)) @@ -2133,54 +2139,66 @@ class GfptarCommand(Command): exclude_list=exclude_list) def MT_enabled(self): - return self.jobs >= 2 + return self.jobs >= 1 # lock required def info(self, fmt, *args): if self.verbose: print(fmt.format(*args)) - def list_dummy_files(self, base_dir, num, files_per_dir=1000): + def list_dummy_files(self, base_dir, num, size_min, size_max, dummy_sleep): + # defaults + files_per_dir = 1000 + dummy_sleep_per_entry = dummy_sleep / 512 # for each readdir() + dir_min_depth = 5 + dir_max_depth = 5 + dir_min_length = 30 + dir_max_length = 30 + file_min_length = 30 + file_max_length = 30 + choices = string.ascii_letters + string.digits + '漢あア()[]-' + now = time.time() - def 
generate_random_dirname(min_depth=5, max_depth=5, - min_length=30, max_length=30): - depth = random.randint(min_depth, max_depth) + def generate_random_dirname(): + depth = random.randint(dir_min_depth, dir_max_depth) directories = [''.join( random.choices(choices, - k=random.randint(min_length, max_length))) + k=random.randint( + dir_min_length, dir_max_length))) for _ in range(depth)] return os.path.join(*directories) - def generate_random_filename(min_length=30, max_length=30): + def generate_random_filename(): + suffix = ''.join(random.choices(string.ascii_lowercase, + k=random.randint(2, 4))) return ''.join( random.choices(choices, k=random.randint( - min_length, max_length))) \ - + '.' + ''.join(random.choices(string.ascii_lowercase, - k=random.randint(2, 4))) + file_min_length, file_max_length))) + '.' + suffix def rand_dir(): dir_path = generate_random_dirname() path = os.path.join(base_dir, dir_path) mode = 0o700 file_type = GfURLEntry.TYPE_DIR - uname = 'user1' - gname = 'group1' + uname = 'testuser1' + gname = 'testgroup1' size = 0 - mtime = 0 + mtime = now linkname = '' return GfURLEntry(path, mode, file_type, uname, gname, size, mtime, linkname) def rand_file(dir_path): + time.sleep(dummy_sleep_per_entry) f = generate_random_filename() path = os.path.join(dir_path, f) mode = 0o600 file_type = GfURLEntry.TYPE_FILE - uname = 'user1' - gname = 'group1' - size = random.randint(0, 1024) - mtime = 0 + uname = 'testuser1' + gname = 'testgroup1' + size = random.randint(size_min, size_max) + mtime = now linkname = '' return GfURLEntry(path, mode, file_type, uname, gname, size, mtime, linkname) @@ -2204,13 +2222,22 @@ class GfptarCommand(Command): self.outdir_url = GfURL.init(outdir) self.basedir_url = GfURL.init(basedir) self.size = self.opt['--size'] + if self.size <= 0: + self.size = 1 self.ratio = self.opt['--ratio'] self.compress_type = self.opt['--type'] self.compress_prog = self.opt['--use-compress-program'] self.disable_gfarm_command = self.opt['--disable-gfarm-command'] self.use_gfarm_command = not self.disable_gfarm_command self.gfsched_interval = self.opt['--gfsched-interval'] - self.use_tqdm = self.opt['--use-tqdm'] + self.dummy_num = self.opt['--dummy-num'] + if self.dummy_num is not None and self.dummy_num > 0: + self.dummy_input = True + else: + self.dummy_input = False + self.dummy_size_min = self.opt['--dummy-size-min'] + self.dummy_size_max = self.opt['--dummy-size-max'] + self.dummy_sleep = self.opt['--dummy-sleep'] if self.compress_type == GfTarFile.COMPRESS_TYPE_NO: self.split_size = self.size self.suffix = '.tar' @@ -2218,32 +2245,43 @@ class GfptarCommand(Command): self.split_size = self.size * 100 / self.ratio self.suffix = '.tar.' 
+ self.compress_type - def entry_key(entry): - return entry.path - - self.filelistlist = [] self.total_size = 0 self.total_num = 0 + self.start_time = time.time() self.next_time = self.start_time + 1 - def listdir_switch(gfurl): - if self.dummy_input_num is not None: - return self.list_dummy_files(gfurl.url_str, - self.dummy_input_num) - else: - return gfurl.listdir(recursive=True, first=True, - hardlink_warn=self.hardlink_warn) + serial = 0 + self.archived_size = 0 + self.stored_size = 0 + self.stored_num = 0 + + self.gfsched_lock = None + self.gfsched_next = 0 + self.gfsched_list = None + + self.outdir_url.create_new_dir() + has_error = None self.cannot_be_archived = 0 cannot_read_dir = 0 - filelist = [] + targroup_list = [] + targroup_list_len = 0 + targroup_size = 0 + serial = 0 + self.listing = True + + self.create_job_init() + self.sig_init() + + infiles_checked = [] for infile in infiles: infile_url = GfURL.init(infile) if not infile_url.is_local(): raise GfException('specifying a relative path is required ' 'instead of a URL: ' + infile) infile = infile_url.path # normalize and ignore scheme + # normalized: ex. .///abc -> ./abc infile = infile.lstrip('/') # relative path only if infile.startswith('./'): infile = infile[2:] @@ -2253,78 +2291,84 @@ class GfptarCommand(Command): if infile == '..' or infile.startswith('../'): raise GfException('specifying parent directory (..) ' + 'as members is not allowed: ' + infile) + infiles_checked.append(infile) + + def listdir_switch(gfurl): + if self.dummy_num is not None: + return self.list_dummy_files(gfurl.url_str, + self.dummy_num, + self.dummy_size_min, + self.dummy_size_max, + self.dummy_sleep) + else: + return gfurl.listdir(recursive=True, first=True, + hardlink_warn=self.hardlink_warn) + + for infile in infiles_checked: + if self.is_canceled(): + logger.debug('Canceled (listdir 1): serial=%04d', serial) + break + url_str = os.path.join(self.basedir_url.url_str, infile) gfurl = GfURL.init(url_str) logger.debug('listdir: %s', gfurl.url_str) try: for entry in listdir_switch(gfurl): + if self.is_canceled(): + logger.debug('Canceled (listdir 2): serial=%04d', + serial) + break logger.debug('listdir: entry.path=%s', entry.path) - filelist.append(entry) - self.total_size += entry.size - self.total_num += 1 - if self.progress_enabled: + # include length of path + this_size = entry.size_all() + with self.lock(): # for progress + self.total_size += this_size + self.total_num += 1 + + targroup_size += this_size + if targroup_size > self.split_size \ + and targroup_list_len > 0: + serial += 1 + try: + self.create_job_execute(serial, targroup_list) + except Exception as e1: + if has_error is None: + has_error = e1 + targroup_list = [] + targroup_list_len = 0 + targroup_size = this_size + if has_error is not None: + break # from listdir_switch() + targroup_list.append(entry) + targroup_list_len += 1 + + # progress for listing before starting threads + if serial == 0 and self.progress_enabled: now = time.time() if now >= self.next_time: self.next_time = now + 1 - self.progress_for_list1(now) - except Exception as e: + self.progress_for_create(now) + except Exception as e2: cannot_read_dir += 1 logger.info('%s: error while reading directory (%s)', - gfurl.url_str, convert_message(e)) - # continue + gfurl.url_str, convert_message(e2)) + # continue : next infile - filelist.sort(key=entry_key, reverse=False) - self.filelistlist = self.schedule(filelist) - if self.progress_enabled: - self.progress_for_list1(time.time()) - sys.stdout.write('\n') - - 
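This rewrite replaces the sort-then-schedule pass with a streaming split: entries are grouped as they arrive from listdir and a tar job is started as soon as the accumulated size passes the split threshold, so the full listing never has to be held or sorted in memory. A minimal sketch of that grouping, with illustrative sizes:

def group_by_size(entries, split_size):
    group, group_size = [], 0
    for path, size in entries:
        if group and group_size + size > split_size:
            yield group               # flush the current group as one tar job
            group, group_size = [], 0
        group.append(path)
        group_size += size
    if group:
        yield group                   # the remainder becomes the last job

entries = [('a', 40), ('b', 70), ('c', 10), ('d', 90)]
assert list(group_by_size(entries, 100)) == [['a'], ['b', 'c'], ['d']]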
self.tqdm = None - self.myprogress = False - if self.progress_enabled: - if self.use_tqdm and have_tqdm: - term_size = shutil.get_terminal_size() - # bar_format = '{l_bar}{r_bar}' - bar_format = '{percentage:3.0f}% {n_fmt}/{total_fmt}' \ - + ' [{elapsed}<{remaining}, {rate_fmt}]' - self.tqdm = tqdm(total=self.total_size, unit_scale=True, - unit='B', dynamic_ncols=False, - bar_format=bar_format, - ncols=int(term_size.columns*3/4)) - else: - self.myprogress = True - - self.serial = 0 - self.archive_size = 0 - self.stored_size = 0 - self.stored_num = 0 - self.start_time = time.time() - self.next_time = self.start_time + 1 - - self.gfsched_lock = None - self.gfsched_next = 0 - self.gfsched_list = None - - self.outdir_url.create_new_dir() - has_error = None - - self.sig_init() + if targroup_list_len > 0: + serial += 1 + try: + self.create_job_execute(serial, targroup_list) + except Exception as e: + if has_error is None: + has_error = e - try: - if self.MT_enabled(): - self.create_tars_MT() - else: - self.create_tars() - except Exception as e: - if has_error is None: # save first error - has_error = e - logger.debug(str(e)) + with self.lock(): # for progress + self.listing = False + self.create_job_finalize() - if self.myprogress: + if self.progress_enabled: self.progress_for_create(time.time()) sys.stdout.write('\n') - if self.tqdm: - self.tqdm.close() if self.cannot_be_archived > 0 or cannot_read_dir > 0: e = CannotBeArchivedError(self.cannot_be_archived, cannot_read_dir) logger.warning(str(e)) @@ -2332,70 +2376,59 @@ class GfptarCommand(Command): has_error = e if not self.quiet and self.stored_size > 0: print('compression ratio: %.2f %% (%d/%d)' % - (100 * self.archive_size / self.stored_size, - self.archive_size, self.stored_size)) - del self.filelistlist + (100 * self.archived_size / self.stored_size, + self.archived_size, self.stored_size)) if self.is_canceled(): raise self.error_canceled() if has_error is not None: raise has_error - def create_tars(self): - self.lock_init(False) + def create_job_init(self): + if self.MT_enabled(): + self.lock_init(True) + self.create_job_execute = self._create_job_execute_MT + self.executor = concurrent.futures.ThreadPoolExecutor( + max_workers=self.jobs) + self.futures = {} # key: serial number + else: + self.lock_init(False) + self.create_job_execute = self._create_job_execute + + def _create_job_execute(self, serial, filelist): + self.create_a_tar(serial, filelist) + + def _create_job_execute_MT(self, serial, filelist): + t = self.executor.submit(self.create_a_tar, serial, filelist) + self.futures[t] = serial has_error = None - for filelist in self.filelistlist: - self.serial += 1 - try: - self.create_a_tar(self.serial, filelist) - except Exception as e: - if has_error is None: # save first error - has_error = e - logger.debug(str(e)) + try: + has_error = self._create_job_check_MT() + except Exception as e: + logger.debug(f'_create_job_execute_MT(): {str(e)}') if has_error is not None: raise has_error - def create_tars_MT(self): - self.lock_init(True) - with concurrent.futures.ThreadPoolExecutor( - max_workers=self.jobs) as executor: - futures = {} # serial number - for filelist in self.filelistlist: - self.serial += 1 - t = executor.submit(self.create_a_tar, self.serial, filelist) - futures[t] = self.serial - self.futures = futures - has_error = None - for t in concurrent.futures.as_completed(futures, timeout=None): - exc = t.exception() - if exc: - logger.debug('serial=%04d: %s', futures[t], - convert_message(exc)) - if self.verbose or self.debug: - 
tb = traceback.TracebackException.from_exception(exc) - logger.debug(''.join(tb.format())) - if has_error is None: # save first error - has_error = exc - if has_error is not None: - raise has_error - - def schedule(self, filelist): - filelistlist = [] - newlist = [] - total = 0 - for entry in filelist: - this_size = entry.size + len(entry.path.encode()) - if entry.is_symlink(): - this_size += len(entry.linkname.encode()) - total += this_size - if total >= self.split_size: - if len(newlist) > 0: - filelistlist.append(newlist) - newlist = [] - total = this_size - newlist.append(entry) - if len(newlist) > 0: - filelistlist.append(newlist) - return filelistlist + def create_job_finalize(self): + if not self.MT_enabled(): + return + has_error = self._create_job_check_MT(timeout=None) + if has_error is not None: + raise has_error + + def _create_job_check_MT(self, timeout=0.1): + has_error = None + for t in concurrent.futures.as_completed(self.futures, + timeout=timeout): + exc = t.exception() + if exc: + logger.debug('serial=%04d: %s', self.futures[t], + convert_message(exc)) + if self.verbose or self.debug: + tb = traceback.TracebackException.from_exception(exc) + logger.debug(''.join(tb.format())) + if has_error is None: # save first error + has_error = exc + return has_error def lock_init(self, enable, timeout=None): if enable: @@ -2475,7 +2508,7 @@ class GfptarCommand(Command): def create_a_tar0(self, serial, filelist): logger.debug('create_a_tar: start: %04d', serial) if self.is_canceled(): - logger.debug('canceled (1): serial=%04d', serial) + logger.debug('Canceled (create 1): serial=%04d', serial) return first = None last = None @@ -2507,26 +2540,23 @@ class GfptarCommand(Command): outname = outname[-(outname_max-offset):] outname_len = len(outname.encode()) logger.debug(f'modified outname_len={outname_len}') - # loop for multibyte charactors + # loop for multibyte characters offset += 1 # ex.: home/user1/dir -> home_user1_dir outname = serial_str + outname.replace('/', '_') outurl = GfURL.init(self.outdir_url.url_join(outname), use_gfarm_command=self.use_gfarm_command) target_host = self.select_a_target_host(outurl, serial) - if self.dummy_input_num is not None and self.dummy_input_num > 0: - dummy_input = True - else: - dummy_input = False tar = GfTarFile.create_open(outurl, self.compress_type, self.bufsize, compress_prog=self.compress_prog, use_fsync=self.use_fsync, target_host=target_host, - dummy_input=dummy_input) + dummy_input=self.dummy_input, + dummy_sleep=self.dummy_sleep) filelist_ok = [] for entry in filelist: if self.is_canceled(): - logger.debug('canceled (2): serial=%04d', serial) + logger.debug('Canceled (create 2): serial=%04d', serial) break subpath = entry.subpath(self.basedir_url) try: @@ -2540,23 +2570,20 @@ class GfptarCommand(Command): filelist_ok.append(entry) with self.lock(): self.info('stored: {}', subpath) - self.stored_size += entry.size - if self.myprogress: + self.stored_size += entry.size_all() + if self.progress_enabled: self.stored_num += 1 now = time.time() if now >= self.next_time: self.next_time = now + 1 self.progress_for_create(now) - elif self.tqdm: - if entry.is_file(): - self.tqdm.update(entry.size) tar.close() tar_size = outurl.get_size() self.create_a_members_list(outurl, filelist_ok, target_host) with self.lock(): self.info('created(.tar): {}', outurl.url_str) - self.archive_size += tar_size + self.archived_size += tar_size def create_a_members_list(self, url, filelist, target_host): outurl = GfURL.init(url.url_str + self.LIST_SUFFIX) @@ 
-2644,9 +2671,9 @@ class GfptarCommand(Command): now = time.time() if now >= self.next_time: self.next_time = now + 1 - self.progress_for_list2(now) + self.progress_for_listing(now) if self.progress_enabled: - self.progress_for_list2(time.time()) + self.progress_for_listing(time.time()) sys.stdout.write('\n') if search_target: @@ -2816,7 +2843,7 @@ class GfptarCommand(Command): for t2 in futures: if t == t2: continue - logger.error('%s: canceled', futures[t2]) + logger.error('%s: Canceled', futures[t2]) t2.cancel() raise exc @@ -2832,7 +2859,7 @@ class GfptarCommand(Command): def extract_from_a_tar0(self, serial, target, member_set): logger.debug('extract_from_a_tar: start: %04d', serial) if self.is_canceled(): - logger.debug('canceled (1): name=%s', target) + logger.debug('Canceled (extract 1): name=%s', target) return arch_url = GfURL.init(target, use_gfarm_command=self.use_gfarm_command) @@ -2842,7 +2869,7 @@ class GfptarCommand(Command): index = serial while True: if self.is_canceled(): - logger.debug('canceled (2): name=%s', target) + logger.debug('Canceled (extract 2): name=%s', target) break try: tarinfo = tar.next() @@ -2925,14 +2952,7 @@ class GfptarCommand(Command): with self.lock(): self.info('extracted(done): {}', arch_url.url_str) - def progress_for_list1(self, now): - sec = now - self.start_time - sys.stdout.write(f'\rlisting: ' - f'num={self.total_num}, ' - f'size={self.total_size}, ' - f'sec={sec:.0f}') - - def progress_for_list2(self, now): + def progress_for_listing(self, now): sec = now - self.start_time sys.stdout.write(f'\rlisting: ' f'num={self.total_num}, ' @@ -2941,15 +2961,21 @@ class GfptarCommand(Command): # lock required def progress_for_create(self, now): sec = now - self.start_time - if self.total_size > 0: - percent = self.stored_size * 100 / self.total_size + if self.listing: + percent_str = '?' 
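Both the create and extract paths use the same worker-pool pattern: submit one future per tar, watch as_completed(), and on the first failure cancel every future that has not started yet. A minimal sketch under those assumptions (the worker and job list are illustrative):

import concurrent.futures

def run_jobs(jobs, worker, max_workers=4):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = {ex.submit(worker, job): job for job in jobs}
        for fut in concurrent.futures.as_completed(futures):
            exc = fut.exception()
            if exc is not None:
                for other in futures:
                    if other is not fut:
                        other.cancel()   # jobs not yet started are dropped
                raise exc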
else: - percent = 0 + percent1 = self.stored_num * 100 / self.total_num + if self.total_size > 0: + percent2 = self.stored_size * 100 / self.total_size + percent = (percent1 + percent2) / 2 + else: + percent = percent1 + percent_str = f'{percent:.0f}' if sec > 0: bytes_per_sec = self.stored_size / sec else: bytes_per_sec = 0 - sys.stdout.write(f'\rcreated: {percent:.0f}%, ' + sys.stdout.write(f'\rcreated: {percent_str}%, ' f'num={self.stored_num}/{self.total_num}, ' f'size={self.stored_size}, ' f'sec={sec:.0f}, ' @@ -3086,7 +3112,6 @@ Options: --disable-fsync disable calling fsync() before close() --gfsched-interval=SEC interval of updating candidate hosts to write (for Gfarm URL only) [default: 120] - --use-tqdm use tqdm to show progress (for --create) --encoding=CODEC codec for filename encoding (https://docs.python.org/3/library/codecs.html#standard-encodings) [default: utf-8] @@ -3096,8 +3121,12 @@ Options: --test test mode (-q option is recommended) --test-workdir-local=DIR local directory for test [default: /tmp] --test-workdir-gfarm=DIR Gfarm directory for test [default: gfarm:/tmp] - --dummy-input=NUM the number of dummy files for input + --dummy-num=NUM the number of dummy (random) files for input (for -c) (ignore ) (1000 files per dir) + (default: disabled) + --dummy-size-min=BYTES minimum size of dummy files [default: 0] + --dummy-size-max=BYTES maximum size of dummy files [default: 1M] + --dummy-sleep=SEC sleep time per dummy file [default: 0.0] -q, --quiet quiet messages -v, --verbose verbose output -d, --debug debug mode @@ -3129,12 +3158,14 @@ _schema = Schema({ '--disable-fsync': bool, '--gfsched-interval': Use(int), '--same-owner': bool, - '--use-tqdm': bool, '--memory': Or(Use(unhumanize_number), None), '--test': bool, '--test-workdir-local': Or(str, None), '--test-workdir-gfarm': Or(str, None), - '--dummy-input': Or(Use(unhumanize_number), None), + '--dummy-num': Or(Use(unhumanize_number), None), + '--dummy-size-min': Use(unhumanize_number), + '--dummy-size-max': Use(unhumanize_number), + '--dummy-sleep': Use(float), '--quiet': bool, '--verbose': bool, '--debug': bool, From e40208375bf39dfd3d71f81d11d9ce13fedb1f4e Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 2 Jul 2024 20:19:56 +0900 Subject: [PATCH 006/143] gfptar (reduce memory usage): do not continue on MemoryError gfptar: change the format of progress_for_create() gfptar: use SI prefix by default. To use binary prefix, specifiy 'Mi' for example. 
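With the prefix rule described above, a bare suffix is decimal while an "i" suffix is binary. A simplified stand-in for unhumanize_number() showing only that distinction:

def parse_size(text):
    units = {'K': 1, 'M': 2, 'G': 3, 'T': 4, 'P': 5, 'E': 6}
    if text.endswith('i') and len(text) > 2:
        return int(text[:-2]) * 1024 ** units[text[-2].upper()]  # binary: Ki, Mi, ...
    if text[-1].upper() in units:
        return int(text[:-1]) * 1000 ** units[text[-1].upper()]  # SI: K, M, ...
    return int(text)

assert parse_size('200M') == 200 * 1000 ** 2
assert parse_size('200Mi') == 200 * 1024 ** 2
assert parse_size('512') == 512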
--- gftool/gfptar/gfptar | 200 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 158 insertions(+), 42 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 31dce4add..17e7ff5b0 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -41,29 +41,75 @@ import signal import resource import string import random +from decimal import Decimal, ROUND_DOWN from docopt import docopt from schema import Schema, Use, Or # library -def unhumanize_number(numstr): - strlen = len(numstr) - if strlen == 1: +def format_seconds(seconds): + if seconds < 3600: + minutes = seconds / 60 + return f"{seconds:.0f}s({minutes:.1f}m)" + else: + hours = seconds / 3600 + return f"{seconds:.0f}s({hours:.1f}h)" + + +def humanize_number(num, binary_prefix=False): + if binary_prefix: + units = ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'] + base = Decimal(1024) + else: + units = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'] + base = Decimal(1000) + if num < base: + return str(num) + n = Decimal(num) + ulen = len(units) - 1 + scale = 0 + while n >= base and scale < ulen: + n /= base + scale += 1 + if n < 100: + d = n.quantize(Decimal('0.0'), rounding=ROUND_DOWN) + return f'{d}{units[scale]}' + else: + d = n.quantize(Decimal('0'), rounding=ROUND_DOWN) + return f'{d}{units[scale]}' + + +def unhumanize_number(numstr, binary_prefix=False): + if binary_prefix: + base = 1024 + else: + base = 1000 + numstrlen = len(numstr) + if numstrlen == 1: return int(numstr) - n = int(numstr[:(strlen-1)]) - si_prefix = numstr[(strlen-1):] - prefixes = {'K': 1, - 'M': 2, - 'G': 3, - 'T': 4, - 'P': 5, - 'E': 6, - } - power = prefixes.get(si_prefix.upper()) + lastchar = numstr[-1] + if lastchar == 'i' and numstrlen > 2: + n = int(numstr[:(numstrlen-2)]) + prefix = numstr[-2] + base = 1024 # even if binary_prefix=False + else: + n = int(numstr[:(numstrlen-1)]) + prefix = lastchar + + units = {'K': 1, + 'M': 2, + 'G': 3, + 'T': 4, + 'P': 5, + 'E': 6, + 'Z': 7, + 'Y': 8, + } + power = units.get(prefix.upper()) if power is None: return int(numstr) - return n * (1024 ** power) + return n * (base ** power) class GfException(Exception): @@ -195,6 +241,8 @@ def execcmd(args, stdin=subprocess.DEVNULL, stderr=subprocess.PIPE, if ret != 0: try: err = err.decode().rstrip() + except MemoryError: + raise except Exception: logger.debug('cannot decode: err={}', err) pass @@ -638,6 +686,8 @@ class GfURL(metaclass=abc.ABCMeta): def listdir_ignore_error(gfurl): try: yield from gfurl.listdir(recursive=True) + except MemoryError: + raise except Exception as e: logger.warning('listdir(%s): error ignored: %s', gfurl.url_str, str(e)) @@ -659,6 +709,8 @@ class GfURL(metaclass=abc.ABCMeta): if not search_compare_pop(subpath1, d1ent, d2dict): # not found, compare later d1dict[subpath1] = d1ent + except MemoryError: + raise except Exception as e: logger.error(str(e)) return False @@ -676,6 +728,8 @@ class GfURL(metaclass=abc.ABCMeta): if not search_compare_pop(subpath2, d2ent, d1dict): # not found, compare later d2dict[subpath2] = d2ent + except MemoryError: + raise except Exception as e: logger.error(str(e)) return False @@ -695,6 +749,8 @@ class GfURL(metaclass=abc.ABCMeta): try: if not search_compare_pop(subpath1, d1ent, d2dict): d1notfound[subpath1] = d1ent + except MemoryError: + raise except Exception as e: logger.error(str(e)) return False @@ -799,6 +855,8 @@ class GfURLGfarm(GfURL): raise AssertionError try: execcmd(['gfrm', '-rf', path]) + except MemoryError: + raise except Exception: if remove_readonly: 
execcmd(['gfchmod', '-R', '700', path]) @@ -1561,17 +1619,31 @@ class TestGfptar(unittest.TestCase): @staticmethod def suite(): suite = unittest.TestSuite() + suite.addTest(TestGfptar('test_humanize')) suite.addTest(TestGfptar('test_unhumanize')) suite.addTest(TestGfptar('test_GfURL_use_gfarm_command_for_local')) return suite + def test_humanize(self): + self.assertEqual(humanize_number(1023, binary_prefix=True), '1023') + self.assertEqual(humanize_number(1024, binary_prefix=True), '1.0Ki') + self.assertEqual(humanize_number(999), '999') + self.assertEqual(humanize_number(1000), '1.0K') + self.assertEqual(humanize_number(99999), '99.9K') + self.assertEqual(humanize_number(100000), '100K') + self.assertEqual(humanize_number(1900000), '1.9M') + self.assertEqual(humanize_number(2000000), '2.0M') + def test_unhumanize(self): - self.assertEqual(unhumanize_number('1K'), 1024) - self.assertEqual(unhumanize_number('2M'), 2097152) - self.assertEqual(unhumanize_number('3G'), 3221225472) - self.assertEqual(unhumanize_number('4T'), 4398046511104) - self.assertEqual(unhumanize_number('5P'), 5629499534213120) - self.assertEqual(unhumanize_number('6E'), 6917529027641081856) + self.assertEqual(unhumanize_number('999'), 999) + self.assertEqual(unhumanize_number('1K'), 1000) + self.assertEqual(unhumanize_number('1K', binary_prefix=True), 1024) + self.assertEqual(unhumanize_number('1Ki'), 1024) + self.assertEqual(unhumanize_number('2Mi'), 2097152) + self.assertEqual(unhumanize_number('3Gi'), 3221225472) + self.assertEqual(unhumanize_number('4Ti'), 4398046511104) + self.assertEqual(unhumanize_number('5Pi'), 5629499534213120) + self.assertEqual(unhumanize_number('6Ei'), 6917529027641081856) def test_GfURL_use_gfarm_command_for_local(self): url = GfURL.init('/tmp', use_gfarm_command=True) @@ -1674,6 +1746,10 @@ class GfptarCommand(Command): self.test_main() return except Exception as e: + if self.futures is not None: + for t in self.futures: + logger.info('%s: Canceled', self.futures[t]) + t.cancel() if self.debug: raise else: @@ -1725,7 +1801,7 @@ class GfptarCommand(Command): # create tar per one entry self.opt['--size'] = 0 - pattern_jobs = [1, 10] + pattern_jobs = [0, 1, 10] for jobs in pattern_jobs: self.opt['--jobs'] = jobs self.test_simple('jobs_' + str(jobs), use_all_files=True) @@ -2261,7 +2337,6 @@ class GfptarCommand(Command): self.gfsched_list = None self.outdir_url.create_new_dir() - has_error = None self.cannot_be_archived = 0 cannot_read_dir = 0 @@ -2273,6 +2348,7 @@ class GfptarCommand(Command): self.create_job_init() self.sig_init() + has_error = None infiles_checked = [] for infile in infiles: @@ -2332,6 +2408,7 @@ class GfptarCommand(Command): try: self.create_job_execute(serial, targroup_list) except Exception as e1: + self.cancel() if has_error is None: has_error = e1 targroup_list = [] @@ -2348,22 +2425,33 @@ class GfptarCommand(Command): if now >= self.next_time: self.next_time = now + 1 self.progress_for_create(now) + except MemoryError as e2: + self.cancel() + has_error = e2 + targroup_list = [] + targroup_list_len = 0 + break # from loop of infiles_checked except Exception as e2: cannot_read_dir += 1 logger.info('%s: error while reading directory (%s)', gfurl.url_str, convert_message(e2)) - # continue : next infile + # continue (next infile) + + if has_error is not None: + self.cancel() if targroup_list_len > 0: serial += 1 try: self.create_job_execute(serial, targroup_list) except Exception as e: + self.cancel() if has_error is None: has_error = e with self.lock(): # for progress 
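The error policy added in this patch shows in the listing loop: MemoryError always propagates and stops the run, while other per-entry errors are logged and skipped. A minimal sketch of that policy (the iterable and callback are illustrative):

import logging

logger = logging.getLogger('example')

def archive_all(entries, archive_one):
    errors = 0
    for entry in entries:
        try:
            archive_one(entry)
        except MemoryError:
            raise                     # never swallow memory exhaustion
        except Exception as e:
            errors += 1
            logger.warning('%s: skipped (%s)', entry, e)
    return errors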
self.listing = False + self.create_job_finalize() if self.progress_enabled: @@ -2378,10 +2466,10 @@ class GfptarCommand(Command): print('compression ratio: %.2f %% (%d/%d)' % (100 * self.archived_size / self.stored_size, self.archived_size, self.stored_size)) - if self.is_canceled(): - raise self.error_canceled() if has_error is not None: raise has_error + if self.is_canceled(): + raise self.error_canceled() def create_job_init(self): if self.MT_enabled(): @@ -2403,15 +2491,19 @@ class GfptarCommand(Command): has_error = None try: has_error = self._create_job_check_MT() + except MemoryError: + raise except Exception as e: + # NOTE: cannot catch TimeoutError + # ignore timeout logger.debug(f'_create_job_execute_MT(): {str(e)}') if has_error is not None: raise has_error - def create_job_finalize(self): + def create_job_finalize(self, timeout=None): if not self.MT_enabled(): return - has_error = self._create_job_check_MT(timeout=None) + has_error = self._create_job_check_MT(timeout=timeout) if has_error is not None: raise has_error @@ -2426,6 +2518,13 @@ class GfptarCommand(Command): if self.verbose or self.debug: tb = traceback.TracebackException.from_exception(exc) logger.debug(''.join(tb.format())) + if not self.is_canceled(): + self.cancel() + for t2 in self.futures: + if t == t2: + continue + logger.error('%s: Canceled', self.futures[t2]) + t2.cancel() if has_error is None: # save first error has_error = exc return has_error @@ -2561,6 +2660,9 @@ class GfptarCommand(Command): subpath = entry.subpath(self.basedir_url) try: tar.add_entry(subpath, entry) + except MemoryError: + tar.close() + raise except Exception as e: with self.lock(): self.cannot_be_archived += 1 @@ -2568,9 +2670,10 @@ class GfptarCommand(Command): continue filelist_ok.append(entry) + size_all = entry.size_all() with self.lock(): self.info('stored: {}', subpath) - self.stored_size += entry.size_all() + self.stored_size += size_all if self.progress_enabled: self.stored_num += 1 now = time.time() @@ -2756,6 +2859,8 @@ class GfptarCommand(Command): dir_url = GfURL.init(url_str) try: dir_url.mkdir() + except MemoryError: + raise except Exception: dir_url.makedirs() self.info('prepare_dir: {}', dir_url.url_str) @@ -2823,27 +2928,29 @@ class GfptarCommand(Command): self.lock_init(True) with concurrent.futures.ThreadPoolExecutor( max_workers=self.jobs) as executor: - futures = {} # tar filenames + self.futures = {} # tar filenames serial = 0 for target in target_list: logger.debug('target_set: %s', target) serial += 1 t = executor.submit(self.extract_from_a_tar, serial, target, member_set) - futures[t] = target + self.futures[t] = target - for t in concurrent.futures.as_completed(futures, timeout=None): + for t in concurrent.futures.as_completed(self.futures, + timeout=None): exc = t.exception() if exc: - logger.debug('%s: %s', futures[t], convert_message(exc)) + logger.debug('%s: %s', self.futures[t], + convert_message(exc)) tb = traceback.TracebackException.from_exception(exc) logger.debug(''.join(tb.format())) if not self.is_canceled(): self.cancel() - for t2 in futures: + for t2 in self.futures: if t == t2: continue - logger.error('%s: Canceled', futures[t2]) + logger.error('%s: Canceled', self.futures[t2]) t2.cancel() raise exc @@ -2873,6 +2980,8 @@ class GfptarCommand(Command): break try: tarinfo = tar.next() + except MemoryError: + raise except Exception as e: logger.warning(f'{target}: SKIPPED: invalid or empty tar: ' f' {str(e)}') @@ -2961,6 +3070,7 @@ class GfptarCommand(Command): # lock required def 
progress_for_create(self, now): sec = now - self.start_time + sec_str = format_seconds(sec) if self.listing: percent_str = '?' else: @@ -2975,11 +3085,15 @@ class GfptarCommand(Command): bytes_per_sec = self.stored_size / sec else: bytes_per_sec = 0 - sys.stdout.write(f'\rcreated: {percent_str}%, ' - f'num={self.stored_num}/{self.total_num}, ' - f'size={self.stored_size}, ' - f'sec={sec:.0f}, ' - f'B/s={bytes_per_sec:.0f} ') + stored_num_str = humanize_number(self.stored_num) + total_num_str = humanize_number(self.total_num) + stored_size_str = humanize_number(self.stored_size) + bytes_per_sec_str = humanize_number(bytes_per_sec) + sys.stdout.write(f'\rcreated: {percent_str}% ' + f'num={stored_num_str}/{total_num_str} ' + f'{stored_size_str}B ' + f'{sec_str} ' + f'{bytes_per_sec_str}B/s ') # lock required def progress_for_extract(self, now): @@ -3034,6 +3148,8 @@ class GfptarCommand(Command): while True: try: t = tar.next() + except MemoryError: + raise except Exception as e: logger.warning(f'{path}: SKIPPED: invalid or empty tar:' f' {str(e)}') @@ -3099,7 +3215,7 @@ Options: -C, --basedir=DIR base directory for s [default: .] -j, --jobs=NUM the number of jobs to copy per tar file in parallel [default: 4] - -s, --size=BYTES assumed bytes per output file [default: 200M] + -s, --size=BYTES assumed bytes per output file [default: 200Mi] -T, --type=TYPE compress type (gz,bz2,xz,no) [default: gz] -r, --ratio=RATIO assumed compression ratio (%) [default: 50] -I, --use-compress-program=COMMAND @@ -3115,7 +3231,7 @@ Options: --encoding=CODEC codec for filename encoding (https://docs.python.org/3/library/codecs.html#standard-encodings) [default: utf-8] - --bufsize=BYTES buffer size to copy [default: 1M] + --bufsize=BYTES buffer size to copy [default: 1Mi] --memory=NUM upper limit of memory size (bytes) (default: no limit) --test test mode (-q option is recommended) @@ -3125,7 +3241,7 @@ Options: (for -c) (ignore ) (1000 files per dir) (default: disabled) --dummy-size-min=BYTES minimum size of dummy files [default: 0] - --dummy-size-max=BYTES maximum size of dummy files [default: 1M] + --dummy-size-max=BYTES maximum size of dummy files [default: 1Mi] --dummy-sleep=SEC sleep time per dummy file [default: 0.0] -q, --quiet quiet messages -v, --verbose verbose output From d8be2f8a31ce251a2d34ca587da3513e0559a4d5 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 6 Jul 2024 00:45:22 +0900 Subject: [PATCH 007/143] gfptar: use sqlite3 to reduce memory usage gfptar: new option: --workdir gfptar: support ";" and "?" 
in filename --- gftool/gfptar/gfptar | 550 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 473 insertions(+), 77 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 17e7ff5b0..bf4536d14 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -31,7 +31,6 @@ import subprocess import concurrent.futures import threading from typing import NoReturn -from urllib.parse import urlparse import shutil from contextlib import contextmanager import traceback @@ -42,14 +41,19 @@ import resource import string import random from decimal import Decimal, ROUND_DOWN +import sqlite3 +import json +import tempfile from docopt import docopt from schema import Schema, Use, Or # library -def format_seconds(seconds): - if seconds < 3600: +def format_seconds(seconds, minhour=False): + if minhour is False: + return f"{seconds:.0f}s" + elif seconds < 3600: minutes = seconds / 60 return f"{seconds:.0f}s({minutes:.1f}m)" else: @@ -112,6 +116,239 @@ def unhumanize_number(numstr, binary_prefix=False): return n * (base ** power) +# Do not use the same DB object from multi-threads. +class DB: + def __init__(self, filename): + self.filename = filename + self.con = sqlite3.connect(filename) + self.con.execute('PRAGMA synchronous = OFF') + # self.con.execute('PRAGMA cache_size = 1000') + # self.con.execute('PRAGMA mmap_size = 1000') + # self.con.execute('PRAGMA journal_mode=WAL') + # self.con.execute("PRAGMA busy_timeout=60000") + + def commit(self): + self.con.commit() + + def close(self): + self.con.close() + + def remove(self): + os.remove(self.filename) + + +class DBObj: + def dumps(self): + raise NotImplementedError + + @classmethod + def loads(cls, key, txt): + raise NotImplementedError + + +# Abstract +class DBCollection: + def __init__(self, db, obj_cls, table_name, clear=False): + self.db = db + self.con = db.con + self.obj_cls = obj_cls + self.table_name = table_name + if clear: + self.clear() + self.create_table() + + def create_table(self): + raise NotImplementedError + + def clear(self): + self.con.execute(f'DROP TABLE IF EXISTS {self.table_name}') + self._count = 0 + + def commit(self): + self.con.commit() + + def close(self): + self.con.close() + + def filename(self): + return self.db.filename + + def remove(self): + self.db.remove() + + def __len__(self): + res = self.con.execute(f'SELECT COUNT(*) FROM {self.table_name}') + return res.fetchone()[0] + + +class DBDict(DBCollection): + def create_table(self): + # Do not set "key TEXT PRIMARY KEY" here for speed reason. + self.con.execute(f''' + CREATE TABLE IF NOT EXISTS {self.table_name} + (key TEXT, value TEXT) + ''') + + def create_index(self): + self.con.execute(f'CREATE INDEX idx_key ON {self.table_name} (key)') + + def obj2json(self, obj): + return obj.dumps(for_list=False) + + def json2obj(self, key, j): + if j is None: + return None + return self.obj_cls.loads(key, j, for_list=False) + + def __setitem__(self, key, value): + self.con.execute(f''' + INSERT OR REPLACE INTO {self.table_name} (key, value) VALUES (?, ?) 
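DBDict stores JSON-encoded entries in a single sqlite3 table so the file listing no longer has to stay in RAM; PRAGMA synchronous = OFF trades durability for insert speed. A minimal sketch of the same key-value idea, with illustrative table and file names:

import json
import sqlite3

con = sqlite3.connect('entries.db')
con.execute('PRAGMA synchronous = OFF')   # favor speed over durability
con.execute('CREATE TABLE IF NOT EXISTS kv (key TEXT, value TEXT)')

def put(key, obj):
    con.execute('INSERT OR REPLACE INTO kv (key, value) VALUES (?, ?)',
                (key, json.dumps(obj, separators=(',', ':'))))

def get(key):
    row = con.execute('SELECT value FROM kv WHERE key = ?', (key,)).fetchone()
    return json.loads(row[0]) if row else None

put('dir/file0001.data', [0o600, 1, 'user1', 'group1', 123, 0, ''])
assert get('dir/file0001.data')[4] == 123
con.commit()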
+ ''', (key, self.obj2json(value))) + self.con.commit() + + def __getitem__(self, key): + res = self.con.execute( + f'SELECT value FROM {self.table_name} WHERE key = ?', (key,)) + row = res.fetchone() + if row: + return self.json2obj(row[0]) + else: + raise KeyError(key) + + def __delitem__(self, key): + if key in self: # __contains__() + self.con.execute( + f'DELETE FROM {self.table_name} WHERE key = ?', (key,)) + else: + raise KeyError(key) + + def __contains__(self, key): + res = self.con.execute( + f'SELECT 1 FROM {self.table_name} WHERE key = ?', (key,)) + row = res.fetchone() + return row is not None + + def keys(self): + res = self.con.execute(f'SELECT key FROM {self.table_name}') + for row in res: + yield row[0] + + def values(self): + res = self.con.execute(f'SELECT value FROM {self.table_name}') + for row in res: + yield self.json2obj(row[0]) + + def items(self): + res = self.con.execute(f'SELECT key,value FROM {self.table_name}') + for row in res: + yield row[0], self.json2obj(row[0], row[1]) + + def __iter__(self): + return self.keys() + + # sort: None, 'ASC', 'DESC' + def iterator(self, sort=None, offset=0, limit=-1): + sql = f'SELECT key,value FROM {self.table_name}' + if sort is not None: + if sort.upper() == 'ASC': + sql += ' ORDER BY key ASC' + elif sort.upper() == 'DESC': + sql += ' ORDER BY key DESC' + sql += f' LIMIT {limit} OFFSET {offset}' + res = self.con.execute(sql) + for row in res: + yield row[0], self.json2obj(row[0], row[1]) + + +class DBList(DBCollection): + ERRMSG_INDEX = 'list index out of range' + + def create_table(self): + self.con.execute(f''' + CREATE TABLE IF NOT EXISTS {self.table_name} + (id INTEGER PRIMARY KEY AUTOINCREMENT, value TEXT) + ''') + + def __repr__(self): + return repr(list(self)) + + def __str__(self): + return str(list(self)) + + def obj2json(self, obj): + return obj.dumps(for_list=True) + + def json2obj(self, key, j): + if j is None: + return None + return self.obj_cls.loads(key, j, for_list=True) + + def append(self, obj): + self.con.execute(f''' + INSERT INTO {self.table_name} (value) VALUES (?) + ''', (self.obj2json(obj),)) + + def __getitem__(self, index): + res = self.con.execute(f''' + SELECT id,value FROM {self.table_name} ORDER BY id LIMIT 1 OFFSET ? + ''', (index,)) + row = res.fetchone() + if row is None: + raise IndexError(self.ERRMSG_INDEX) + return self.json2obj(row[0], row[1]) + + def __setitem__(self, index, value): + with self.con: + res = self.con.execute(f''' + SELECT id FROM {self.table_name} ORDER BY id LIMIT 1 OFFSET ? + ''', (index,)) + row = res.fetchone() + if row is None: + raise IndexError(self.ERRMSG_INDEX) + self.con.execute(f''' + UPDATE {self.table_name} SET value = ? WHERE id = ? + ''', (self.obj2json(value), row[0])) + + def __delitem__(self, index): + with self.con: + res = self.con.execute(f''' + SELECT id FROM {self.table_name} ORDER BY id LIMIT 1 OFFSET ? + ''', (index,)) + row = res.fetchone() + if row is None: + raise IndexError(self.ERRMSG_INDEX) + self.con.execute(f''' + DELETE FROM {self.table_name} WHERE id = ? 
+ ''', (row[0],)) + + def __iter__(self): + res = self.con.execute(f''' + SELECT id,value FROM {self.table_name} ORDER BY id ASC + ''') + for row in res: + yield self.json2obj(row[0], row[1]) + + def __reversed__(self): + res = self.con.execute(f''' + SELECT id,value FROM {self.table_name} ORDER BY id DESC + ''') + for row in res: + yield self.json2obj(row[0], row[1]) + + # sort: None, 'ASC', 'DESC' + def iterator(self, sort=None, offset=0, limit=-1): + sql = f'SELECT id,value FROM {self.table_name}' + if sort is not None: + if sort.upper() == 'ASC': + sql += ' ORDER BY id ASC' + elif sort.upper() == 'DESC': + sql += ' ORDER BY id DESC' + sql += f' LIMIT {limit} OFFSET {offset}' + res = self.con.execute(sql) + for row in res: + yield self.json2obj(row[0], row[1]) + + class GfException(Exception): pass @@ -354,12 +591,20 @@ class Command(metaclass=abc.ABCMeta): raise NotImplementedError -class GfURLEntry(): +class GfURLEntry(DBObj): TYPE_FILE = 'FILE' TYPE_DIR = 'DIR' TYPE_SYMLINK = 'SYM' TYPE_OTHER = 'OTHER' + type_map = { + TYPE_FILE: 1, + TYPE_DIR: 2, + TYPE_SYMLINK: 3, + TYPE_OTHER: 4, + } + type_map_reverse = {v: k for k, v in type_map.items()} + def __init__(self, path, mode, file_type, uname, gname, size, mtime, linkname): self.path = path @@ -367,18 +612,42 @@ class GfURLEntry(): self.file_type = file_type self.uname = uname self.gname = gname - self.size = size self.mtime = mtime self.linkname = linkname - if not self.is_file(): + if self.is_file(): + self.size = size + else: self.size = 0 def __str__(self): return f'{self.path}, {self.mode:o}, {self.file_type}' def __repr__(self): - return str(self) + return (f'Entry(path={self.path},mode={oct(self.mode)},' + f'user={self.uname},group={self.gname})') + + # only path must be specified for key when using DBDict + def dumps(self, for_list=False): + t = self.type_map[self.file_type] + # serialize using list() to reduce size + # [0]...[6] + array = [self.mode, t, self.uname, self.gname, + self.size, self.mtime, self.linkname] + # save path to key when using dict, so don't save path to value + if for_list: + array.append(self.path) # [7] + return json.dumps(array, separators=(',', ':')) + + @classmethod + def loads(cls, key, txt, for_list=False): + o = json.loads(txt) + if for_list: + path = o[7] + else: + path = key + t = cls.type_map_reverse[o[1]] + return cls(path, o[0], t, o[2], o[3], o[4], o[5], o[6]) def subpath(self, baseurl): return baseurl.subpath(self.path) @@ -483,8 +752,41 @@ class GfURL(metaclass=abc.ABCMeta): return gfurl1 def __init__(self, url): - # allow_fragments=False : allow "#filename" - self._url = urlparse(url, allow_fragments=False) + self._url_str = url + scheme, host, path = self.parse(url) + self._scheme = scheme + self._host = host + self._path = path + + @classmethod + def parse(cls, url): + supported_classes = [GfURLGfarm] + + for c in supported_classes: + if url.startswith(f'{c.SCHEME}:'): + # gfarm://host/path -> //host/path + hostpath = url[(c.SCHEME_LEN+1):] + scheme = c.SCHEME + if hostpath.startswith('//'): # gfarm://... + hostpath = hostpath[2:] + hp = hostpath.split('/', 1) + host = hp[0] + if len(hp) >= 2: + # ex. gfarm://host/path -> path='/path' + # ex. gfarm://host/ -> path='/' + path = '/' + hp[1] + else: + # ex. gfarm://host + path = '/' + return scheme, host, path + else: # gfarm:/... or gfarm:... 
+ host = None + path = hostpath + return scheme, host, path + + # case of GfURLLocal: + # scheme, host, path + return None, None, url @property def basename(self): @@ -511,16 +813,22 @@ class GfURL(metaclass=abc.ABCMeta): @property def url_str(self): - return self._url.geturl() + return self._url_str @property def path(self): - return os.path.normpath(self._url.path) + return os.path.normpath(self._path) @property def root_url_str(self): - # ex. http://example.com/a/b/c -> http://example.com - return self.url_str[:-len(self._url.path)] + # ex. gfarm://example.com/a/b/c -> gfarm://example.com + # ex. gfarm:/abc.def/gh -> gfarm: + if self._scheme: + if self._host: + return f'{self._scheme}:{self._host}' + else: + return f'{self._scheme}:' + return '' def subpath(self, fullpath): base = self.url_str @@ -772,6 +1080,9 @@ USE_GFREG_PLUS = str2bool(os.getenv('GFREG_PLUS', 'True')) class GfURLGfarm(GfURL): + SCHEME = 'gfarm' + SCHEME_LEN = 5 + SCHEME_COLON = 'gfarm:' # Ex. 12345 -rw-rw-r-- 1 user1 gfarmadm 29 Jan 1 00:00:00 2022 fname PAT_ENTRY = re.compile(r'^\s*(\d+)\s+([-dl]\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+' r'(\d+)\s+(\S+\s+\d+\s+\d+:\d+:\d+\s+\d+)\s+(.+)$') @@ -782,7 +1093,7 @@ class GfURLGfarm(GfURL): @classmethod def is_my_URL(cls, url): - return url.startswith('gfarm:') + return url.startswith(cls.SCHEME_COLON) @staticmethod def to_oct_str(mode): @@ -1149,6 +1460,9 @@ def get_gid(group): class GfURLLocal(GfURL): + SCHEME = '' + SCHEME_LEN = 0 + def __init__(self, url): super().__init__(url) @@ -1553,7 +1867,6 @@ class GfTarFile(tarfile.TarFile): tarinfo = entry.toTarinfo(subpath) if tarinfo is None: # warning, skip return - # hard link is not supported if entry.is_file(): time.sleep(getattr(self, self.ATTR_DUMMY_SLEEP)) with RandomStream(entry.size) as f: @@ -1561,6 +1874,11 @@ class GfTarFile(tarfile.TarFile): else: self.addfile(tarinfo) + # NOTE: add() is not expected behavior. Use addfile() instead. + # - add() can copy a hard link, + # but a hard link cannot be extracted from gfexport (stream open) + # - When the specified for --create is a symlink, + # the entry will be archived as symlink. def _add_entry(self, subpath, entry): tarinfo = entry.toTarinfo(subpath) if tarinfo is None: # warning, skip @@ -1574,20 +1892,8 @@ class GfTarFile(tarfile.TarFile): else: self.addfile(tarinfo) - # NOTE: This is not expected behavior for local file. - # - This can copy a hard link, - # but a hard link cannot be extracted from gfexport (stream open) - # - When the specified for --create is a symlink, - # the entry will be archived as symlink. 
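
The NOTE above is the reason every entry goes through addfile() instead of add(): add() reads from the local filesystem (and records hard links), while addfile() takes an explicit TarInfo plus any readable stream, so data arriving through a pipe such as gfexport can still be archived. A minimal standard-library sketch of streaming one member into a tar this way (file names and contents are only examples):

import io
import tarfile
import time

data = b'hello from a stream\n'
info = tarfile.TarInfo(name='dir/hello.txt')
info.size = len(data)            # addfile() reads exactly info.size bytes
info.mtime = int(time.time())
info.mode = 0o644

with tarfile.open('example.tar', 'w') as tar:
    # the source does not need to be a real local file
    tar.addfile(info, fileobj=io.BytesIO(data))
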
- # def tar_add(self, subpath, entry): - # if entry.path: - # path = os.path.join(self.url_str, entry.path) - # else: - # path = self.url_str - # self.add(path, arcname=subpath, recursive=False) - -class RandomStream(): +class RandomStream: def __init__(self, size): self.size = size self.written = 0 @@ -1622,6 +1928,7 @@ class TestGfptar(unittest.TestCase): suite.addTest(TestGfptar('test_humanize')) suite.addTest(TestGfptar('test_unhumanize')) suite.addTest(TestGfptar('test_GfURL_use_gfarm_command_for_local')) + suite.addTest(TestGfptar('test_GfURL_parse')) return suite def test_humanize(self): @@ -1650,6 +1957,30 @@ class TestGfptar(unittest.TestCase): self.assertEqual(url.is_local(), True) self.assertEqual(url.path, '/tmp') + def test_GfURL_parse(self): + self.assertEqual(GfURL.parse('gfarm://host'), + ('gfarm', 'host', '/')) + self.assertEqual(GfURL.parse('gfarm://host/'), + ('gfarm', 'host', '/')) + self.assertEqual(GfURL.parse('gfarm://host/path'), + ('gfarm', 'host', '/path')) + self.assertEqual(GfURL.parse('gfarm://host/path://'), + ('gfarm', 'host', '/path://')) + self.assertEqual(GfURL.parse('gfarm://host/path;abc'), + ('gfarm', 'host', '/path;abc')) + self.assertEqual(GfURL.parse('gfarm://host/path#abc'), + ('gfarm', 'host', '/path#abc')) + self.assertEqual(GfURL.parse('gfarm://host/path?abc'), + ('gfarm', 'host', '/path?abc')) + self.assertEqual(GfURL.parse('gfarm:/path/abc'), + ('gfarm', None, '/path/abc')) + self.assertEqual(GfURL.parse('gfarm:path/abc'), + ('gfarm', None, 'path/abc')) + self.assertEqual(GfURL.parse('/abc/#def:/'), + (None, None, '/abc/#def:/')) + self.assertEqual(GfURL.parse('abc:/def://'), + (None, None, 'abc:/def://')) + class GfptarError(GfException): pass @@ -1676,6 +2007,7 @@ class CannotBeArchivedError(GfptarError): class GfptarCommand(Command): LIST_SUFFIX = '.lst' + SERIAL_FORMAT = '%04d' def __init__(self, name): self.init(name) @@ -1692,6 +2024,7 @@ class GfptarCommand(Command): self.bufsize = self.opt['--bufsize'] self.progress_enabled = self._progress_enabled() self.use_fsync = not self.opt['--disable-fsync'] + self.workdir = self.opt['--workdir'] self.memory_limit = self.opt['--memory'] if self.memory_limit is not None: @@ -1801,7 +2134,7 @@ class GfptarCommand(Command): # create tar per one entry self.opt['--size'] = 0 - pattern_jobs = [0, 1, 10] + pattern_jobs = [0, 10] for jobs in pattern_jobs: self.opt['--jobs'] = jobs self.test_simple('jobs_' + str(jobs), use_all_files=True) @@ -2222,6 +2555,11 @@ class GfptarCommand(Command): if self.verbose: print(fmt.format(*args)) + def print_trace(self, exc): + if self.verbose or self.debug: + tb = traceback.TracebackException.from_exception(exc) + logger.info(''.join(tb.format())) + def list_dummy_files(self, base_dir, num, size_min, size_max, dummy_sleep): # defaults files_per_dir = 1000 @@ -2234,6 +2572,8 @@ class GfptarCommand(Command): file_max_length = 30 choices = string.ascii_letters + string.digits + '漢あア()[]-' + other_symbols = '!"#$%&=^~|`@{}+*;:,.<>?_' + r"\'" + choices += other_symbols now = time.time() def generate_random_dirname(): @@ -2265,9 +2605,10 @@ class GfptarCommand(Command): return GfURLEntry(path, mode, file_type, uname, gname, size, mtime, linkname) - def rand_file(dir_path): + def rand_file(dir_path, idx): time.sleep(dummy_sleep_per_entry) - f = generate_random_filename() + # f = generate_random_filename() + f = f'{idx}.txt' path = os.path.join(dir_path, f) mode = 0o600 file_type = GfURLEntry.TYPE_FILE @@ -2285,12 +2626,12 @@ class GfptarCommand(Command): dir_ent = rand_dir() 
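
In dummy-input mode the file body handed to addfile() is RandomStream(entry.size) rather than real data; only the constructor of RandomStream is visible in this hunk. A minimal file-like object with the same intent (the class name and the use of os.urandom are assumptions for illustration, not the actual implementation) might look like:

import io
import os

class BoundedRandomStream(io.RawIOBase):
    # yields exactly `size` pseudo-random bytes, then EOF, so it can be
    # passed to TarFile.addfile(fileobj=...) in place of a real file
    def __init__(self, size):
        self.size = size
        self.written = 0

    def readable(self):
        return True

    def readinto(self, b):
        remain = self.size - self.written
        if remain <= 0:
            return 0
        n = min(len(b), remain)
        b[:n] = os.urandom(n)
        self.written += n
        return n

with BoundedRandomStream(16) as f:
    print(len(f.read()))    # -> 16
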
yield dir_ent for j in range(files_per_dir - 1): - yield rand_file(dir_ent.path) + yield rand_file(dir_ent.path, j) if remainder > 0: dir_ent = rand_dir() yield dir_ent for j in range(remainder - 1): - yield rand_file(dir_ent.path) + yield rand_file(dir_ent.path, j) def create(self, outdir, basedir, infiles): self.options_init() @@ -2301,11 +2642,13 @@ class GfptarCommand(Command): if self.size <= 0: self.size = 1 self.ratio = self.opt['--ratio'] + self.compress_type = self.opt['--type'] self.compress_prog = self.opt['--use-compress-program'] self.disable_gfarm_command = self.opt['--disable-gfarm-command'] self.use_gfarm_command = not self.disable_gfarm_command self.gfsched_interval = self.opt['--gfsched-interval'] + self.dummy_num = self.opt['--dummy-num'] if self.dummy_num is not None and self.dummy_num > 0: self.dummy_input = True @@ -2321,16 +2664,14 @@ class GfptarCommand(Command): self.split_size = self.size * 100 / self.ratio self.suffix = '.tar.' + self.compress_type - self.total_size = 0 - self.total_num = 0 - self.start_time = time.time() self.next_time = self.start_time + 1 - serial = 0 self.archived_size = 0 self.stored_size = 0 self.stored_num = 0 + self.total_size = 0 + self.total_num = 0 self.gfsched_lock = None self.gfsched_next = 0 @@ -2340,9 +2681,6 @@ class GfptarCommand(Command): self.cannot_be_archived = 0 cannot_read_dir = 0 - targroup_list = [] - targroup_list_len = 0 - targroup_size = 0 serial = 0 self.listing = True @@ -2380,6 +2718,22 @@ class GfptarCommand(Command): return gfurl.listdir(recursive=True, first=True, hardlink_warn=self.hardlink_warn) + # Temporary files are removed when the process exits. + # dir=None: system default + tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-', + dir=self.workdir) + tardb_prefix = os.path.join(tmpdir.name, 'list_for_create') + tardb_fmt = f'_{self.SERIAL_FORMAT}.db' + serial = 1 + tarlist_db = DB(tardb_prefix + tardb_fmt % serial) + tarlist = DBList(tarlist_db, GfURLEntry, 'tarlist') + tarlist_num = 0 + tarlist_size = 0 + + # TODO for debug (unnecessary) + # testdb = DB(os.path.join(tmpdir.name, 'test_dict.db')) + # testdict = DBDict(testdb, GfURLEntry, 'test_dict') + for infile in infiles_checked: if self.is_canceled(): logger.debug('Canceled (listdir 1): serial=%04d', serial) @@ -2401,26 +2755,33 @@ class GfptarCommand(Command): self.total_size += this_size self.total_num += 1 - targroup_size += this_size - if targroup_size > self.split_size \ - and targroup_list_len > 0: - serial += 1 + # testdict[entry.path] = entry # TODO + + tarlist_size += this_size + if tarlist_size > self.split_size and tarlist_num > 0: try: - self.create_job_execute(serial, targroup_list) + tarlist.commit() + tarlist.close() + # DO NOT share a DBList with children, + # share the filename instead. 
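
The comment just above is the key constraint behind passing dbfile around: a sqlite3 connection (and therefore a DBList) must not be shared with a child process, so only the database filename crosses the process boundary and each side opens its own connection. A self-contained sketch of that hand-off with plain sqlite3 and multiprocessing (the table and file names only mirror the ones used above):

import multiprocessing
import os
import sqlite3
import tempfile

def worker(dbfile):
    # the child opens its own connection to the same file
    con = sqlite3.connect(dbfile)
    rows = con.execute('SELECT value FROM tarlist ORDER BY id').fetchall()
    con.close()
    print('child read', len(rows), 'entries')

if __name__ == '__main__':
    tmpdir = tempfile.mkdtemp(prefix='gfptar-sketch-')
    dbfile = os.path.join(tmpdir, 'list_for_create_0001.db')
    con = sqlite3.connect(dbfile)
    con.execute('CREATE TABLE tarlist'
                ' (id INTEGER PRIMARY KEY AUTOINCREMENT, value TEXT)')
    con.executemany('INSERT INTO tarlist (value) VALUES (?)',
                    [('a.txt',), ('b.txt',), ('c.txt',)])
    con.commit()
    con.close()    # commit and close before the child opens the file
    p = multiprocessing.Process(target=worker, args=(dbfile,))
    p.start()
    p.join()
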
+ self.create_job_execute(serial, + tarlist.filename()) except Exception as e1: self.cancel() if has_error is None: has_error = e1 - targroup_list = [] - targroup_list_len = 0 - targroup_size = this_size + serial += 1 + tarlist_db = DB(tardb_prefix + tardb_fmt % serial) + tarlist = DBList(tarlist_db, GfURLEntry, 'tarlist') + tarlist_num = 0 + tarlist_size = this_size if has_error is not None: break # from listdir_switch() - targroup_list.append(entry) - targroup_list_len += 1 + tarlist.append(entry) + tarlist_num += 1 # progress for listing before starting threads - if serial == 0 and self.progress_enabled: + if serial == 1 and self.progress_enabled: now = time.time() if now >= self.next_time: self.next_time = now + 1 @@ -2428,22 +2789,41 @@ class GfptarCommand(Command): except MemoryError as e2: self.cancel() has_error = e2 - targroup_list = [] - targroup_list_len = 0 + tarlist = [] + tarlist_num = 0 + tarlist_size = 0 break # from loop of infiles_checked except Exception as e2: cannot_read_dir += 1 logger.info('%s: error while reading directory (%s)', gfurl.url_str, convert_message(e2)) + self.print_trace(e2) # continue (next infile) if has_error is not None: self.cancel() - if targroup_list_len > 0: - serial += 1 + # TODO + # testdict.commit() + # count = 0 + # for key, entry in testdict.items(): + # #print(f'TODO DEBUG: key={key}: {str(entry)}') # TODO + # count += 1 + # if count >= 2: + # break + # count = 0 + # for key in testdict: + # #print(f'TODO DEBUG: key={key}') # TODO + # count += 1 + # if count >= 2: + # break + # testdict.close() + + tarlist.commit() + tarlist.close() + if tarlist_num > 0: try: - self.create_job_execute(serial, targroup_list) + self.create_job_execute(serial, tarlist.filename()) except Exception as e: self.cancel() if has_error is None: @@ -2463,7 +2843,7 @@ class GfptarCommand(Command): if has_error is None: has_error = e if not self.quiet and self.stored_size > 0: - print('compression ratio: %.2f %% (%d/%d)' % + print('compression ratio: %.2f%% (%d/%d)' % (100 * self.archived_size / self.stored_size, self.archived_size, self.stored_size)) if has_error is not None: @@ -2482,11 +2862,11 @@ class GfptarCommand(Command): self.lock_init(False) self.create_job_execute = self._create_job_execute - def _create_job_execute(self, serial, filelist): - self.create_a_tar(serial, filelist) + def _create_job_execute(self, serial, arg): + self.create_a_tar(serial, arg) - def _create_job_execute_MT(self, serial, filelist): - t = self.executor.submit(self.create_a_tar, serial, filelist) + def _create_job_execute_MT(self, serial, arg): + t = self.executor.submit(self.create_a_tar, serial, arg) self.futures[t] = serial has_error = None try: @@ -2515,9 +2895,7 @@ class GfptarCommand(Command): if exc: logger.debug('serial=%04d: %s', self.futures[t], convert_message(exc)) - if self.verbose or self.debug: - tb = traceback.TracebackException.from_exception(exc) - logger.debug(''.join(tb.format())) + self.print_trace(exc) if not self.is_canceled(): self.cancel() for t2 in self.futures: @@ -2595,20 +2973,24 @@ class GfptarCommand(Command): logger.debug("selected target_host: %s", target_host) return target_host - def create_a_tar(self, serial, filelist): + def create_a_tar(self, serial, dbfile): try: - self.create_a_tar0(serial, filelist) + self.create_a_tar0(serial, dbfile) except Exception: if self.is_canceled(): raise self.error_canceled() else: raise - def create_a_tar0(self, serial, filelist): + def create_a_tar0(self, serial, dbfile): logger.debug('create_a_tar: start: %04d', 
serial) if self.is_canceled(): logger.debug('Canceled (create 1): serial=%04d', serial) return + + tardb1 = DB(dbfile) + filelist = DBList(tardb1, GfURLEntry, 'tarlist') + first = None last = None for entry in filelist: @@ -2629,7 +3011,7 @@ class GfptarCommand(Command): lastpath = last.subpath(self.basedir_url) outname = '%s..%s%s' % (firstpath, lastpath, self.suffix) - serial_str = '%04d_' % serial + serial_str = f'{self.SERIAL_FORMAT}_' % serial outname_max = self.outdir_url.MAXNAMLEN \ - len(serial_str) - len(self.LIST_SUFFIX) outname_len = len(outname.encode()) @@ -2652,7 +3034,9 @@ class GfptarCommand(Command): target_host=target_host, dummy_input=self.dummy_input, dummy_sleep=self.dummy_sleep) - filelist_ok = [] + + tardb_ok = DB(tardb1.filename + '_ok.db') + filelist_ok = DBList(tardb_ok, GfURLEntry, 'filelist_ok') for entry in filelist: if self.is_canceled(): logger.debug('Canceled (create 2): serial=%04d', serial) @@ -2682,8 +3066,14 @@ class GfptarCommand(Command): self.progress_for_create(now) tar.close() - tar_size = outurl.get_size() + tardb1.close() + tardb1.remove() + self.create_a_members_list(outurl, filelist_ok, target_host) + tardb_ok.close() + tardb_ok.remove() + + tar_size = outurl.get_size() with self.lock(): self.info('created(.tar): {}', outurl.url_str) self.archived_size += tar_size @@ -2720,8 +3110,9 @@ class GfptarCommand(Command): self.outdir_url = GfURL.init(outdir) self.indir = indir member_set = set(members) - self.compress_prog = self.opt['--use-compress-program'] self.same_owner = self.opt['--same-owner'] + + self.compress_prog = self.opt['--use-compress-program'] self.disable_gfarm_command = self.opt['--disable-gfarm-command'] self.use_gfarm_command = not self.disable_gfarm_command self.gfsched_interval = self.opt['--gfsched-interval'] @@ -2943,8 +3334,7 @@ class GfptarCommand(Command): if exc: logger.debug('%s: %s', self.futures[t], convert_message(exc)) - tb = traceback.TracebackException.from_exception(exc) - logger.debug(''.join(tb.format())) + self.print_trace(exc) if not self.is_canceled(): self.cancel() for t2 in self.futures: @@ -3070,11 +3460,14 @@ class GfptarCommand(Command): # lock required def progress_for_create(self, now): sec = now - self.start_time - sec_str = format_seconds(sec) + sec_str = format_seconds(sec, minhour=True) if self.listing: percent_str = '?' 
else: - percent1 = self.stored_num * 100 / self.total_num + if self.total_num > 0: + percent1 = self.stored_num * 100 / self.total_num + else: + percent1 = 0 if self.total_size > 0: percent2 = self.stored_size * 100 / self.total_size percent = (percent1 + percent2) / 2 @@ -3088,10 +3481,11 @@ class GfptarCommand(Command): stored_num_str = humanize_number(self.stored_num) total_num_str = humanize_number(self.total_num) stored_size_str = humanize_number(self.stored_size) + total_size_str = humanize_number(self.total_size) bytes_per_sec_str = humanize_number(bytes_per_sec) sys.stdout.write(f'\rcreated: {percent_str}% ' - f'num={stored_num_str}/{total_num_str} ' - f'{stored_size_str}B ' + f'{stored_size_str}B/{total_size_str}B ' + f'{stored_num_str}/{total_num_str} ' f'{sec_str} ' f'{bytes_per_sec_str}B/s ') @@ -3232,6 +3626,7 @@ Options: (https://docs.python.org/3/library/codecs.html#standard-encodings) [default: utf-8] --bufsize=BYTES buffer size to copy [default: 1Mi] + --workdir=DIR local directory for temporaly files --memory=NUM upper limit of memory size (bytes) (default: no limit) --test test mode (-q option is recommended) @@ -3274,6 +3669,7 @@ _schema = Schema({ '--disable-fsync': bool, '--gfsched-interval': Use(int), '--same-owner': bool, + '--workdir': Or(str, None), '--memory': Or(Use(unhumanize_number), None), '--test': bool, '--test-workdir-local': Or(str, None), From d37d2fd9c2541ded75010ef869669eeb3afc7d59 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 6 Jul 2024 11:23:31 +0900 Subject: [PATCH 008/143] gfptar: new option: --max-entries-per-tar (default 100K) --- gftool/gfptar/gfptar | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index bf4536d14..9ce7236fe 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2638,9 +2638,12 @@ class GfptarCommand(Command): self.outdir = outdir self.outdir_url = GfURL.init(outdir) self.basedir_url = GfURL.init(basedir) - self.size = self.opt['--size'] - if self.size <= 0: - self.size = 1 + self.assumed_size = self.opt['--size'] + if self.assumed_size <= 0: + self.assumed_size = 1 + self.max_entries_per_tar = self.opt['--max-entries-per-tar'] + if self.max_entries_per_tar <= 0: + self.max_entries_per_tar = 1 self.ratio = self.opt['--ratio'] self.compress_type = self.opt['--type'] @@ -2658,10 +2661,10 @@ class GfptarCommand(Command): self.dummy_size_max = self.opt['--dummy-size-max'] self.dummy_sleep = self.opt['--dummy-sleep'] if self.compress_type == GfTarFile.COMPRESS_TYPE_NO: - self.split_size = self.size + self.split_size = self.assumed_size self.suffix = '.tar' else: - self.split_size = self.size * 100 / self.ratio + self.split_size = self.assumed_size * 100 / self.ratio self.suffix = '.tar.' 
+ self.compress_type self.start_time = time.time() @@ -2725,6 +2728,7 @@ class GfptarCommand(Command): tardb_prefix = os.path.join(tmpdir.name, 'list_for_create') tardb_fmt = f'_{self.SERIAL_FORMAT}.db' serial = 1 + # to reduce memory usage tarlist_db = DB(tardb_prefix + tardb_fmt % serial) tarlist = DBList(tarlist_db, GfURLEntry, 'tarlist') tarlist_num = 0 @@ -2757,8 +2761,8 @@ class GfptarCommand(Command): # testdict[entry.path] = entry # TODO - tarlist_size += this_size - if tarlist_size > self.split_size and tarlist_num > 0: + if tarlist_size + this_size > self.split_size \ + or tarlist_num + 1 > self.max_entries_per_tar: try: tarlist.commit() tarlist.close() @@ -2774,11 +2778,10 @@ class GfptarCommand(Command): tarlist_db = DB(tardb_prefix + tardb_fmt % serial) tarlist = DBList(tarlist_db, GfURLEntry, 'tarlist') tarlist_num = 0 - tarlist_size = this_size - if has_error is not None: - break # from listdir_switch() + tarlist_size = 0 tarlist.append(entry) tarlist_num += 1 + tarlist_size += this_size # progress for listing before starting threads if serial == 1 and self.progress_enabled: @@ -2786,6 +2789,9 @@ class GfptarCommand(Command): if now >= self.next_time: self.next_time = now + 1 self.progress_for_create(now) + + if has_error is not None: + break # from listdir_switch() except MemoryError as e2: self.cancel() has_error = e2 @@ -2819,10 +2825,10 @@ class GfptarCommand(Command): # break # testdict.close() - tarlist.commit() - tarlist.close() if tarlist_num > 0: try: + tarlist.commit() + tarlist.close() self.create_job_execute(serial, tarlist.filename()) except Exception as e: self.cancel() @@ -3035,6 +3041,7 @@ class GfptarCommand(Command): dummy_input=self.dummy_input, dummy_sleep=self.dummy_sleep) + # to reduce memory usage tardb_ok = DB(tardb1.filename + '_ok.db') filelist_ok = DBList(tardb_ok, GfURLEntry, 'filelist_ok') for entry in filelist: @@ -3627,6 +3634,8 @@ Options: [default: utf-8] --bufsize=BYTES buffer size to copy [default: 1Mi] --workdir=DIR local directory for temporaly files + --max-entries-per-tar=NUM the number of entries per tar file + to limit memory usage [default: 100K] --memory=NUM upper limit of memory size (bytes) (default: no limit) --test test mode (-q option is recommended) @@ -3670,6 +3679,7 @@ _schema = Schema({ '--gfsched-interval': Use(int), '--same-owner': bool, '--workdir': Or(str, None), + '--max-entries-per-tar': Use(unhumanize_number), '--memory': Or(Use(unhumanize_number), None), '--test': bool, '--test-workdir-local': Or(str, None), From 0ef55782784a844304b0021fe5bc9334cb32a945 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 6 Jul 2024 15:47:56 +0900 Subject: [PATCH 009/143] gfptar: use multiprocessing.Process to use CPU cores efficiently --- gftool/gfptar/gfptar | 112 +++++++++++++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 30 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 9ce7236fe..d0974d54e 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -44,6 +44,8 @@ from decimal import Decimal, ROUND_DOWN import sqlite3 import json import tempfile +import multiprocessing +import queue from docopt import docopt from schema import Schema, Use, Or @@ -2761,8 +2763,9 @@ class GfptarCommand(Command): # testdict[entry.path] = entry # TODO - if tarlist_size + this_size > self.split_size \ - or tarlist_num + 1 > self.max_entries_per_tar: + if tarlist_num > 0 \ + and (tarlist_size + this_size > self.split_size + or tarlist_num + 1 > self.max_entries_per_tar): try: tarlist.commit() 
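
The two conditions above (cumulative size exceeding split_size, or entry count exceeding max_entries_per_tar) are the whole splitting policy; the surrounding code is bookkeeping for the per-group database files. The same rule over a plain iterator of (path, size) pairs, as a standalone sketch (function and variable names are illustrative):

def split_groups(entries, split_size, max_entries):
    # yields lists whose cumulative size stays within split_size and
    # whose length stays within max_entries (one list per tar file)
    group, group_size = [], 0
    for path, size in entries:
        if group and (group_size + size > split_size
                      or len(group) + 1 > max_entries):
            yield group
            group, group_size = [], 0
        group.append((path, size))
        group_size += size
    if group:
        yield group

# example: groups of at most 1000 bytes and at most 3 entries
for g in split_groups([('a', 400), ('b', 400), ('c', 400), ('d', 10),
                       ('e', 10), ('f', 10), ('g', 10)], 1000, 3):
    print(g)
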
tarlist.close() @@ -2801,8 +2804,8 @@ class GfptarCommand(Command): break # from loop of infiles_checked except Exception as e2: cannot_read_dir += 1 - logger.info('%s: error while reading directory (%s)', - gfurl.url_str, convert_message(e2)) + logger.warning('%s: error while reading directory (%s)', + gfurl.url_str, convert_message(e2)) self.print_trace(e2) # continue (next infile) @@ -2840,6 +2843,8 @@ class GfptarCommand(Command): self.create_job_finalize() + tmpdir.cleanup() + if self.progress_enabled: self.progress_for_create(time.time()) sys.stdout.write('\n') @@ -2994,9 +2999,62 @@ class GfptarCommand(Command): logger.debug('Canceled (create 1): serial=%04d', serial) return + input_queue = multiprocessing.Queue() + output_queue = multiprocessing.Queue() + process = multiprocessing.Process(target=self.create_a_tar_worker, + args=(input_queue, output_queue, + serial, dbfile)) + process.start() + try: + while True: + if self.is_canceled(): + logger.debug('Canceled (create 2): serial=%04d', serial) + input_queue.put('CANCEL') + try: + result = output_queue.get(timeout=1) + except queue.Empty: + if not process.is_alive(): + logger.error('Unexpected child process termination') + break + continue + if result[0] == 'ADD': + status, subpath, size_all = result + with self.lock(): + self.info('stored: {}', subpath) + self.stored_size += size_all + if self.progress_enabled: + self.stored_num += 1 + now = time.time() + if now >= self.next_time: + self.next_time = now + 1 + self.progress_for_create(now) + elif result[0] == 'DONE': + (status, tar_size, cannot_be_archived, + outurl, listurl) = result + with self.lock(): + self.archived_size += tar_size + self.info(f'created(.tar): {outurl}') + self.info(f'created(.lst): {listurl}') + break + elif result[0] == 'ERR': + status, exc = result + raise exc + else: + logger.warning(f'unknown result: {str(result)}') + finally: + process.join() + input_queue.close() + output_queue.close() + + def create_a_tar_worker(self, input_queue, output_queue, serial, dbfile): tardb1 = DB(dbfile) filelist = DBList(tardb1, GfURLEntry, 'tarlist') + if len(filelist) == 0: + tardb1.close() + tardb1.remove() + logger.error(f'empty filelist: {dbfile}') + return first = None last = None for entry in filelist: @@ -3044,46 +3102,41 @@ class GfptarCommand(Command): # to reduce memory usage tardb_ok = DB(tardb1.filename + '_ok.db') filelist_ok = DBList(tardb_ok, GfURLEntry, 'filelist_ok') + cannot_be_archived = 0 for entry in filelist: - if self.is_canceled(): - logger.debug('Canceled (create 2): serial=%04d', serial) - break + if not input_queue.empty(): + qdata = input_queue.get() + if qdata == 'CANCEL': + break subpath = entry.subpath(self.basedir_url) try: tar.add_entry(subpath, entry) - except MemoryError: + filelist_ok.append(entry) + size_all = entry.size_all() + output_queue.put(('ADD', subpath, size_all)) + except MemoryError as e: tar.close() - raise + output_queue.put(('ERR', e)) except Exception as e: - with self.lock(): - self.cannot_be_archived += 1 + cannot_be_archived += 1 logger.warning(convert_message(e)) continue - filelist_ok.append(entry) - size_all = entry.size_all() - with self.lock(): - self.info('stored: {}', subpath) - self.stored_size += size_all - if self.progress_enabled: - self.stored_num += 1 - now = time.time() - if now >= self.next_time: - self.next_time = now + 1 - self.progress_for_create(now) - tar.close() tardb1.close() tardb1.remove() - self.create_a_members_list(outurl, filelist_ok, target_host) + # for DEBUG + # raise Exception('unexpected 
raise') + # output_queue.put(('ERR', Exception('expected error'))) + + listurl = self.create_a_members_list(outurl, filelist_ok, target_host) tardb_ok.close() tardb_ok.remove() tar_size = outurl.get_size() - with self.lock(): - self.info('created(.tar): {}', outurl.url_str) - self.archived_size += tar_size + output_queue.put(('DONE', tar_size, cannot_be_archived, + outurl.url_str, listurl.url_str)) def create_a_members_list(self, url, filelist, target_host): outurl = GfURL.init(url.url_str + self.LIST_SUFFIX) @@ -3102,8 +3155,7 @@ class GfptarCommand(Command): subpath = entry.subpath(self.basedir_url) f.write(subpath) f.write('\n') - with self.lock(): - self.info('created(.lst): {}', outurl.url_str) + return outurl def error_canceled(self): return GfptarError('Canceled') @@ -3633,7 +3685,7 @@ Options: (https://docs.python.org/3/library/codecs.html#standard-encodings) [default: utf-8] --bufsize=BYTES buffer size to copy [default: 1Mi] - --workdir=DIR local directory for temporaly files + --workdir=DIR local directory for temporary files --max-entries-per-tar=NUM the number of entries per tar file to limit memory usage [default: 100K] --memory=NUM upper limit of memory size (bytes) From 0a23395a67b6043e6226ff28a35cbde701cd1ae6 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 8 Jul 2024 19:36:00 +0900 Subject: [PATCH 010/143] gfptar: prevent input_queue from filling up --- gftool/gfptar/gfptar | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index d0974d54e..31ef4d25d 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2081,9 +2081,10 @@ class GfptarCommand(Command): self.test_main() return except Exception as e: + self.cancel() if self.futures is not None: for t in self.futures: - logger.info('%s: Canceled', self.futures[t]) + logger.debug('Canceled: %s', self.futures[t]) t.cancel() if self.debug: raise @@ -2999,17 +3000,19 @@ class GfptarCommand(Command): logger.debug('Canceled (create 1): serial=%04d', serial) return - input_queue = multiprocessing.Queue() - output_queue = multiprocessing.Queue() - process = multiprocessing.Process(target=self.create_a_tar_worker, - args=(input_queue, output_queue, - serial, dbfile)) - process.start() try: + input_queue = multiprocessing.Queue() + output_queue = multiprocessing.Queue() + process = multiprocessing.Process(target=self.create_a_tar_worker, + args=(input_queue, output_queue, + serial, dbfile)) + process.start() + cancel = False while True: - if self.is_canceled(): + if self.is_canceled() and cancel is False: logger.debug('Canceled (create 2): serial=%04d', serial) input_queue.put('CANCEL') + cancel = True # prevent input_queue from filling up try: result = output_queue.get(timeout=1) except queue.Empty: @@ -3042,9 +3045,13 @@ class GfptarCommand(Command): else: logger.warning(f'unknown result: {str(result)}') finally: - process.join() + timeout = 10 + process.join(timeout) + process.kill() + process.close() input_queue.close() output_queue.close() + logger.debug(f'sub-process done: serial={serial}') def create_a_tar_worker(self, input_queue, output_queue, serial, dbfile): tardb1 = DB(dbfile) @@ -3104,9 +3111,13 @@ class GfptarCommand(Command): filelist_ok = DBList(tardb_ok, GfURLEntry, 'filelist_ok') cannot_be_archived = 0 for entry in filelist: - if not input_queue.empty(): + while not input_queue.empty(): qdata = input_queue.get() if qdata == 'CANCEL': + logger.debug('receive CANCEL from parent') + break + else: + 
logger.error('unexpected message from parent') break subpath = entry.subpath(self.basedir_url) try: From 4a599137d35d97e07a7673835569d139d1b3a6dd Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 9 Jul 2024 13:50:09 +0900 Subject: [PATCH 011/143] gfptar: change the format of progress_for_extract() gfptar: add cancelable points gfptar: retry process.close() to avoid ValueError --- gftool/gfptar/gfptar | 176 +++++++++++++++++++++++++++++++++---------- 1 file changed, 138 insertions(+), 38 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 31ef4d25d..226ce81e1 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -71,7 +71,7 @@ def humanize_number(num, binary_prefix=False): units = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'] base = Decimal(1000) if num < base: - return str(num) + return str(int(num)) n = Decimal(num) ulen = len(units) - 1 scale = 0 @@ -485,7 +485,7 @@ def execcmd(args, stdin=subprocess.DEVNULL, stderr=subprocess.PIPE, except Exception: logger.debug('cannot decode: err={}', err) pass - msg = '{}: {}'.format(str(args), err) + msg = f'{str(args)} (exit={ret}): {err}' if err and use_stderr: logger.debug(msg) raise GfException(msg) @@ -1934,6 +1934,8 @@ class TestGfptar(unittest.TestCase): return suite def test_humanize(self): + self.assertEqual(humanize_number(0.1), '0') + self.assertEqual(humanize_number(1.9), '1') self.assertEqual(humanize_number(1023, binary_prefix=True), '1023') self.assertEqual(humanize_number(1024, binary_prefix=True), '1.0Ki') self.assertEqual(humanize_number(999), '999') @@ -2082,10 +2084,12 @@ class GfptarCommand(Command): return except Exception as e: self.cancel() - if self.futures is not None: - for t in self.futures: - logger.debug('Canceled: %s', self.futures[t]) - t.cancel() + # NOTE: If threads are canceled, child processes may not stop. + # In that case, gfptar may freeze. + # if self.futures is not None: + # for t in self.futures: + # logger.debug('Canceled: %s', self.futures[t]) + # t.cancel() if self.debug: raise else: @@ -2107,6 +2111,7 @@ class GfptarCommand(Command): self.pid = os.getpid() out = gfwhoami() self.gfarm_user = out.strip() + self.test_unit() self.test_invalid('url', 'gfarm:/tmp', 'dst', True) self.test_invalid('dot1', '.', 'dst', True) @@ -2910,11 +2915,12 @@ class GfptarCommand(Command): self.print_trace(exc) if not self.is_canceled(): self.cancel() - for t2 in self.futures: - if t == t2: - continue - logger.error('%s: Canceled', self.futures[t2]) - t2.cancel() + # DO NOT cancel threads beacause it will freeze. 
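
In place of the commented-out Future.cancel() calls, cancellation is cooperative: the parent puts a single 'CANCEL' sentinel on the worker's input queue and keeps draining the output queue until the worker answers 'DONE'. A minimal parent/worker sketch of that message protocol, independent of the gfptar classes (the message names mirror the ones used here, everything else is illustrative):

import multiprocessing
import queue
import time

def worker(input_queue, output_queue, items):
    done = 0
    for item in items:
        if not input_queue.empty() and input_queue.get() == 'CANCEL':
            break                      # stop at the next safe point
        time.sleep(0.1)                # stands in for archiving one entry
        output_queue.put(('ADD', item))
        done += 1
    output_queue.put(('DONE', done))

if __name__ == '__main__':
    in_q = multiprocessing.Queue()
    out_q = multiprocessing.Queue()
    p = multiprocessing.Process(target=worker,
                                args=(in_q, out_q, list(range(100))))
    p.start()
    time.sleep(0.5)
    in_q.put('CANCEL')        # sent only once so the queue cannot fill up
    while True:
        try:
            msg = out_q.get(timeout=1)
        except queue.Empty:
            if not p.is_alive():
                break
            continue
        if msg[0] == 'DONE':
            print('worker stopped after', msg[1], 'entries')
            break
    p.join()
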
+ # for t2 in self.futures: + # if t == t2: + # continue + # logger.error('Canceled: %s', self.futures[t2]) + # t2.cancel() if has_error is None: # save first error has_error = exc return has_error @@ -2949,6 +2955,10 @@ class GfptarCommand(Command): if locked: self._lock.release() + def clear_canceled(self): + with self.lock(): + self.canceled.clear() + def cancel(self): with self.lock(): self.canceled.set() @@ -3007,6 +3017,19 @@ class GfptarCommand(Command): args=(input_queue, output_queue, serial, dbfile)) process.start() + except Exception: + try: + input_queue.close() + except Exception: + pass + try: + output_queue.close() + except Exception: + pass + raise + + save_exc = None + try: cancel = False while True: if self.is_canceled() and cancel is False: @@ -3017,7 +3040,7 @@ class GfptarCommand(Command): result = output_queue.get(timeout=1) except queue.Empty: if not process.is_alive(): - logger.error('Unexpected child process termination') + logger.warning('Unexpected child process termination') break continue if result[0] == 'ADD': @@ -3040,27 +3063,77 @@ class GfptarCommand(Command): self.info(f'created(.lst): {listurl}') break elif result[0] == 'ERR': - status, exc = result - raise exc + (status, exc_type_name, exc_value_str, + exc_traceback_str) = result + input_queue.put('ERR_COMPLETE') + raise Exception( + f'{exc_type_name}: {exc_value_str}\n' + f'{"".join(exc_traceback_str)}') else: logger.warning(f'unknown result: {str(result)}') + except Exception as e: + save_exc = e + raise finally: timeout = 10 - process.join(timeout) - process.kill() - process.close() - input_queue.close() - output_queue.close() - logger.debug(f'sub-process done: serial={serial}') + try: + process.join(timeout) + process.terminate() + process.kill() + # process.close() may raise ValueError. + # (I don't know the reason): + # ValueError: Cannot close a process while it is still + # running. You should first call join() or terminate(). + # + # Retry process.close() if ValueError is caught. 
+ ok = False + for i in range(50): # retry, max 5s + try: + process.close() + ok = True + break + except Exception: + logger.debug(f'retry process.close(): {i}') + time.sleep(0.1) + if not ok: + process.close() + except Exception as e: + self.print_trace(e) + if save_exc is None: + raise + finally: + input_queue.close() + output_queue.close() + logger.debug(f'sub-process exits: serial={serial}') def create_a_tar_worker(self, input_queue, output_queue, serial, dbfile): + try: + self.create_a_tar_worker0(input_queue, output_queue, + serial, dbfile) + except Exception: + exc_type, exc_value, exc_traceback = sys.exc_info() + exc_type_name = exc_type.__name__ + exc_value_str = str(exc_value) + exc_traceback_str = traceback.format_exception( + exc_type, exc_value, exc_traceback) + output_queue.put(('ERR', exc_type_name, exc_value_str, + exc_traceback_str)) + try: + input_queue.get(timeout=10) + # ERR_COMPLETE + except queue.Empty: + pass + + def create_a_tar_worker0(self, input_queue, output_queue, serial, dbfile): + signal.signal(signal.SIGINT, signal.SIG_DFL) + signal.signal(signal.SIGTERM, signal.SIG_DFL) tardb1 = DB(dbfile) filelist = DBList(tardb1, GfURLEntry, 'tarlist') if len(filelist) == 0: tardb1.close() tardb1.remove() - logger.error(f'empty filelist: {dbfile}') + logger.warning(f'empty filelist: {dbfile}') return first = None last = None @@ -3125,9 +3198,9 @@ class GfptarCommand(Command): filelist_ok.append(entry) size_all = entry.size_all() output_queue.put(('ADD', subpath, size_all)) - except MemoryError as e: + except MemoryError: tar.close() - output_queue.put(('ERR', e)) + raise except Exception as e: cannot_be_archived += 1 logger.warning(convert_message(e)) @@ -3139,7 +3212,6 @@ class GfptarCommand(Command): # for DEBUG # raise Exception('unexpected raise') - # output_queue.put(('ERR', Exception('expected error'))) listurl = self.create_a_members_list(outurl, filelist_ok, target_host) tardb_ok.close() @@ -3251,6 +3323,8 @@ class GfptarCommand(Command): member_set = new_member_set for member in member_set: + if self.is_canceled(): + raise self.error_canceled() type_arch = archive_dict.get(member, None) if type_arch is None: raise GfException('Not found in archive: ' + member) @@ -3262,6 +3336,8 @@ class GfptarCommand(Command): # add parent directories to update attributes (mode,mtime) new_member_set = set() for member in member_set: + if self.is_canceled(): + raise self.error_canceled() new_member_set.add(member) url = GfURL.init(member, local=True) # relative path for parent_url in url.parent_list: @@ -3316,6 +3392,8 @@ class GfptarCommand(Command): # NOTE: slow on Gfarm directory_list.sort() for d in directory_list: + if self.is_canceled(): + raise self.error_canceled() url_str = self.outdir_url.url_join(d) dir_url = GfURL.init(url_str) try: @@ -3332,6 +3410,8 @@ class GfptarCommand(Command): directory_list.sort(reverse=True) created = self.created_directory_set for d in directory_list: + if self.is_canceled(): + raise self.error_canceled() url_str = self.outdir_url.url_join(d) dir_url = GfURL.init(url_str) # url.path is normalized @@ -3360,7 +3440,11 @@ class GfptarCommand(Command): # process from leaves directory_list.sort(reverse=True) members_num = len(member_set) + self.clear_canceled() for d in directory_list: + if self.is_canceled(): + logger.debug('Canceled (extract 3)') + break if members_num > 0 and d not in member_set: continue tarinfo = self.dirstat_dict.get(d) @@ -3407,11 +3491,12 @@ class GfptarCommand(Command): self.print_trace(exc) if not self.is_canceled(): 
self.cancel() - for t2 in self.futures: - if t == t2: - continue - logger.error('%s: Canceled', self.futures[t2]) - t2.cancel() + # DO NOT cancel threads beacause it will freeze. + # for t2 in self.futures: + # if t == t2: + # continue + # logger.error('Canceled: %s', self.futures[t2]) + # t2.cancel() raise exc def extract_from_a_tar(self, serial, target, member_set): @@ -3523,9 +3608,11 @@ class GfptarCommand(Command): def progress_for_listing(self, now): sec = now - self.start_time + sec_str = format_seconds(sec, minhour=True) + total_num_str = humanize_number(self.total_num) sys.stdout.write(f'\rlisting: ' - f'num={self.total_num}, ' - f'sec={sec:.0f}') + f'{total_num_str}Ent, ' + f'{sec_str} ') # lock required def progress_for_create(self, now): @@ -3546,35 +3633,48 @@ class GfptarCommand(Command): percent_str = f'{percent:.0f}' if sec > 0: bytes_per_sec = self.stored_size / sec + ent_per_sec = self.stored_num / sec else: bytes_per_sec = 0 + ent_per_sec = 0 stored_num_str = humanize_number(self.stored_num) total_num_str = humanize_number(self.total_num) stored_size_str = humanize_number(self.stored_size) total_size_str = humanize_number(self.total_size) bytes_per_sec_str = humanize_number(bytes_per_sec) + ent_per_sec_str = humanize_number(ent_per_sec) sys.stdout.write(f'\rcreated: {percent_str}% ' f'{stored_size_str}B/{total_size_str}B ' - f'{stored_num_str}/{total_num_str} ' + f'{stored_num_str}/{total_num_str}Ent ' f'{sec_str} ' - f'{bytes_per_sec_str}B/s ') + f'{bytes_per_sec_str}B/s ' + f'{ent_per_sec_str}Ent/s ') # lock required def progress_for_extract(self, now): sec = now - self.start_time + sec_str = format_seconds(sec, minhour=True) if self.total_num > 0: percent = self.extracted_num * 100 / self.total_num else: percent = 0 if sec > 0: bytes_per_sec = self.extracted_size / sec + ent_per_sec = self.extracted_num / sec else: bytes_per_sec = 0 - sys.stdout.write(f'\rextracted: {percent:.0f}%, ' - f'num={self.extracted_num}/{self.total_num}, ' - f'size={self.extracted_size}, ' - f'sec={sec:.0f}, ' - f'B/s={bytes_per_sec:.0f} ') + ent_per_sec = 0 + extracted_num_str = humanize_number(self.extracted_num) + total_num_str = humanize_number(self.total_num) + extracted_size_str = humanize_number(self.extracted_size) + bytes_per_sec_str = humanize_number(bytes_per_sec) + ent_per_sec_str = humanize_number(ent_per_sec) + sys.stdout.write(f'\rextracted: {percent:.0f}% ' + f'{extracted_size_str}B ' + f'{extracted_num_str}/{total_num_str}Ent ' + f'{sec_str} ' + f'{bytes_per_sec_str}B/s ' + f'{ent_per_sec_str}Ent/s ') def list_simple(self, indir, quiet=False): self.options_init() From 4289b09f80a4ac346b77ddef9fd42c3203c2a880 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 9 Jul 2024 18:01:51 +0900 Subject: [PATCH 012/143] gfptar: implement DBSet() --- gftool/gfptar/gfptar | 233 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 193 insertions(+), 40 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 226ce81e1..bddce989e 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -135,19 +135,33 @@ class DB: def close(self): self.con.close() - def remove(self): + def unlink(self): os.remove(self.filename) +# Interface class DBObj: - def dumps(self): + def dumps(self, for_list=False): raise NotImplementedError @classmethod - def loads(cls, key, txt): + def loads(cls, key, txt, for_list=False): raise NotImplementedError +# Example of DBObj +class IntObj(DBObj): + def __init__(self, i): + self.i = i + + def dumps(self, for_list=False): + return 
str(self.i) + + @classmethod + def loads(cls, key, txt, for_list=False): + return int(txt) + + # Abstract class DBCollection: def __init__(self, db, obj_cls, table_name, clear=False): @@ -157,26 +171,30 @@ class DBCollection: self.table_name = table_name if clear: self.clear() - self.create_table() + else: + self.create_table() def create_table(self): raise NotImplementedError def clear(self): self.con.execute(f'DROP TABLE IF EXISTS {self.table_name}') - self._count = 0 + self.create_table() def commit(self): self.con.commit() def close(self): self.con.close() + self.con = None def filename(self): return self.db.filename - def remove(self): - self.db.remove() + def unlink(self): + if self.con is not None: + self.close() + self.db.unlink() def __len__(self): res = self.con.execute(f'SELECT COUNT(*) FROM {self.table_name}') @@ -185,35 +203,30 @@ class DBCollection: class DBDict(DBCollection): def create_table(self): - # Do not set "key TEXT PRIMARY KEY" here for speed reason. self.con.execute(f''' CREATE TABLE IF NOT EXISTS {self.table_name} - (key TEXT, value TEXT) + (key TEXT PRIMARY KEY, value TEXT) ''') - def create_index(self): - self.con.execute(f'CREATE INDEX idx_key ON {self.table_name} (key)') - - def obj2json(self, obj): + def obj2txt(self, obj): return obj.dumps(for_list=False) - def json2obj(self, key, j): - if j is None: + def txt2obj(self, key, txt): + if txt is None: return None - return self.obj_cls.loads(key, j, for_list=False) + return self.obj_cls.loads(key, txt, for_list=False) def __setitem__(self, key, value): self.con.execute(f''' INSERT OR REPLACE INTO {self.table_name} (key, value) VALUES (?, ?) - ''', (key, self.obj2json(value))) - self.con.commit() + ''', (key, self.obj2txt(value))) def __getitem__(self, key): res = self.con.execute( f'SELECT value FROM {self.table_name} WHERE key = ?', (key,)) row = res.fetchone() if row: - return self.json2obj(row[0]) + return self.txt2obj(key, row[0]) else: raise KeyError(key) @@ -236,14 +249,21 @@ class DBDict(DBCollection): yield row[0] def values(self): - res = self.con.execute(f'SELECT value FROM {self.table_name}') + res = self.con.execute(f'SELECT key,value FROM {self.table_name}') for row in res: - yield self.json2obj(row[0]) + yield self.txt2obj(row[0], row[1]) def items(self): res = self.con.execute(f'SELECT key,value FROM {self.table_name}') for row in res: - yield row[0], self.json2obj(row[0], row[1]) + yield row[0], self.txt2obj(row[0], row[1]) + + def update(self, other=None, **kwargs): + if other: + for key, value in other.items(): + self[key] = value + for key, value in kwargs.items(): + self[key] = value def __iter__(self): return self.keys() @@ -259,7 +279,58 @@ class DBDict(DBCollection): sql += f' LIMIT {limit} OFFSET {offset}' res = self.con.execute(sql) for row in res: - yield row[0], self.json2obj(row[0], row[1]) + yield row[0], self.txt2obj(row[0], row[1]) + + +class DBSet(DBCollection): + def create_table(self): + self.con.execute(f''' + CREATE TABLE IF NOT EXISTS {self.table_name} + (key TEXT PRIMARY KEY) + ''') + + def obj2txt(self, obj): + return obj.dumps(for_list=True) + + def txt2obj(self, key, txt): + if txt is None: + return None + return self.obj_cls.loads(key, txt, for_list=True) + + def update(self, others): + for key in others: + self.add(key) + + def add(self, key): + self.con.execute(f''' + INSERT OR IGNORE INTO {self.table_name} (key) VALUES (?) 
+ ''', (self.obj2txt(key),)) + + def remove(self, key): + if key in self: + self.con.execute(f'DELETE FROM {self.table_name} WHERE key = ?', + (self.obj2txt(key),)) + else: + raise KeyError(key) + + def discard(self, key): + self.con.execute(f'DELETE FROM {self.table_name} WHERE key = ?', + (self.obj2txt(key),)) + + def __delitem__(self, key): + self.remove(key) + + def __contains__(self, key): + res = self.con.execute(f''' + SELECT 1 FROM {self.table_name} WHERE key = ? + ''', (self.obj2txt(key),)) + row = res.fetchone() + return row is not None + + def __iter__(self): + res = self.con.execute(f'SELECT key FROM {self.table_name}') + for row in res: + yield self.txt2obj(row[0], row[0]) class DBList(DBCollection): @@ -277,18 +348,22 @@ class DBList(DBCollection): def __str__(self): return str(list(self)) - def obj2json(self, obj): + def obj2txt(self, obj): return obj.dumps(for_list=True) - def json2obj(self, key, j): - if j is None: + def txt2obj(self, key, txt): + if txt is None: return None - return self.obj_cls.loads(key, j, for_list=True) + return self.obj_cls.loads(key, txt, for_list=True) def append(self, obj): self.con.execute(f''' INSERT INTO {self.table_name} (value) VALUES (?) - ''', (self.obj2json(obj),)) + ''', (self.obj2txt(obj),)) + + def extend(self, lst): + for obj in lst: + self.append(obj) def __getitem__(self, index): res = self.con.execute(f''' @@ -297,7 +372,7 @@ class DBList(DBCollection): row = res.fetchone() if row is None: raise IndexError(self.ERRMSG_INDEX) - return self.json2obj(row[0], row[1]) + return self.txt2obj(row[0], row[1]) def __setitem__(self, index, value): with self.con: @@ -309,7 +384,7 @@ class DBList(DBCollection): raise IndexError(self.ERRMSG_INDEX) self.con.execute(f''' UPDATE {self.table_name} SET value = ? WHERE id = ? 
- ''', (self.obj2json(value), row[0])) + ''', (self.obj2txt(value), row[0])) def __delitem__(self, index): with self.con: @@ -328,14 +403,14 @@ class DBList(DBCollection): SELECT id,value FROM {self.table_name} ORDER BY id ASC ''') for row in res: - yield self.json2obj(row[0], row[1]) + yield self.txt2obj(row[0], row[1]) def __reversed__(self): res = self.con.execute(f''' SELECT id,value FROM {self.table_name} ORDER BY id DESC ''') for row in res: - yield self.json2obj(row[0], row[1]) + yield self.txt2obj(row[0], row[1]) # sort: None, 'ASC', 'DESC' def iterator(self, sort=None, offset=0, limit=-1): @@ -348,7 +423,7 @@ class DBList(DBCollection): sql += f' LIMIT {limit} OFFSET {offset}' res = self.con.execute(sql) for row in res: - yield self.json2obj(row[0], row[1]) + yield self.txt2obj(row[0], row[1]) class GfException(Exception): @@ -1927,10 +2002,8 @@ class TestGfptar(unittest.TestCase): @staticmethod def suite(): suite = unittest.TestSuite() - suite.addTest(TestGfptar('test_humanize')) - suite.addTest(TestGfptar('test_unhumanize')) - suite.addTest(TestGfptar('test_GfURL_use_gfarm_command_for_local')) - suite.addTest(TestGfptar('test_GfURL_parse')) + suite.addTest( + unittest.defaultTestLoader.loadTestsFromTestCase(TestGfptar)) return suite def test_humanize(self): @@ -1985,6 +2058,86 @@ class TestGfptar(unittest.TestCase): self.assertEqual(GfURL.parse('abc:/def://'), (None, None, 'abc:/def://')) + def test_DBDict(self): + tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-test-', + dir=None) + testdb = DB(os.path.join(tmpdir.name, 'test.db')) + d = DBDict(testdb, IntObj, 'test_dict') + + d.update({'a': IntObj(1), 'b': IntObj(2), 'c': IntObj(3)}) + self.assertEqual(d['a'], 1) + self.assertEqual(d['b'], 2) + self.assertEqual(d['c'], 3) + + d.clear() + d['a'] = IntObj(4) + self.assertEqual(d['a'], 4) + + d.clear() + d.update({'a': IntObj(1)}) + d['a'] = IntObj(2) + self.assertEqual(d['a'], 2) + + d.clear() + d.update({'a': IntObj(1)}) + del d['a'] + self.assertNotIn('a', d) + + d.unlink() + + def test_DBSet(self): + tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-test-', + dir=None) + testdb = DB(os.path.join(tmpdir.name, 'test.db')) + s = DBSet(testdb, IntObj, 'test_set') + + s.update({IntObj(1), IntObj(2), IntObj(3)}) + self.assertEqual(set(s), {1, 2, 3}) + + s.clear() + s.add(IntObj(1)) + self.assertIn(IntObj(1), s) + + s.clear() + s.update({IntObj(1), IntObj(2), IntObj(3)}) + s.add(IntObj(4)) + self.assertIn(IntObj(4), s) + + s.clear() + s.update({IntObj(1), IntObj(2), IntObj(3)}) + s.remove(IntObj(2)) + self.assertNotIn(IntObj(2), s) + + with self.assertRaises(KeyError): + s.remove(IntObj(4)) + s.discard(IntObj(4)) + + s.unlink() + + def test_DBList(self): + tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-test-', + dir=None) + testdb = DB(os.path.join(tmpdir.name, 'test.db')) + lst = DBList(testdb, IntObj, 'test_list') + lst.extend([IntObj(1), IntObj(2), IntObj(3)]) + self.assertEqual(list(lst), [1, 2, 3]) + + lst.clear() + lst.append(IntObj(1)) + self.assertEqual(lst[0], 1) + + lst.clear() + lst.extend([IntObj(1), IntObj(2), IntObj(3)]) + lst[1] = IntObj(4) + self.assertEqual(list(lst), [1, 4, 3]) + + lst.clear() + lst.extend([IntObj(1), IntObj(2), IntObj(3)]) + del lst[1] + self.assertEqual(list(lst), [1, 3]) + + lst.unlink() + class GfptarError(GfException): pass @@ -3132,7 +3285,7 @@ class GfptarCommand(Command): if len(filelist) == 0: tardb1.close() - tardb1.remove() + tardb1.unlink() logger.warning(f'empty filelist: {dbfile}') return first = None @@ -3208,14 +3361,14 
@@ class GfptarCommand(Command): tar.close() tardb1.close() - tardb1.remove() + tardb1.unlink() # for DEBUG # raise Exception('unexpected raise') listurl = self.create_a_members_list(outurl, filelist_ok, target_host) tardb_ok.close() - tardb_ok.remove() + tardb_ok.unlink() tar_size = outurl.get_size() output_queue.put(('DONE', tar_size, cannot_be_archived, From 2f7981f5f8d74f0e8ddbdd9034d00dbac664c259 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 9 Jul 2024 18:21:24 +0900 Subject: [PATCH 013/143] gfptar: simplify --- gftool/gfptar/gfptar | 55 +++++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index bddce989e..39d3072f5 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -141,11 +141,11 @@ class DB: # Interface class DBObj: - def dumps(self, for_list=False): + def dumps(self, for_dict): raise NotImplementedError @classmethod - def loads(cls, key, txt, for_list=False): + def loads(cls, key, txt, for_dict): raise NotImplementedError @@ -154,11 +154,11 @@ class IntObj(DBObj): def __init__(self, i): self.i = i - def dumps(self, for_list=False): + def dumps(self, for_dict): return str(self.i) @classmethod - def loads(cls, key, txt, for_list=False): + def loads(cls, key, txt, for_dict): return int(txt) @@ -174,6 +174,14 @@ class DBCollection: else: self.create_table() + def obj2txt(self, obj): + return obj.dumps(self.for_dict) + + def txt2obj(self, key, txt): + if txt is None: + return None + return self.obj_cls.loads(key, txt, self.for_dict) + def create_table(self): raise NotImplementedError @@ -207,14 +215,7 @@ class DBDict(DBCollection): CREATE TABLE IF NOT EXISTS {self.table_name} (key TEXT PRIMARY KEY, value TEXT) ''') - - def obj2txt(self, obj): - return obj.dumps(for_list=False) - - def txt2obj(self, key, txt): - if txt is None: - return None - return self.obj_cls.loads(key, txt, for_list=False) + self.for_dict = True def __setitem__(self, key, value): self.con.execute(f''' @@ -288,14 +289,7 @@ class DBSet(DBCollection): CREATE TABLE IF NOT EXISTS {self.table_name} (key TEXT PRIMARY KEY) ''') - - def obj2txt(self, obj): - return obj.dumps(for_list=True) - - def txt2obj(self, key, txt): - if txt is None: - return None - return self.obj_cls.loads(key, txt, for_list=True) + self.for_dict = False def update(self, others): for key in others: @@ -341,6 +335,7 @@ class DBList(DBCollection): CREATE TABLE IF NOT EXISTS {self.table_name} (id INTEGER PRIMARY KEY AUTOINCREMENT, value TEXT) ''') + self.for_dict = False def __repr__(self): return repr(list(self)) @@ -348,14 +343,6 @@ class DBList(DBCollection): def __str__(self): return str(list(self)) - def obj2txt(self, obj): - return obj.dumps(for_list=True) - - def txt2obj(self, key, txt): - if txt is None: - return None - return self.obj_cls.loads(key, txt, for_list=True) - def append(self, obj): self.con.execute(f''' INSERT INTO {self.table_name} (value) VALUES (?) 
@@ -705,24 +692,24 @@ class GfURLEntry(DBObj): f'user={self.uname},group={self.gname})') # only path must be specified for key when using DBDict - def dumps(self, for_list=False): + def dumps(self, for_dict): t = self.type_map[self.file_type] # serialize using list() to reduce size # [0]...[6] array = [self.mode, t, self.uname, self.gname, self.size, self.mtime, self.linkname] # save path to key when using dict, so don't save path to value - if for_list: + if not for_dict: array.append(self.path) # [7] return json.dumps(array, separators=(',', ':')) @classmethod - def loads(cls, key, txt, for_list=False): + def loads(cls, key, txt, for_dict): o = json.loads(txt) - if for_list: - path = o[7] - else: + if for_dict: path = key + else: + path = o[7] t = cls.type_map_reverse[o[1]] return cls(path, o[0], t, o[2], o[3], o[4], o[5], o[6]) From 7f5c01474778ca53bc1558023de26795ae070098 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 9 Jul 2024 22:54:22 +0900 Subject: [PATCH 014/143] gfptar: use sqlite3 for --extract to reduce memory usage --- gftool/gfptar/gfptar | 247 +++++++++++++++++++++++++++---------------- 1 file changed, 154 insertions(+), 93 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 39d3072f5..11fff92a1 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -120,9 +120,10 @@ def unhumanize_number(numstr, binary_prefix=False): # Do not use the same DB object from multi-threads. class DB: - def __init__(self, filename): + def __init__(self, filename, check_same_thread=True): self.filename = filename - self.con = sqlite3.connect(filename) + self.con = sqlite3.connect(filename, + check_same_thread=check_same_thread) self.con.execute('PRAGMA synchronous = OFF') # self.con.execute('PRAGMA cache_size = 1000') # self.con.execute('PRAGMA mmap_size = 1000') @@ -141,7 +142,8 @@ class DB: # Interface class DBObj: - def dumps(self, for_dict): + @classmethod + def dumps(cls, obj, for_dict): raise NotImplementedError @classmethod @@ -154,14 +156,66 @@ class IntObj(DBObj): def __init__(self, i): self.i = i - def dumps(self, for_dict): - return str(self.i) + @classmethod + def dumps(cls, obj, for_dict): + if isinstance(obj, int): + return str(obj) + return str(obj.i) @classmethod def loads(cls, key, txt, for_dict): return int(txt) +class StrObj(DBObj): + def __init__(self, s): + self.s = s + + @classmethod + def dumps(cls, obj, for_dict): + if isinstance(obj, str): + return obj + return obj.s + + @classmethod + def loads(cls, key, txt, for_dict): + return txt + + +class TypeName(DBObj): + def __init__(self, file_type, filename): + self.file_type = file_type + self.filename = filename + + @classmethod + def dumps(cls, obj, for_dict): + array = [obj.file_type, obj.filename] + return json.dumps(array, separators=(',', ':')) + + @classmethod + def loads(cls, key, txt, for_dict): + array = json.loads(txt) + return cls(array[0], array[1]) + + +class FileAttr(DBObj): + def __init__(self, mode, mtime, user, group): + self.mode = mode + self.mtime = mtime + self.user = user + self.group = group + + @classmethod + def dumps(cls, obj, for_dict): + array = [obj.mode, obj.mtime, obj.user, obj.group] + return json.dumps(array, separators=(',', ':')) + + @classmethod + def loads(cls, key, txt, for_dict): + array = json.loads(txt) + return cls(array[0], array[1], array[2], array[3]) + + # Abstract class DBCollection: def __init__(self, db, obj_cls, table_name, clear=False): @@ -175,7 +229,7 @@ class DBCollection: self.create_table() def obj2txt(self, obj): - return 
obj.dumps(self.for_dict) + return self.obj_cls.dumps(obj, self.for_dict) def txt2obj(self, key, txt): if txt is None: @@ -244,6 +298,11 @@ class DBDict(DBCollection): row = res.fetchone() return row is not None + def get(self, key, default=None): + if key in self: # __contains__() + return self[key] # __getitem__() + return default + def keys(self): res = self.con.execute(f'SELECT key FROM {self.table_name}') for row in res: @@ -326,6 +385,19 @@ class DBSet(DBCollection): for row in res: yield self.txt2obj(row[0], row[0]) + # sort: None, 'ASC', 'DESC' + def iterator(self, sort=None, offset=0, limit=-1): + sql = f'SELECT key FROM {self.table_name}' + if sort is not None: + if sort.upper() == 'ASC': + sql += ' ORDER BY key ASC' + elif sort.upper() == 'DESC': + sql += ' ORDER BY key DESC' + sql += f' LIMIT {limit} OFFSET {offset}' + res = self.con.execute(sql) + for row in res: + yield self.txt2obj(row[0], row[0]) + class DBList(DBCollection): ERRMSG_INDEX = 'list index out of range' @@ -692,15 +764,16 @@ class GfURLEntry(DBObj): f'user={self.uname},group={self.gname})') # only path must be specified for key when using DBDict - def dumps(self, for_dict): - t = self.type_map[self.file_type] + @classmethod + def dumps(cls, obj, for_dict): + t = obj.type_map[obj.file_type] # serialize using list() to reduce size # [0]...[6] - array = [self.mode, t, self.uname, self.gname, - self.size, self.mtime, self.linkname] + array = [obj.mode, t, obj.uname, obj.gname, + obj.size, obj.mtime, obj.linkname] # save path to key when using dict, so don't save path to value if not for_dict: - array.append(self.path) # [7] + array.append(obj.path) # [7] return json.dumps(array, separators=(',', ':')) @classmethod @@ -2055,9 +2128,13 @@ class TestGfptar(unittest.TestCase): self.assertEqual(d['a'], 1) self.assertEqual(d['b'], 2) self.assertEqual(d['c'], 3) + self.assertEqual(d.get('c'), 3) + self.assertEqual(d.get('d'), None) + self.assertNotIn('d', d) + self.assertIn('a', d) d.clear() - d['a'] = IntObj(4) + d['a'] = 4 self.assertEqual(d['a'], 4) d.clear() @@ -2084,6 +2161,7 @@ class TestGfptar(unittest.TestCase): s.clear() s.add(IntObj(1)) self.assertIn(IntObj(1), s) + self.assertIn(1, s) s.clear() s.update({IntObj(1), IntObj(2), IntObj(3)}) @@ -2882,10 +2960,6 @@ class GfptarCommand(Command): tarlist_num = 0 tarlist_size = 0 - # TODO for debug (unnecessary) - # testdb = DB(os.path.join(tmpdir.name, 'test_dict.db')) - # testdict = DBDict(testdb, GfURLEntry, 'test_dict') - for infile in infiles_checked: if self.is_canceled(): logger.debug('Canceled (listdir 1): serial=%04d', serial) @@ -2907,8 +2981,6 @@ class GfptarCommand(Command): self.total_size += this_size self.total_num += 1 - # testdict[entry.path] = entry # TODO - if tarlist_num > 0 \ and (tarlist_size + this_size > self.split_size or tarlist_num + 1 > self.max_entries_per_tar): @@ -2958,22 +3030,6 @@ class GfptarCommand(Command): if has_error is not None: self.cancel() - # TODO - # testdict.commit() - # count = 0 - # for key, entry in testdict.items(): - # #print(f'TODO DEBUG: key={key}: {str(entry)}') # TODO - # count += 1 - # if count >= 2: - # break - # count = 0 - # for key in testdict: - # #print(f'TODO DEBUG: key={key}') # TODO - # count += 1 - # if count >= 2: - # break - # testdict.close() - if tarlist_num > 0: try: tarlist.commit() @@ -3391,7 +3447,8 @@ class GfptarCommand(Command): self.outdir = outdir self.outdir_url = GfURL.init(outdir) self.indir = indir - member_set = set(members) + orig_member_set = set(members) + del members self.same_owner 
= self.opt['--same-owner'] self.compress_prog = self.opt['--use-compress-program'] @@ -3405,10 +3462,22 @@ class GfptarCommand(Command): if not indir_url.is_directory(): raise self.error_not_a_gfptar_directory(indir_url.url_str) - search_target = len(member_set) > 0 - archive_dict = {} # member -> (file_type, tar filename) - target_set_all = set() # all tar files - directory_list = [] + search_target = len(orig_member_set) > 0 + + # Temporary files are removed when the process exits. + # dir=None: system default + tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-', + dir=self.workdir) + db_file = os.path.join(tmpdir.name, 'extract.db') + # to reduce memory usage + db = DB(db_file, check_same_thread=False) + + # member name -> (file_type, tar filename) + archive_dict = DBDict(db, TypeName, 'archive_dict') + # all tar files + target_set_all = DBSet(db, StrObj, 'target_set_all') + directory_set_all = DBSet(db, StrObj, 'directory_set_all') + self.total_num = 0 self.start_time = time.time() self.next_time = self.start_time + 1 @@ -3439,10 +3508,10 @@ class GfptarCommand(Command): file_type = line[:1] path = line[2:].lstrip('/') if search_target: - archive_dict[path] = (file_type, arch_url_str) + archive_dict[path] = TypeName(file_type, arch_url_str) else: if file_type == 'D': - directory_list.append(path) + directory_set_all.add(path) if self.progress_enabled: now = time.time() if now >= self.next_time: @@ -3454,84 +3523,76 @@ class GfptarCommand(Command): if search_target: # selected tar files - target_set = set() - directory_set = set() - new_member_set = set() - for member in member_set: + target_set = DBSet(db, StrObj, 'target_set') + directory_set = DBSet(db, StrObj, 'directory_set') + member_set1 = DBSet(db, StrObj, 'member_set1') + for member in orig_member_set: m = GfURL.init(member, local=True) - new_member_set.add(m.path) # normalized - member_set = new_member_set + member_set1.add(m.path) # normalized - for member in member_set: + for member in member_set1: if self.is_canceled(): raise self.error_canceled() type_arch = archive_dict.get(member, None) if type_arch is None: raise GfException('Not found in archive: ' + member) - file_type, target_arch = type_arch - target_set.add(target_arch) + target_set.add(type_arch.filename) if file_type == 'D': directory_set.add(member) # add parent directories to update attributes (mode,mtime) - new_member_set = set() - for member in member_set: + member_set2 = DBSet(db, StrObj, 'member_set2') + for member in member_set1: if self.is_canceled(): raise self.error_canceled() - new_member_set.add(member) + member_set2.add(member) url = GfURL.init(member, local=True) # relative path for parent_url in url.parent_list: path = parent_url.path if path != '.' 
and path != '/': type_arch = archive_dict.get(path, None) if type_arch is not None: # found - file_type, target_arch = type_arch - target_set.add(target_arch) + target_set.add(type_arch.filename) directory_set.add(path) - new_member_set.add(path) - del member_set - member_set = new_member_set # replace + member_set2.add(path) + archive_dict.clear() + member_set1.clear() + member_set = member_set2 # replace self.total_num = len(member_set) # re-set - del archive_dict - directory_list = list(directory_set) - del directory_set else: target_set = target_set_all - - target_list = list(target_set) - del target_set - target_list.sort() + directory_set = directory_set_all + member_set = DBSet(db, StrObj, 'member_set') self.outdir_url.create_new_dir() - self.created_directory_set = set() - # self.extract_directories(directory_list) - self.extract_directories_fast(directory_list) + self.created_directory_set = DBSet(db, StrObj, 'created_directory_set') + # self.extract_directories(directory_set) + self.extract_directories_fast(directory_set) self.extracted_num = 0 self.extracted_size = 0 self.start_time = time.time() self.next_time = self.start_time + 1 - self.dirstat_dict = {} + self.dirstat_dict = DBDict(db, FileAttr, 'dirstat_dict') self.gfsched_lock = None self.gfsched_next = 0 self.gfsched_list = None if self.MT_enabled(): - self.extract_from_archives_MT(target_list, member_set) + self.extract_from_archives_MT(target_set, member_set) else: - self.extract_from_archives(target_list, member_set) + self.extract_from_archives(target_set, member_set) if self.progress_enabled: self.progress_for_extract(time.time()) sys.stdout.write('\n') - self.update_stat_for_directories(directory_list, member_set) + self.update_stat_for_directories(directory_set, member_set) - def extract_directories(self, directory_list): + def extract_directories(self, directory_set): # NOTE: slow on Gfarm - directory_list.sort() - for d in directory_list: + for d in directory_set.iterator(sort='ASC'): if self.is_canceled(): raise self.error_canceled() url_str = self.outdir_url.url_join(d) @@ -3545,43 +3606,41 @@ class GfptarCommand(Command): self.info('prepare_dir: {}', dir_url.url_str) self.created_directory_set.add(dir_url.path) - def extract_directories_fast(self, directory_list): + def extract_directories_fast(self, directory_set): # faster implementation for gfmkdir - directory_list.sort(reverse=True) - created = self.created_directory_set - for d in directory_list: + created_set = self.created_directory_set + for d in directory_set.iterator(sort='DESC'): if self.is_canceled(): raise self.error_canceled() url_str = self.outdir_url.url_join(d) dir_url = GfURL.init(url_str) # url.path is normalized - if dir_url.path in created: + if dir_url.path in created_set: logger.debug('skip (already created): %s', url_str) continue parent_url = dir_url.parent - if parent_url.path in created: + if parent_url.path in created_set: dir_url.mkdir() - created.add(dir_url.path) + created_set.add(dir_url.path) self.info('prepare_dir: {}', dir_url.path) else: # no parent dir_url.makedirs() - created.add(dir_url.path) + created_set.add(dir_url.path) self.info('prepare_dir: {}', dir_url.path) - created.add(parent_url.path) + created_set.add(parent_url.path) self.info('prepare_dir: {}', parent_url.path) for p in parent_url.parent_list: path = p.path if path == '.' 
or path == '/': continue - created.add(path) + created_set.add(path) self.info('prepare_dir: {}', path) - def update_stat_for_directories(self, directory_list, member_set): + def update_stat_for_directories(self, directory_set, member_set): # process from leaves - directory_list.sort(reverse=True) members_num = len(member_set) self.clear_canceled() - for d in directory_list: + for d in directory_set.iterator(sort='DESC'): if self.is_canceled(): logger.debug('Canceled (extract 3)') break @@ -3601,21 +3660,21 @@ class GfptarCommand(Command): logger.debug("update_stat: %s, %s", d, oct(tarinfo.mode)) self.info('update_stat: {}', d) - def extract_from_archives(self, target_list, member_set): + def extract_from_archives(self, target_set, member_set): self.lock_init(False) serial = 0 - for target in target_list: + for target in target_set.iterator(sort='ASC'): logger.debug('target_set: %s', target) serial += 1 self.extract_from_a_tar(serial, target, member_set) - def extract_from_archives_MT(self, target_list, member_set): + def extract_from_archives_MT(self, target_set, member_set): self.lock_init(True) with concurrent.futures.ThreadPoolExecutor( max_workers=self.jobs) as executor: self.futures = {} # tar filenames serial = 0 - for target in target_list: + for target in target_set.iterator(sort='ASC'): logger.debug('target_set: %s', target) serial += 1 t = executor.submit(self.extract_from_a_tar, @@ -3715,7 +3774,9 @@ class GfptarCommand(Command): # NOTE: already created logger.debug('extract,dir: %s', outfile) with self.lock(): - self.dirstat_dict[outfile] = tarinfo + self.dirstat_dict[outfile] = FileAttr( + tarinfo.mode, tarinfo.mtime, + tarinfo.uname, tarinfo.gname) elif tarinfo.issym(): logger.debug('extract,link: %s, %s', outfile, tarinfo.linkname) From 331fcfb417379d2cec00fe16254e92e477cfac8f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 11 Jul 2024 11:32:52 +0900 Subject: [PATCH 015/143] gfptar: create processes before creating threads --- gftool/gfptar/gfptar | 307 +++++++++++++++++++++++++++++-------------- 1 file changed, 205 insertions(+), 102 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 11fff92a1..e9974155c 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -539,16 +539,7 @@ class GfLogger(logging.getLoggerClass()): logger = None -def logger_init(name, loglevel=logging.WARNING, debug=False, verbose=False): - global logger - - if logger is not None: - return logger - - logger = logging.getLogger() # RootLogger - logger.__class__ = GfLogger - logger.myinit() - logger.setLevel(loglevel) +def logger_config(logger, name, loglevel, debug, verbose): strm = logging.StreamHandler() # stderr if debug: fmt = '%(filename)s:%(levelname)s:L%(lineno)d:' + \ @@ -562,6 +553,27 @@ def logger_init(name, loglevel=logging.WARNING, debug=False, verbose=False): strm.setFormatter(formatter_strm) strm.setLevel(loglevel) logger.addHandler(strm) + + +def logger_init(name, loglevel=logging.WARNING, debug=False, verbose=False): + global logger + + if logger is not None: + return logger + + logger = logging.getLogger() # RootLogger + logger.__class__ = GfLogger + logger.myinit() + logger.setLevel(loglevel) + logger_config(logger, name, loglevel, debug, verbose) + return logger + + +# unused +def logger_init_without_lock(name, loglevel=logging.WARNING, + debug=False, verbose=False): + logger = logging.RootLogger(loglevel) + logger_config(logger, name, loglevel, debug, verbose) return logger @@ -690,6 +702,7 @@ def convert_message(error): class 
Command(metaclass=abc.ABCMeta): def init(self, name) -> NoReturn: + self.name = name self._docopt = docopt(self.getDoc()) self.opt = self.getSchema().validate(self._docopt) self.debug = self.opt['--debug'] @@ -705,10 +718,9 @@ class Command(metaclass=abc.ABCMeta): loglevel = logging.ERROR self.loglevel = loglevel - # use stderr - mylogger = logger_init(name, loglevel=loglevel, debug=self.debug, - verbose=self.verbose) - self.logger = mylogger + # use stderr with lock + logger_init(name, loglevel=loglevel, debug=self.debug, + verbose=self.verbose) logger.debug('USE_GFMKDIR_PLUS: %s', USE_GFMKDIR_PLUS) logger.debug('USE_GFCHMOD_PLUS: %s', USE_GFCHMOD_PLUS) @@ -2273,7 +2285,7 @@ class GfptarCommand(Command): return not self.debug and not self.verbose and not self.quiet def run(self): - self.logger.debug(pformat(self.opt)) + logger.debug(pformat(self.opt)) try: outdir = self.opt['--create'] if outdir: @@ -2317,6 +2329,9 @@ class GfptarCommand(Command): else: logger.error(convert_message(e)) sys.exit(1) + finally: + self.worker_terminate() + logger.debug('exit') def test_main(self): self.am_I_gfarmroot = am_I_gfarmroot() @@ -2423,6 +2438,7 @@ class GfptarCommand(Command): try: self.create(test1_dir, workdir, [testsrc_name]) except Exception as e: + self.print_trace(e) if str(e).startswith('specifying '): ok = True else: @@ -2912,10 +2928,10 @@ class GfptarCommand(Command): cannot_read_dir = 0 serial = 0 self.listing = True + has_error = None self.create_job_init() self.sig_init() - has_error = None infiles_checked = [] for infile in infiles: @@ -2962,7 +2978,7 @@ class GfptarCommand(Command): for infile in infiles_checked: if self.is_canceled(): - logger.debug('Canceled (listdir 1): serial=%04d', serial) + logger.debug(f'Canceled (listdir 1): serial={serial}') break url_str = os.path.join(self.basedir_url.url_str, infile) @@ -2971,8 +2987,7 @@ class GfptarCommand(Command): try: for entry in listdir_switch(gfurl): if self.is_canceled(): - logger.debug('Canceled (listdir 2): serial=%04d', - serial) + logger.debug(f'Canceled (listdir 2): serial={serial}') break logger.debug('listdir: entry.path=%s', entry.path) # include length of path @@ -3043,7 +3058,7 @@ class GfptarCommand(Command): with self.lock(): # for progress self.listing = False - self.create_job_finalize() + self.create_job_final() tmpdir.cleanup() @@ -3068,18 +3083,42 @@ class GfptarCommand(Command): if self.MT_enabled(): self.lock_init(True) self.create_job_execute = self._create_job_execute_MT - self.executor = concurrent.futures.ThreadPoolExecutor( + self.thread_pool = concurrent.futures.ThreadPoolExecutor( max_workers=self.jobs) self.futures = {} # key: serial number + else: self.lock_init(False) self.create_job_execute = self._create_job_execute + if self.jobs <= 0: + worker_num = 1 + else: + worker_num = self.jobs + + # kill old worker processes for the regress test + self.worker_terminate() + + class Started(): + pass + + self.worker_list = [] # (started, process, inq, outq) + self.worker_ident_dict = {} + for i in range(worker_num): + inq = multiprocessing.Queue() + outq = multiprocessing.Queue() + # ProcessPoolExecutor cannot be utilized here, + # because, for example, a Queue cannot be passed as an argument.
+ process = multiprocessing.Process(target=self.create_a_tar_process, + args=(inq, outq)) + started = Started() + started.status = False + self.worker_list.append((started, process, inq, outq)) def _create_job_execute(self, serial, arg): - self.create_a_tar(serial, arg) + self.create_a_tar_thread(serial, arg) def _create_job_execute_MT(self, serial, arg): - t = self.executor.submit(self.create_a_tar, serial, arg) + t = self.thread_pool.submit(self.create_a_tar_thread, serial, arg) self.futures[t] = serial has_error = None try: @@ -3089,16 +3128,67 @@ class GfptarCommand(Command): except Exception as e: # NOTE: cannot catch TimeoutError # ignore timeout - logger.debug(f'_create_job_execute_MT(): {str(e)}') + logger.debug(f'_create_job_execute_MT(): {type(e)} {str(e)}') if has_error is not None: raise has_error - def create_job_finalize(self, timeout=None): - if not self.MT_enabled(): + def process_close(self, process, index=None): + if not process.is_alive(): return - has_error = self._create_job_check_MT(timeout=timeout) - if has_error is not None: - raise has_error + logger.debug(f'process_close() start (index={index}):' + f' pid={process.pid}') + # timeout = 10 + # process.join(timeout) + process.terminate() + process.kill() + # process.close() may raise ValueError. + # (I don't know the reason): + # ValueError: Cannot close a process while it is still + # running. You should first call join() or terminate(). + # + # Retry process.close() if ValueError is caught. + ok = False + for i in range(50): # retry, max 5s + try: + process.close() + ok = True + break + except ValueError as e: + logger.debug(f'retry[{i}] process.close() (index={index}):' + f' {str(e)}') + time.sleep(0.1) + if not ok: + process.close() + logger.debug(f'process_close() finished (index={index})') + + def create_job_final(self, timeout=None): + if self.MT_enabled(): + has_error = self._create_job_check_MT(timeout=timeout) + self.thread_pool.shutdown(cancel_futures=True) + if has_error is not None: + raise has_error + self.worker_terminate() + + def worker_terminate(self): + if getattr(self, 'worker_list', None) is None: + return + err_list = [] + for index, worker in enumerate(self.worker_list): + started, process, inq, outq = worker + try: + inq.close() + except Exception: + pass + try: + outq.close() + except Exception: + pass + try: + if started.status: + self.process_close(process, index=index) + except Exception: + err_list.append(worker) + self.worker_list = err_list def _create_job_check_MT(self, timeout=0.1): has_error = None @@ -3106,7 +3196,7 @@ class GfptarCommand(Command): timeout=timeout): exc = t.exception() if exc: - logger.debug('serial=%04d: %s', self.futures[t], + logger.debug('serial=%d: %s', self.futures[t], convert_message(exc)) self.print_trace(exc) if not self.is_canceled(): @@ -3191,55 +3281,65 @@ class GfptarCommand(Command): logger.debug("selected target_host: %s", target_host) return target_host - def create_a_tar(self, serial, dbfile): + def create_a_tar_thread(self, serial, dbfile): try: - self.create_a_tar0(serial, dbfile) + self.create_a_tar_thread0(serial, dbfile) except Exception: if self.is_canceled(): raise self.error_canceled() else: raise - def create_a_tar0(self, serial, dbfile): - logger.debug('create_a_tar: start: %04d', serial) + def create_a_tar_thread0(self, serial, dbfile): + logger.debug(f'create_a_tar: start (serial={serial})') if self.is_canceled(): - logger.debug('Canceled (create 1): serial=%04d', serial) + logger.debug(f'Canceled (create 1): serial={serial}') return - 
try: - input_queue = multiprocessing.Queue() - output_queue = multiprocessing.Queue() - process = multiprocessing.Process(target=self.create_a_tar_worker, - args=(input_queue, output_queue, - serial, dbfile)) - process.start() - except Exception: - try: - input_queue.close() - except Exception: - pass - try: - output_queue.close() - except Exception: - pass - raise + with self.lock(): + # Thread ID -> index of worker_list + ident = threading.get_ident() + idx = self.worker_ident_dict.get(ident) + if idx is None: + idx = len(self.worker_ident_dict) + self.worker_ident_dict[ident] = idx + logger.debug('worker_ident_dict: ' + + str(self.worker_ident_dict)) + + started, process, inq, outq = self.worker_list[idx] + if not started.status: + process.start() # high cost to start + started.status = True + + inq.put(('START', serial, dbfile)) + response = outq.get() + if response != 'READY': + logger.error(f'Unexpected child response: {response}') + return - save_exc = None try: cancel = False while True: if self.is_canceled() and cancel is False: - logger.debug('Canceled (create 2): serial=%04d', serial) - input_queue.put('CANCEL') + logger.debug(f'Canceled (create 2): serial={serial}') + inq.put('CANCEL') cancel = True # prevent input_queue from filling up try: - result = output_queue.get(timeout=1) + result = outq.get(timeout=1) except queue.Empty: + result = None if not process.is_alive(): - logger.warning('Unexpected child process termination') + logger.warning('Unexpected child process termination' + f' (serial={serial})') break + if result is None: + logger.debug('waiting for message from child:' + f' serial={serial}') continue - if result[0] == 'ADD': + if len(result) == 0: + logger.warning('unknown result (None)') + break + elif result[0] == 'ADD': status, subpath, size_all = result with self.lock(): self.info('stored: {}', subpath) @@ -3261,52 +3361,53 @@ class GfptarCommand(Command): elif result[0] == 'ERR': (status, exc_type_name, exc_value_str, exc_traceback_str) = result - input_queue.put('ERR_COMPLETE') + inq.put('ERR_COMPLETE') raise Exception( f'{exc_type_name}: {exc_value_str}\n' f'{"".join(exc_traceback_str)}') else: logger.warning(f'unknown result: {str(result)}') - except Exception as e: - save_exc = e - raise + break finally: - timeout = 10 + logger.debug(f'(parent) subprocess finished: serial={serial}') + + def create_a_tar_process(self, input_queue, output_queue): + global logger + logger = logger_init_without_lock( + self.name, loglevel=self.loglevel, + debug=self.debug, verbose=self.verbose) + + signal.signal(signal.SIGINT, signal.SIG_DFL) + signal.signal(signal.SIGTERM, signal.SIG_DFL) + while True: + pid = os.getpid() + logger.debug(f'create_a_tar_process: start, pid={pid}') try: - process.join(timeout) - process.terminate() - process.kill() - # process.close() may raise ValueError. - # (I don't know the reason): - # ValueError: Cannot close a process while it is still - # running. You should first call join() or terminate(). - # - # Retry process.close() if ValueError is caught. 
- ok = False - for i in range(50): # retry, max 5s - try: - process.close() - ok = True - break - except Exception: - logger.debug(f'retry process.close(): {i}') - time.sleep(0.1) - if not ok: - process.close() - except Exception as e: - self.print_trace(e) - if save_exc is None: - raise - finally: - input_queue.close() - output_queue.close() - logger.debug(f'sub-process exits: serial={serial}') + self.create_a_tar_process0(input_queue, output_queue) + except KeyboardInterrupt: + pass - def create_a_tar_worker(self, input_queue, output_queue, serial, dbfile): + def create_a_tar_process0(self, input_queue, output_queue): + request = input_queue.get() + if len(request) > 0 and request[0] == 'START': + output_queue.put('READY') + else: + logger.error(f'Unexpected request for child: {request}') + return # exit + op, serial, dbfile = request + logger.debug(f'create_a_tar_process0: start (serial={serial})') try: - self.create_a_tar_worker0(input_queue, output_queue, - serial, dbfile) - except Exception: + result = self.create_a_tar_process1(input_queue, output_queue, + serial, dbfile) + tar_size, cannot_be_archived, outurl, listurl = result + output_queue.put(('DONE', tar_size, cannot_be_archived, + outurl, listurl)) + logger.debug(f'subprocess exits: serial={serial}') + except KeyboardInterrupt: + pass + except Exception as e: + logger.debug(f'create_a_tar_process: error (serial={serial})') + self.print_trace(e) exc_type, exc_value, exc_traceback = sys.exc_info() exc_type_name = exc_type.__name__ exc_value_str = str(exc_value) @@ -3320,9 +3421,8 @@ class GfptarCommand(Command): except queue.Empty: pass - def create_a_tar_worker0(self, input_queue, output_queue, serial, dbfile): - signal.signal(signal.SIGINT, signal.SIG_DFL) - signal.signal(signal.SIGTERM, signal.SIG_DFL) + def create_a_tar_process1(self, input_queue, output_queue, serial, dbfile): + logger.debug(f'create_a_tar_process1: start (serial={serial})') tardb1 = DB(dbfile) filelist = DBList(tardb1, GfURLEntry, 'tarlist') @@ -3330,7 +3430,7 @@ class GfptarCommand(Command): tardb1.close() tardb1.unlink() logger.warning(f'empty filelist: {dbfile}') - return + return 0, 0, '', '' first = None last = None for entry in filelist: @@ -3380,8 +3480,12 @@ class GfptarCommand(Command): filelist_ok = DBList(tardb_ok, GfURLEntry, 'filelist_ok') cannot_be_archived = 0 for entry in filelist: + logger.debug(f'subprocess(serial={serial}): {entry.path}') while not input_queue.empty(): - qdata = input_queue.get() + try: + qdata = input_queue.get(timeout=1) + except queue.Empty: + qdata = None if qdata == 'CANCEL': logger.debug('receive CANCEL from parent') break @@ -3414,8 +3518,7 @@ class GfptarCommand(Command): tardb_ok.unlink() tar_size = outurl.get_size() - output_queue.put(('DONE', tar_size, cannot_be_archived, - outurl.url_str, listurl.url_str)) + return tar_size, cannot_be_archived, outurl.url_str, listurl.url_str def create_a_members_list(self, url, filelist, target_host): outurl = GfURL.init(url.url_str + self.LIST_SUFFIX) From 35e34f4494b0380ed8412f67cc468aeeaff6d526 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 12 Jul 2024 13:36:09 +0900 Subject: [PATCH 016/143] gfptar: lock for serializing access to sqlite3 --- gftool/gfptar/gfptar | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index e9974155c..1265e9080 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -125,10 +125,7 @@ class DB: self.con = sqlite3.connect(filename, 
check_same_thread=check_same_thread) self.con.execute('PRAGMA synchronous = OFF') - # self.con.execute('PRAGMA cache_size = 1000') - # self.con.execute('PRAGMA mmap_size = 1000') - # self.con.execute('PRAGMA journal_mode=WAL') - # self.con.execute("PRAGMA busy_timeout=60000") + self.con.execute('PRAGMA journal_mode = OFF') # disable rollback def commit(self): self.con.commit() @@ -3213,13 +3210,12 @@ class GfptarCommand(Command): def lock_init(self, enable, timeout=None): if enable: + self._lock = threading.Lock() + self.lock = self.lock_timeout if timeout is not None: - self._lock = threading.Lock() self._lock_timeout = timeout - self.lock = self.lock_timeout else: - self._lock = None - self.lock = threading.Lock + self._lock_timeout = -1 else: self._lock = None self.lock = self.lock_noop @@ -3228,7 +3224,6 @@ class GfptarCommand(Command): def lock_noop(self): yield - # NOTE: not used @contextmanager def lock_timeout(self): locked = True @@ -3572,13 +3567,16 @@ class GfptarCommand(Command): tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-', dir=self.workdir) db_file = os.path.join(tmpdir.name, 'extract.db') + db_file_target = os.path.join(tmpdir.name, 'target.db') + # to reduce memory usage db = DB(db_file, check_same_thread=False) + db_target = DB(db_file_target, check_same_thread=False) # member name -> (file_type, tar filename) archive_dict = DBDict(db, TypeName, 'archive_dict') # all tar files - target_set_all = DBSet(db, StrObj, 'target_set_all') + target_set_all = DBSet(db_target, StrObj, 'target_set_all') directory_set_all = DBSet(db, StrObj, 'directory_set_all') self.total_num = 0 @@ -3626,7 +3624,7 @@ class GfptarCommand(Command): if search_target: # selected tar files - target_set = DBSet(db, StrObj, 'target_set') + target_set = DBSet(db_target, StrObj, 'target_set') directory_set = DBSet(db, StrObj, 'directory_set') member_set1 = DBSet(db, StrObj, 'member_set1') for member in orig_member_set: @@ -3819,7 +3817,8 @@ class GfptarCommand(Command): use_gfarm_command=self.use_gfarm_command) tar = GfTarFile.extract_open(arch_url, self.bufsize, compress_prog=self.compress_prog) - members_num = len(member_set) + with self.lock(): + members_num = len(member_set) index = serial while True: if self.is_canceled(): @@ -3836,8 +3835,9 @@ class GfptarCommand(Command): if tarinfo is None: break if members_num > 0: - if tarinfo.name not in member_set: - continue # not a target + with self.lock(): + if tarinfo.name not in member_set: + continue # not a target # members_num == 0 -> extract all # ex. 
/a/b/c/ -> a/b/c @@ -3876,10 +3876,10 @@ class GfptarCommand(Command): elif tarinfo.isdir(): # NOTE: already created logger.debug('extract,dir: %s', outfile) + fattr = FileAttr(tarinfo.mode, tarinfo.mtime, + tarinfo.uname, tarinfo.gname) with self.lock(): - self.dirstat_dict[outfile] = FileAttr( - tarinfo.mode, tarinfo.mtime, - tarinfo.uname, tarinfo.gname) + self.dirstat_dict[outfile] = fattr elif tarinfo.issym(): logger.debug('extract,link: %s, %s', outfile, tarinfo.linkname) From 6d440ccf070dcdea591726ba8fdbad2e5fc5bd9f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 12 Jul 2024 14:34:46 +0900 Subject: [PATCH 017/143] gfptar: change the format of progress_for_listing() --- gftool/gfptar/gfptar | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 1265e9080..f5e60330b 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3914,9 +3914,15 @@ class GfptarCommand(Command): sec = now - self.start_time sec_str = format_seconds(sec, minhour=True) total_num_str = humanize_number(self.total_num) + if sec > 0: + ent_per_sec = self.total_num / sec + else: + ent_per_sec = 0 + ent_per_sec_str = humanize_number(ent_per_sec) sys.stdout.write(f'\rlisting: ' - f'{total_num_str}Ent, ' - f'{sec_str} ') + f'{total_num_str}Ent ' + f'{sec_str} ' + f'{ent_per_sec_str}Ent/s ') # lock required def progress_for_create(self, now): From 250bee883bffc135e57ea8d34162deeef989873f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 12 Jul 2024 14:40:00 +0900 Subject: [PATCH 018/143] (temporal change) "gfptar --debug --test" to investigate error on GitHub Actions --- regress/gftool/gfptar/gfptar.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regress/gftool/gfptar/gfptar.sh b/regress/gftool/gfptar/gfptar.sh index 429dd9791..641978f6a 100755 --- a/regress/gftool/gfptar/gfptar.sh +++ b/regress/gftool/gfptar/gfptar.sh @@ -12,7 +12,7 @@ mkdir $localtmp LANG=en_US.UTF-8 export LANG -if gfptar -q --test \ +if gfptar -d --test \ --test-workdir-local=$localtmp \ --test-workdir-gfarm=gfarm:$gftmp; then exit_code=$exit_pass From b909518719d01a6d86cf1b98f3648bb960940ad2 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 12 Jul 2024 17:11:28 +0900 Subject: [PATCH 019/143] gfptar: print correct lineno in debug log on Python 3.12 or later --- gftool/gfptar/gfptar | 69 +++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 40 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f5e60330b..5befe5050 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -46,6 +46,7 @@ import json import tempfile import multiprocessing import queue +import inspect from docopt import docopt from schema import Schema, Use, Or @@ -487,50 +488,41 @@ class GfException(Exception): class GfLogger(logging.getLoggerClass()): - def __init__(self): - super().__init__() - self.myinit() + def __init__(self, name): + self._mylock = threading.Lock() + super().__init__(name) - def myinit(self): - if getattr(self, 'lock', None) is None: - self.lock = threading.Lock() + def _mylog(self, level, msg, *args): + frame = inspect.currentframe().f_back.f_back + lineno = frame.f_lineno + record = self.makeRecord(self.name, level, frame.f_code.co_filename, + lineno, msg, args, None) + with self._mylock: + self.handle(record) - # REFERENCE: logging/__init__.py: class Logger() - def debug(self, msg, *args, **kwargs): - if self.isEnabledFor(logging.DEBUG): - with self.lock: - 
self._log(logging.DEBUG, msg, args, **kwargs) + def debug(self, msg, *args): + self._mylog(logging.DEBUG, msg, *args) - def info(self, msg, *args, **kwargs): - if self.isEnabledFor(logging.INFO): - with self.lock: - self._log(logging.INFO, msg, args, **kwargs) + def info(self, msg, *args): + self._mylog(logging.INFO, msg, *args) def warning(self, msg, *args, **kwargs): - if self.isEnabledFor(logging.WARNING): - with self.lock: - self._log(logging.WARNING, msg, args, **kwargs) + self._mylog(logging.WARNING, msg, *args) def error(self, msg, *args, **kwargs): - if self.isEnabledFor(logging.ERROR): - with self.lock: - self._log(logging.ERROR, msg, args, **kwargs) - - def error_exit(self, exit_code, msg, *args, **kwargs): - if self.isEnabledFor(logging.ERROR): - with self.lock: - self._log(logging.ERROR, msg, args, **kwargs) + self._mylog(logging.ERROR, msg, *args) + + def error_exit(self, exit_code, msg, *args): + self._mylog(logging.ERROR, msg, *args) sys.exit(exit_code) - def fatal(self, msg, *args, **kwargs): - if self.isEnabledFor(logging.ERROR): - with self.lock: - self._log(logging.ERROR, msg, args, **kwargs) - if 'exit_code' in kwargs: - raise GfException('exit_code={}'.format( - kwargs['exit_code'])) - else: - raise GfException('fatal exit') + # def fatal(self, msg, *args, **kwargs): + # self._mylog(logging.ERROR, msg, args) + # if 'exit_code' in kwargs: + # raise GfException('exit_code={}'.format( + # kwargs['exit_code'])) + # else: + # raise GfException('fatal exit') logger = None @@ -558,15 +550,12 @@ def logger_init(name, loglevel=logging.WARNING, debug=False, verbose=False): if logger is not None: return logger - logger = logging.getLogger() # RootLogger - logger.__class__ = GfLogger - logger.myinit() + logger = GfLogger('gfptar') logger.setLevel(loglevel) logger_config(logger, name, loglevel, debug, verbose) return logger -# unused def logger_init_without_lock(name, loglevel=logging.WARNING, debug=False, verbose=False): logger = logging.RootLogger(loglevel) @@ -3161,7 +3150,7 @@ class GfptarCommand(Command): def create_job_final(self, timeout=None): if self.MT_enabled(): has_error = self._create_job_check_MT(timeout=timeout) - self.thread_pool.shutdown(cancel_futures=True) + self.thread_pool.shutdown(wait=False) if has_error is not None: raise has_error self.worker_terminate() From 7ceb404e55c23837668c670a4674b9789d70080f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 13 Jul 2024 00:04:22 +0900 Subject: [PATCH 020/143] gfptar: add debug logs --- gftool/gfptar/gfptar | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 5befe5050..f62ae9054 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2780,7 +2780,7 @@ class GfptarCommand(Command): # lock required def info(self, fmt, *args): - if self.verbose: + if self.verbose or self.debug: print(fmt.format(*args)) def print_trace(self, exc): @@ -3681,6 +3681,7 @@ class GfptarCommand(Command): self.update_stat_for_directories(directory_set, member_set) def extract_directories(self, directory_set): + logger.debug('extract_directories') # NOTE: slow on Gfarm for d in directory_set.iterator(sort='ASC'): if self.is_canceled(): @@ -3697,6 +3698,7 @@ class GfptarCommand(Command): self.created_directory_set.add(dir_url.path) def extract_directories_fast(self, directory_set): + logger.debug('extract_directories_fast: start') # faster implementation for gfmkdir created_set = self.created_directory_set for d in directory_set.iterator(sort='DESC'): 
@@ -3704,6 +3706,8 @@ class GfptarCommand(Command): raise self.error_canceled() url_str = self.outdir_url.url_join(d) dir_url = GfURL.init(url_str) + logger.debug('extract_directories_fast: ' + f'out_url={self.outdir_url.url_str}, d={d}') # url.path is normalized if dir_url.path in created_set: logger.debug('skip (already created): %s', url_str) @@ -3727,6 +3731,7 @@ class GfptarCommand(Command): self.info('prepare_dir: {}', path) def update_stat_for_directories(self, directory_set, member_set): + logger.debug('update_stat_for_directories') # process from leaves members_num = len(member_set) self.clear_canceled() @@ -3831,8 +3836,8 @@ class GfptarCommand(Command): # ex. /a/b/c/ -> a/b/c outfile = tarinfo.name.strip('/') # relative path only - url_str = self.outdir_url.url_join(outfile) - outurl = GfURL.init(url_str) + outurl_str = self.outdir_url.url_join(outfile) + outurl = GfURL.init(outurl_str) parent = outurl.parent with self.lock(): @@ -3852,6 +3857,8 @@ class GfptarCommand(Command): user = tarinfo.uname group = tarinfo.gname inf = tar.extractfile(tarinfo) # io.BufferedReader + logger.debug('extract,file(before): %s, %s', + outurl_str, outfile) size = outurl.copy_from(inf, self.bufsize, mode=tarinfo.mode, mtime=tarinfo.mtime, From 816ed7b65e76219f525f05bd955c50b81298f45e Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 13 Jul 2024 13:21:57 +0900 Subject: [PATCH 021/143] gfptar: add debug logs (2) --- gftool/gfptar/gfptar | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f62ae9054..6eb7331f7 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2780,8 +2780,10 @@ class GfptarCommand(Command): # lock required def info(self, fmt, *args): - if self.verbose or self.debug: + if self.verbose: print(fmt.format(*args)) + if self.debug: + logger.debug(fmt.format(*args)) def print_trace(self, exc): if self.verbose or self.debug: @@ -3598,9 +3600,11 @@ class GfptarCommand(Command): file_type = line[:1] path = line[2:].lstrip('/') if search_target: + logger.debug(f'archive_dict[{path}]: {file_type}') archive_dict[path] = TypeName(file_type, arch_url_str) else: if file_type == 'D': + # logger.debug(f'directory_set_all.add 1: {path}') directory_set_all.add(path) if self.progress_enabled: now = time.time() @@ -3628,6 +3632,7 @@ class GfptarCommand(Command): raise GfException('Not found in archive: ' + member) target_set.add(type_arch.filename) if file_type == 'D': + logger.debug(f'directory_set_all.add 2: {path}') directory_set.add(member) # add parent directories to update attributes (mode,mtime) @@ -3643,6 +3648,7 @@ class GfptarCommand(Command): type_arch = archive_dict.get(path, None) if type_arch is not None: # found target_set.add(type_arch.filename) + logger.debug(f'directory_set_all.add 3: {path}') directory_set.add(path) member_set2.add(path) archive_dict.clear() From a343f161c5edb7dffe122ca7ef0e99d27214c7d5 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 13 Jul 2024 15:44:22 +0900 Subject: [PATCH 022/143] gfptar: close sqlite3 and unlink --- gftool/gfptar/gfptar | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 6eb7331f7..afd9719bf 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2261,6 +2261,10 @@ class GfptarCommand(Command): signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) + def sig_default(self): + signal.signal(signal.SIGINT, 
signal.SIG_DFL) + signal.signal(signal.SIGTERM, signal.SIG_DFL) + def getDoc(self) -> str: return __doc__ @@ -2317,6 +2321,7 @@ class GfptarCommand(Command): sys.exit(1) finally: self.worker_terminate() + self.sig_default() logger.debug('exit') def test_main(self): @@ -3363,8 +3368,7 @@ class GfptarCommand(Command): self.name, loglevel=self.loglevel, debug=self.debug, verbose=self.verbose) - signal.signal(signal.SIGINT, signal.SIG_DFL) - signal.signal(signal.SIGTERM, signal.SIG_DFL) + self.sig_default() while True: pid = os.getpid() logger.debug(f'create_a_tar_process: start, pid={pid}') @@ -3559,6 +3563,8 @@ class GfptarCommand(Command): dir=self.workdir) db_file = os.path.join(tmpdir.name, 'extract.db') db_file_target = os.path.join(tmpdir.name, 'target.db') + logger.debug(f'db_file={db_file}') + logger.debug(f'db_file_target={db_file_target}') # to reduce memory usage db = DB(db_file, check_same_thread=False) @@ -3604,7 +3610,7 @@ class GfptarCommand(Command): archive_dict[path] = TypeName(file_type, arch_url_str) else: if file_type == 'D': - # logger.debug(f'directory_set_all.add 1: {path}') + logger.debug(f'directory_set_all.add: {path}') directory_set_all.add(path) if self.progress_enabled: now = time.time() @@ -3624,6 +3630,8 @@ class GfptarCommand(Command): m = GfURL.init(member, local=True) member_set1.add(m.path) # normalized + logger.debug('directory_set 0: ' + + str(list(directory_set))) # TODO for member in member_set1: if self.is_canceled(): raise self.error_canceled() @@ -3632,8 +3640,10 @@ class GfptarCommand(Command): raise GfException('Not found in archive: ' + member) target_set.add(type_arch.filename) if file_type == 'D': - logger.debug(f'directory_set_all.add 2: {path}') + logger.debug(f'directory_set.add 2: {path}') directory_set.add(member) + logger.debug('directory_set 2: ' + + str(list(directory_set))) # TODO # add parent directories to update attributes (mode,mtime) member_set2 = DBSet(db, StrObj, 'member_set2') @@ -3648,8 +3658,10 @@ class GfptarCommand(Command): type_arch = archive_dict.get(path, None) if type_arch is not None: # found target_set.add(type_arch.filename) - logger.debug(f'directory_set_all.add 3: {path}') + logger.debug(f'directory_set.add 3: {path}') directory_set.add(path) + logger.debug('directory_set 3:' + + str(list(directory_set))) # TODO member_set2.add(path) archive_dict.clear() member_set1.clear() @@ -3685,6 +3697,10 @@ class GfptarCommand(Command): sys.stdout.write('\n') self.update_stat_for_directories(directory_set, member_set) + db.close() + db.unlink() + db_target.close() + db_target.unlink() def extract_directories(self, directory_set): logger.debug('extract_directories') From 43d6d7bd192757fef3eb160bc53f73e586728abd Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 13 Jul 2024 16:32:06 +0900 Subject: [PATCH 023/143] gfptar: (sequel to 7f5c014) use sqlite3 for --extract to reduce memory usage --- gftool/gfptar/gfptar | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index afd9719bf..0166e8567 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3630,8 +3630,6 @@ class GfptarCommand(Command): m = GfURL.init(member, local=True) member_set1.add(m.path) # normalized - logger.debug('directory_set 0: ' - + str(list(directory_set))) # TODO for member in member_set1: if self.is_canceled(): raise self.error_canceled() @@ -3639,11 +3637,9 @@ class GfptarCommand(Command): if type_arch is None: raise GfException('Not found in archive: ' + member) 
target_set.add(type_arch.filename) - if file_type == 'D': + if type_arch.file_type == 'D': logger.debug(f'directory_set.add 2: {path}') directory_set.add(member) - logger.debug('directory_set 2: ' - + str(list(directory_set))) # TODO # add parent directories to update attributes (mode,mtime) member_set2 = DBSet(db, StrObj, 'member_set2') @@ -3660,8 +3656,6 @@ class GfptarCommand(Command): target_set.add(type_arch.filename) logger.debug(f'directory_set.add 3: {path}') directory_set.add(path) - logger.debug('directory_set 3:' - + str(list(directory_set))) # TODO member_set2.add(path) archive_dict.clear() member_set1.clear() From e298ab5cdc3066d50cf27669f0ca578fa69c4a67 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 13 Jul 2024 16:53:00 +0900 Subject: [PATCH 024/143] gfptar: revert 250bee8 (gfptar --debug --test) --- regress/gftool/gfptar/gfptar.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regress/gftool/gfptar/gfptar.sh b/regress/gftool/gfptar/gfptar.sh index 641978f6a..429dd9791 100755 --- a/regress/gftool/gfptar/gfptar.sh +++ b/regress/gftool/gfptar/gfptar.sh @@ -12,7 +12,7 @@ mkdir $localtmp LANG=en_US.UTF-8 export LANG -if gfptar -d --test \ +if gfptar -q --test \ --test-workdir-local=$localtmp \ --test-workdir-gfarm=gfarm:$gftmp; then exit_code=$exit_pass From 2d9626862d249e377153679ec4deaecfebf241ad Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 13 Jul 2024 17:55:46 +0900 Subject: [PATCH 025/143] gfptar: refactoring (no functional change) --- gftool/gfptar/gfptar | 75 ++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 0166e8567..7da9fe5dc 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3540,7 +3540,7 @@ class GfptarCommand(Command): self.outdir = outdir self.outdir_url = GfURL.init(outdir) self.indir = indir - orig_member_set = set(members) + self.orig_member_set = set(members) del members self.same_owner = self.opt['--same-owner'] @@ -3549,13 +3549,13 @@ class GfptarCommand(Command): self.use_gfarm_command = not self.disable_gfarm_command self.gfsched_interval = self.opt['--gfsched-interval'] - indir_url = GfURL.init(self.indir) + self.indir_url = indir_url = GfURL.init(self.indir) if not indir_url.exists(): raise FileNotFoundError(indir_url.url_str) if not indir_url.is_directory(): raise self.error_not_a_gfptar_directory(indir_url.url_str) - search_target = len(orig_member_set) > 0 + self.search_target = len(self.orig_member_set) > 0 # Temporary files are removed when the process exits. 
# dir=None: system default @@ -3567,32 +3567,35 @@ class GfptarCommand(Command): logger.debug(f'db_file_target={db_file_target}') # to reduce memory usage - db = DB(db_file, check_same_thread=False) - db_target = DB(db_file_target, check_same_thread=False) + self.db = DB(db_file, check_same_thread=False) + self.db_target = DB(db_file_target, check_same_thread=False) # member name -> (file_type, tar filename) - archive_dict = DBDict(db, TypeName, 'archive_dict') + self.archive_dict = DBDict(self.db, TypeName, 'archive_dict') # all tar files - target_set_all = DBSet(db_target, StrObj, 'target_set_all') - directory_set_all = DBSet(db, StrObj, 'directory_set_all') + self.target_set_all = DBSet(self.db_target, StrObj, 'target_set_all') + self.directory_set_all = DBSet(self.db, StrObj, 'directory_set_all') self.total_num = 0 self.start_time = time.time() self.next_time = self.start_time + 1 self.sig_init() + target_set, directory_set, member_set = self.extract_schedule_v1() + self.extract_main(target_set, directory_set, member_set) - for ent in indir_url.listdir(recursive=False): + def extract_schedule_v1(self): + for ent in self.indir_url.listdir(recursive=False): if ent.path.endswith(self.LIST_SUFFIX): # ignored continue if self.is_canceled(): raise self.error_canceled() - subpath = ent.subpath(indir_url) - arch_url_str = indir_url.url_join(subpath) - target_set_all.add(arch_url_str) + subpath = ent.subpath(self.indir_url) + arch_url_str = self.indir_url.url_join(subpath) + self.target_set_all.add(arch_url_str) list_url = GfURL.init(arch_url_str + self.LIST_SUFFIX) if not list_url.exists(): - raise self.error_not_a_gfptar_directory(indir_url.url_str) + raise self.error_not_a_gfptar_directory(self.indir_url.url_str) with list_url.readopen(textmode=True) as f: while True: line = f.readline() @@ -3605,13 +3608,14 @@ class GfptarCommand(Command): # ex. 
"D /path/to/dir" file_type = line[:1] path = line[2:].lstrip('/') - if search_target: + if self.search_target: logger.debug(f'archive_dict[{path}]: {file_type}') - archive_dict[path] = TypeName(file_type, arch_url_str) + self.archive_dict[path] = TypeName( + file_type, arch_url_str) else: if file_type == 'D': logger.debug(f'directory_set_all.add: {path}') - directory_set_all.add(path) + self.directory_set_all.add(path) if self.progress_enabled: now = time.time() if now >= self.next_time: @@ -3621,19 +3625,19 @@ class GfptarCommand(Command): self.progress_for_listing(time.time()) sys.stdout.write('\n') - if search_target: + if self.search_target: # selected tar files - target_set = DBSet(db_target, StrObj, 'target_set') - directory_set = DBSet(db, StrObj, 'directory_set') - member_set1 = DBSet(db, StrObj, 'member_set1') - for member in orig_member_set: + target_set = DBSet(self.db_target, StrObj, 'target_set') + directory_set = DBSet(self.db, StrObj, 'directory_set') + member_set1 = DBSet(self.db, StrObj, 'member_set1') + for member in self.orig_member_set: m = GfURL.init(member, local=True) member_set1.add(m.path) # normalized for member in member_set1: if self.is_canceled(): raise self.error_canceled() - type_arch = archive_dict.get(member, None) + type_arch = self.archive_dict.get(member, None) if type_arch is None: raise GfException('Not found in archive: ' + member) target_set.add(type_arch.filename) @@ -3642,7 +3646,7 @@ class GfptarCommand(Command): directory_set.add(member) # add parent directories to update attributes (mode,mtime) - member_set2 = DBSet(db, StrObj, 'member_set2') + member_set2 = DBSet(self.db, StrObj, 'member_set2') for member in member_set1: if self.is_canceled(): raise self.error_canceled() @@ -3651,24 +3655,27 @@ class GfptarCommand(Command): for parent_url in url.parent_list: path = parent_url.path if path != '.' 
and path != '/': - type_arch = archive_dict.get(path, None) + type_arch = self.archive_dict.get(path, None) if type_arch is not None: # found target_set.add(type_arch.filename) logger.debug(f'directory_set.add 3: {path}') directory_set.add(path) member_set2.add(path) - archive_dict.clear() + self.archive_dict.clear() member_set1.clear() member_set = member_set2 # replace self.total_num = len(member_set) # re-set else: - target_set = target_set_all - directory_set = directory_set_all - member_set = DBSet(db, StrObj, 'member_set') + target_set = self.target_set_all + directory_set = self.directory_set_all + member_set = DBSet(self.db, StrObj, 'member_set') + return target_set, directory_set, member_set + def extract_main(self, target_set, directory_set, member_set): self.outdir_url.create_new_dir() - self.created_directory_set = DBSet(db, StrObj, 'created_directory_set') + self.created_directory_set = DBSet(self.db, StrObj, + 'created_directory_set') # self.extract_directories(directory_set) self.extract_directories_fast(directory_set) @@ -3676,7 +3683,7 @@ class GfptarCommand(Command): self.extracted_size = 0 self.start_time = time.time() self.next_time = self.start_time + 1 - self.dirstat_dict = DBDict(db, FileAttr, 'dirstat_dict') + self.dirstat_dict = DBDict(self.db, FileAttr, 'dirstat_dict') self.gfsched_lock = None self.gfsched_next = 0 @@ -3691,10 +3698,10 @@ class GfptarCommand(Command): sys.stdout.write('\n') self.update_stat_for_directories(directory_set, member_set) - db.close() - db.unlink() - db_target.close() - db_target.unlink() + self.db.close() + self.db.unlink() + self.db_target.close() + self.db_target.unlink() def extract_directories(self, directory_set): logger.debug('extract_directories') From 713f6eaf28fcd33a0c31e5177cfd611a5df88674 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 13 Jul 2024 19:09:55 +0900 Subject: [PATCH 026/143] gfptar: reduce temporary data --- gftool/gfptar/gfptar | 68 +++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 7da9fe5dc..118c9dae8 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -149,6 +149,16 @@ class DBObj: raise NotImplementedError +class JsonObj(DBObj): + @classmethod + def dumps(cls, obj, for_dict): + return json.dumps(obj, separators=(',', ':')) + + @classmethod + def loads(cls, key, txt, for_dict): + return json.loads(txt) + + # Example of DBObj class IntObj(DBObj): def __init__(self, i): @@ -180,22 +190,6 @@ class StrObj(DBObj): return txt -class TypeName(DBObj): - def __init__(self, file_type, filename): - self.file_type = file_type - self.filename = filename - - @classmethod - def dumps(cls, obj, for_dict): - array = [obj.file_type, obj.filename] - return json.dumps(array, separators=(',', ':')) - - @classmethod - def loads(cls, key, txt, for_dict): - array = json.loads(txt) - return cls(array[0], array[1]) - - class FileAttr(DBObj): def __init__(self, mode, mtime, user, group): self.mode = mode @@ -3570,10 +3564,6 @@ class GfptarCommand(Command): self.db = DB(db_file, check_same_thread=False) self.db_target = DB(db_file_target, check_same_thread=False) - # member name -> (file_type, tar filename) - self.archive_dict = DBDict(self.db, TypeName, 'archive_dict') - # all tar files - self.target_set_all = DBSet(self.db_target, StrObj, 'target_set_all') self.directory_set_all = DBSet(self.db, StrObj, 'directory_set_all') self.total_num = 0 @@ -3581,10 +3571,17 @@ class GfptarCommand(Command): 
self.next_time = self.start_time + 1 self.sig_init() - target_set, directory_set, member_set = self.extract_schedule_v1() + target_set, directory_set, member_set = self.extract_schedule_v2() self.extract_main(target_set, directory_set, member_set) - def extract_schedule_v1(self): + def extract_schedule_v2(self): + # tar archive filename + target_list_all = DBList(self.db, StrObj, 'target_list_all') + target_list_idx = 0 + + # member name -> set(file_type, tar filename) + archive_dict = DBDict(self.db, JsonObj, 'archive_dict') + for ent in self.indir_url.listdir(recursive=False): if ent.path.endswith(self.LIST_SUFFIX): # ignored continue @@ -3592,7 +3589,6 @@ class GfptarCommand(Command): raise self.error_canceled() subpath = ent.subpath(self.indir_url) arch_url_str = self.indir_url.url_join(subpath) - self.target_set_all.add(arch_url_str) list_url = GfURL.init(arch_url_str + self.LIST_SUFFIX) if not list_url.exists(): raise self.error_not_a_gfptar_directory(self.indir_url.url_str) @@ -3610,12 +3606,13 @@ class GfptarCommand(Command): path = line[2:].lstrip('/') if self.search_target: logger.debug(f'archive_dict[{path}]: {file_type}') - self.archive_dict[path] = TypeName( - file_type, arch_url_str) + archive_dict[path] = (file_type, target_list_idx) else: if file_type == 'D': logger.debug(f'directory_set_all.add: {path}') self.directory_set_all.add(path) + target_list_all.append(arch_url_str) + target_list_idx += 1 if self.progress_enabled: now = time.time() if now >= self.next_time: @@ -3625,9 +3622,9 @@ class GfptarCommand(Command): self.progress_for_listing(time.time()) sys.stdout.write('\n') + target_set = DBSet(self.db_target, StrObj, 'target_set') if self.search_target: # selected tar files - target_set = DBSet(self.db_target, StrObj, 'target_set') directory_set = DBSet(self.db, StrObj, 'directory_set') member_set1 = DBSet(self.db, StrObj, 'member_set1') for member in self.orig_member_set: @@ -3637,11 +3634,13 @@ class GfptarCommand(Command): for member in member_set1: if self.is_canceled(): raise self.error_canceled() - type_arch = self.archive_dict.get(member, None) + type_arch = archive_dict.get(member, None) if type_arch is None: raise GfException('Not found in archive: ' + member) - target_set.add(type_arch.filename) - if type_arch.file_type == 'D': + arch_idx = type_arch[1] + target_filename = target_list_all[arch_idx] + target_set.add(target_filename) + if type_arch[0] == 'D': logger.debug(f'directory_set.add 2: {path}') directory_set.add(member) @@ -3655,18 +3654,21 @@ class GfptarCommand(Command): for parent_url in url.parent_list: path = parent_url.path if path != '.' 
and path != '/': - type_arch = self.archive_dict.get(path, None) + type_arch = archive_dict.get(path, None) if type_arch is not None: # found - target_set.add(type_arch.filename) + arch_idx = type_arch[1] + target_filename = target_list_all[arch_idx] + target_set.add(target_filename) logger.debug(f'directory_set.add 3: {path}') directory_set.add(path) member_set2.add(path) - self.archive_dict.clear() + archive_dict.clear() member_set1.clear() member_set = member_set2 # replace self.total_num = len(member_set) # re-set else: - target_set = self.target_set_all + for target in target_list_all: + target_set.add(target) directory_set = self.directory_set_all member_set = DBSet(self.db, StrObj, 'member_set') return target_set, directory_set, member_set From 71ac8a7b62d99915ec51894823285062d9aac547 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 16 Jul 2024 22:58:13 +0900 Subject: [PATCH 027/143] gfptar: extract sub entries --- gftool/gfptar/gfptar | 204 +++++++++++++++++++++++++------------------ 1 file changed, 118 insertions(+), 86 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 118c9dae8..627574345 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -321,18 +321,33 @@ class DBDict(DBCollection): return self.keys() # sort: None, 'ASC', 'DESC' - def iterator(self, sort=None, offset=0, limit=-1): + def iterator(self, like=None, sort=None, offset=0, limit=-1): sql = f'SELECT key,value FROM {self.table_name}' + if like is not None: + sql += " WHERE key LIKE ? ESCAPE '\\'" if sort is not None: if sort.upper() == 'ASC': sql += ' ORDER BY key ASC' elif sort.upper() == 'DESC': sql += ' ORDER BY key DESC' + else: + raise AssertionError(f'unknown sort type: {str(sort)}') sql += f' LIMIT {limit} OFFSET {offset}' - res = self.con.execute(sql) + if like is not None: + res = self.con.execute(sql, (like,)) + else: + res = self.con.execute(sql) for row in res: yield row[0], self.txt2obj(row[0], row[1]) + @classmethod + def _escape(cls, txt): + return txt.replace('%', '\\%').replace('_', '\\_') + + def find_by_prefix(self, prefix, sort=None): + escaped = self._escape(prefix) + return self.iterator(like=escaped+'%', sort=sort) + class DBSet(DBCollection): def create_table(self): @@ -2116,14 +2131,24 @@ class TestGfptar(unittest.TestCase): testdb = DB(os.path.join(tmpdir.name, 'test.db')) d = DBDict(testdb, IntObj, 'test_dict') - d.update({'a': IntObj(1), 'b': IntObj(2), 'c': IntObj(3)}) - self.assertEqual(d['a'], 1) - self.assertEqual(d['b'], 2) - self.assertEqual(d['c'], 3) - self.assertEqual(d.get('c'), 3) + d.update({'a%%bc': IntObj(1), 'a__bc': IntObj(2), 'bc': 3}) + self.assertEqual(d['a%%bc'], 1) + self.assertEqual(d['a__bc'], 2) + self.assertEqual(d['bc'], 3) + self.assertEqual(d.get('bc'), 3) self.assertEqual(d.get('d'), None) self.assertNotIn('d', d) - self.assertIn('a', d) + self.assertIn('a%%bc', d) + + self.assertEqual(set(k for k, v in d.find_by_prefix('a%%b')), + {'a%%bc'}) + self.assertEqual(set(k for k, v in d.find_by_prefix('a__b')), + {'a__bc'}) + # sort order: % _ + self.assertEqual([k for k, v in d.find_by_prefix('a', sort='ASC')], + ['a%%bc', 'a__bc']) + self.assertEqual([k for k, v in d.find_by_prefix('a', sort='DESC')], + ['a__bc', 'a%%bc']) d.clear() d['a'] = 4 @@ -2592,12 +2617,15 @@ class GfptarCommand(Command): test3_dir_local = workdir_local_url.url_join(test3_name) test4_name = 'test-4-extract' test4_dir_local = workdir_local_url.url_join(test4_name) + test5_name = 'test-5-extract' + test5_dir_local = 
workdir_local_url.url_join(test5_name) # pick files as members (SEE ALSO: test_prepare_srcdir) + members = ['file1', 'dir1'] files = ['file1', 'dir1/readonly/file#2'] # Gfarm -> Gfarm(tar) - self.create(test1_dir_gfarm, srcdir_gfarm, files) + self.create(test1_dir_gfarm, srcdir_gfarm, members) # Gfarm(tar) -> Gfarm self.extract(test2_dir_gfarm, test1_dir_gfarm, files) @@ -2605,13 +2633,18 @@ class GfptarCommand(Command): self.create(test3_dir_local, srcdir_local, files) # Local(tar) -> Local self.extract(test4_dir_local, test3_dir_local, files) + self.extract(test5_dir_local, test3_dir_local, members) for f in files: g_member = GfURL.init(os.path.join(test2_dir_gfarm, f)) l_member = GfURL.init(os.path.join(test4_dir_local, f)) + l_member2 = GfURL.init(os.path.join(test5_dir_local, f)) if not g_member.compare_data(l_member): logger.error_exit(1, testname - + '... FAIL (data mismatch)') + + '... FAIL (data mismatch 1)') + if not l_member.compare_data(l_member2): + logger.error_exit(1, testname + + '... FAIL (data mismatch 2)') print(testname + ' ... PASS') workdir_local_url.remove_tree(remove_readonly=True) workdir_gfarm_url.remove_tree(remove_readonly=True) @@ -2863,6 +2896,7 @@ class GfptarCommand(Command): yield rand_file(dir_ent.path, j) def create(self, outdir, basedir, infiles): + logger.debug(f'create start: outdir={outdir}, basedir={basedir}') self.options_init() self.outdir = outdir self.outdir_url = GfURL.init(outdir) @@ -3478,6 +3512,7 @@ class GfptarCommand(Command): break subpath = entry.subpath(self.basedir_url) try: + logger.debug(f'tar.add_entry: {subpath}') tar.add_entry(subpath, entry) filelist_ok.append(entry) size_all = entry.size_all() @@ -3529,13 +3564,13 @@ class GfptarCommand(Command): def error_not_a_gfptar_directory(self, url_str): return GfptarError('Not a gfptar-archived directory: ' + url_str) - def extract(self, outdir, indir, members): + def extract(self, outdir, indir, specified_members): + logger.debug(f'extract start: outdir={outdir}, indir={indir}') self.options_init() self.outdir = outdir self.outdir_url = GfURL.init(outdir) self.indir = indir - self.orig_member_set = set(members) - del members + self.specified_members = specified_members self.same_owner = self.opt['--same-owner'] self.compress_prog = self.opt['--use-compress-program'] @@ -3549,7 +3584,7 @@ class GfptarCommand(Command): if not indir_url.is_directory(): raise self.error_not_a_gfptar_directory(indir_url.url_str) - self.search_target = len(self.orig_member_set) > 0 + self.search_target = len(specified_members) > 0 # Temporary files are removed when the process exits. 
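# --- Illustrative sketch (not part of the patch above): how the LIKE-based
# prefix search added to DBDict.iterator()/find_by_prefix() behaves in plain
# sqlite3.  The in-memory table, column names and sample keys below are
# assumptions chosen only to mirror DBDict._escape(), which neutralizes the
# LIKE wildcards '%' and '_' with a backslash ESCAPE character.
import sqlite3

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE t (key TEXT PRIMARY KEY, value TEXT)')
con.executemany('INSERT INTO t VALUES (?, ?)',
                [('a%%bc', '1'), ('a__bc', '2'), ('bc', '3')])

def find_by_prefix(prefix):
    # escape LIKE wildcards so they only match themselves literally
    escaped = prefix.replace('%', '\\%').replace('_', '\\_')
    sql = "SELECT key FROM t WHERE key LIKE ? ESCAPE '\\' ORDER BY key ASC"
    return [row[0] for row in con.execute(sql, (escaped + '%',))]

print(find_by_prefix('a%%b'))  # ['a%%bc'] -- '%' is matched literally
print(find_by_prefix('a__b'))  # ['a__bc'] -- '_' is matched literally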
# dir=None: system default @@ -3564,29 +3599,35 @@ class GfptarCommand(Command): self.db = DB(db_file, check_same_thread=False) self.db_target = DB(db_file_target, check_same_thread=False) - self.directory_set_all = DBSet(self.db, StrObj, 'directory_set_all') - self.total_num = 0 self.start_time = time.time() self.next_time = self.start_time + 1 self.sig_init() - target_set, directory_set, member_set = self.extract_schedule_v2() + target_set, directory_set, member_set = self.extract_schedule_v3() self.extract_main(target_set, directory_set, member_set) - def extract_schedule_v2(self): - # tar archive filename - target_list_all = DBList(self.db, StrObj, 'target_list_all') - target_list_idx = 0 + def extract_schedule_v3(self): + target_set = DBSet(self.db, StrObj, 'target_set') + directory_set = DBSet(self.db, StrObj, 'directory_set') + member_set = DBSet(self.db, StrObj, 'member_set') - # member name -> set(file_type, tar filename) archive_dict = DBDict(self.db, JsonObj, 'archive_dict') + member_check_dict = DBDict(self.db, JsonObj, 'member_check_dict') + + if self.search_target: + for member in self.specified_members: + m = GfURL.init(member, local=True) # relative path + path = m.path # normalized path + if path.startswith('/'): + path = path.lstrip('/') + member_check_dict[path] = False # initialize for ent in self.indir_url.listdir(recursive=False): - if ent.path.endswith(self.LIST_SUFFIX): # ignored - continue if self.is_canceled(): raise self.error_canceled() + if ent.path.endswith(self.LIST_SUFFIX): # ignored + continue subpath = ent.subpath(self.indir_url) arch_url_str = self.indir_url.url_join(subpath) list_url = GfURL.init(arch_url_str + self.LIST_SUFFIX) @@ -3606,13 +3647,46 @@ class GfptarCommand(Command): path = line[2:].lstrip('/') if self.search_target: logger.debug(f'archive_dict[{path}]: {file_type}') - archive_dict[path] = (file_type, target_list_idx) + archive_dict[path] = file_type else: if file_type == 'D': - logger.debug(f'directory_set_all.add: {path}') - self.directory_set_all.add(path) - target_list_all.append(arch_url_str) - target_list_idx += 1 + logger.debug(f'directory_set.add: {path}') + directory_set.add(path) # all directories + if self.search_target: + for member in member_check_dict.keys(): + is_checked = member_check_dict.get(member) + if is_checked: + logger.debug(f'from member_check_dict.keys(checked):' + f' {member} ') + continue + found = False + file_type = archive_dict.get(member, None) + logger.debug(f'from member_check_dict.keys: {member} ' + f'[{file_type}]') + if file_type is not None: + logger.debug(f'target_set.add: {arch_url_str}') + target_set.add(arch_url_str) # select this tar file + logger.debug(f'member_set.add: {member}') + member_set.add(member) + found = True + if file_type == 'D': + logger.debug(f'directory_set.add: {member}') + directory_set.add(member) + # else ... specified member may be a parent directory + # ex. 
dir -> dir/ -> dir/path/to/file + for path, file_type in archive_dict.find_by_prefix( + member + '/'): + logger.debug(f'member_set.add: {path}') + member_set.add(path) + found = True + if file_type == 'D': + logger.debug(f'directory_set.add: {path}') + directory_set.add(path) + if found: + member_check_dict[member] = True + archive_dict.clear() # re-use for next tar file + else: + target_set.add(arch_url_str) # use all tar files if self.progress_enabled: now = time.time() if now >= self.next_time: @@ -3621,56 +3695,15 @@ class GfptarCommand(Command): if self.progress_enabled: self.progress_for_listing(time.time()) sys.stdout.write('\n') - - target_set = DBSet(self.db_target, StrObj, 'target_set') if self.search_target: - # selected tar files - directory_set = DBSet(self.db, StrObj, 'directory_set') - member_set1 = DBSet(self.db, StrObj, 'member_set1') - for member in self.orig_member_set: - m = GfURL.init(member, local=True) - member_set1.add(m.path) # normalized - - for member in member_set1: - if self.is_canceled(): - raise self.error_canceled() - type_arch = archive_dict.get(member, None) - if type_arch is None: - raise GfException('Not found in archive: ' + member) - arch_idx = type_arch[1] - target_filename = target_list_all[arch_idx] - target_set.add(target_filename) - if type_arch[0] == 'D': - logger.debug(f'directory_set.add 2: {path}') - directory_set.add(member) - - # add parent directories to update attributes (mode,mtime) - member_set2 = DBSet(self.db, StrObj, 'member_set2') - for member in member_set1: - if self.is_canceled(): - raise self.error_canceled() - member_set2.add(member) - url = GfURL.init(member, local=True) # relative path - for parent_url in url.parent_list: - path = parent_url.path - if path != '.' and path != '/': - type_arch = archive_dict.get(path, None) - if type_arch is not None: # found - arch_idx = type_arch[1] - target_filename = target_list_all[arch_idx] - target_set.add(target_filename) - logger.debug(f'directory_set.add 3: {path}') - directory_set.add(path) - member_set2.add(path) - archive_dict.clear() - member_set1.clear() - member_set = member_set2 # replace - self.total_num = len(member_set) # re-set - else: - for target in target_list_all: - target_set.add(target) - directory_set = self.directory_set_all - member_set = DBSet(self.db, StrObj, 'member_set') + self.total_num = len(member_set) # update + archive_dict.clear() + member_check_dict.clear() + + for member, found in member_check_dict.items(): + if not found: + raise GfException('Not found in archive: ' + member) + return target_set, directory_set, member_set def extract_main(self, target_set, directory_set, member_set): @@ -3699,7 +3732,7 @@ class GfptarCommand(Command): self.progress_for_extract(time.time()) sys.stdout.write('\n') - self.update_stat_for_directories(directory_set, member_set) + self.update_stat_for_directories(directory_set) self.db.close() self.db.unlink() self.db_target.close() @@ -3755,17 +3788,14 @@ class GfptarCommand(Command): created_set.add(path) self.info('prepare_dir: {}', path) - def update_stat_for_directories(self, directory_set, member_set): + def update_stat_for_directories(self, directory_set): logger.debug('update_stat_for_directories') # process from leaves - members_num = len(member_set) self.clear_canceled() for d in directory_set.iterator(sort='DESC'): if self.is_canceled(): logger.debug('Canceled (extract 3)') break - if members_num > 0 and d not in member_set: - continue tarinfo = self.dirstat_dict.get(d) if tarinfo is None: logger.warning('No 
information of the directory: %s', d) @@ -3828,7 +3858,7 @@ class GfptarCommand(Command): raise def extract_from_a_tar0(self, serial, target, member_set): - logger.debug('extract_from_a_tar: start: %04d', serial) + logger.debug('extract_from_a_tar start: serial=%d', serial) if self.is_canceled(): logger.debug('Canceled (extract 1): name=%s', target) return @@ -3856,7 +3886,8 @@ class GfptarCommand(Command): if members_num > 0: with self.lock(): if tarinfo.name not in member_set: - continue # not a target + continue + member_set.remove(tarinfo.name) # members_num == 0 -> extract all # ex. /a/b/c/ -> a/b/c @@ -3869,7 +3900,7 @@ class GfptarCommand(Command): exist_dir = parent.path in self.created_directory_set if not exist_dir: if not parent.exists(): - parent.makedirs() + parent.makedirs() # default 0700 with self.lock(): self.created_directory_set.add(parent.path) @@ -4058,6 +4089,7 @@ class GfptarCommand(Command): name = name + ' -> ' + t.linkname info = (f'{t.mode:04o} {t.uname:>10}/{t.gname:<10}' f' {t.size:9d} {t.mtime} {name}') + # logger.debug(info) if not quiet: print(info) From 309642dbc5642311b08844535a3bb6c0c05fd119 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 16 Jul 2024 23:21:10 +0900 Subject: [PATCH 028/143] gfptar: raise error when the specified file is not found in archive files --- gftool/gfptar/gfptar | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 627574345..b5f4be972 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2619,6 +2619,8 @@ class GfptarCommand(Command): test4_dir_local = workdir_local_url.url_join(test4_name) test5_name = 'test-5-extract' test5_dir_local = workdir_local_url.url_join(test5_name) + test6_name = 'test-6-extract' + test6_dir_local = workdir_local_url.url_join(test6_name) # pick files as members (SEE ALSO: test_prepare_srcdir) members = ['file1', 'dir1'] @@ -2635,6 +2637,14 @@ class GfptarCommand(Command): self.extract(test4_dir_local, test3_dir_local, files) self.extract(test5_dir_local, test3_dir_local, members) + # error + try: + self.extract(test6_dir_local, test3_dir_local, ['abcde']) + logger.error_exit(1, testname + '(not found in archive files) ' + + '... FAIL (unexpected success)') + except GfptarError as e: + logger.info(f'Expected error, ignored: {str(e)}') + for f in files: g_member = GfURL.init(os.path.join(test2_dir_gfarm, f)) l_member = GfURL.init(os.path.join(test4_dir_local, f)) @@ -3222,7 +3232,7 @@ class GfptarCommand(Command): self.print_trace(exc) if not self.is_canceled(): self.cancel() - # DO NOT cancel threads beacause it will freeze. + # DO NOT cancel threads because it will freeze. # for t2 in self.futures: # if t == t2: # continue @@ -3698,11 +3708,13 @@ class GfptarCommand(Command): if self.search_target: self.total_num = len(member_set) # update archive_dict.clear() - member_check_dict.clear() for member, found in member_check_dict.items(): + logger.debug(f'check member_check_dict: {member}, {found}') if not found: - raise GfException('Not found in archive: ' + member) + raise GfptarError('The specified file is not found' + ' in archive files: ' + member) + member_check_dict.clear() return target_set, directory_set, member_set @@ -3840,7 +3852,7 @@ class GfptarCommand(Command): self.print_trace(exc) if not self.is_canceled(): self.cancel() - # DO NOT cancel threads beacause it will freeze. + # DO NOT cancel threads because it will freeze. 
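# --- Illustrative sketch (not part of the patch above): the bookkeeping
# pattern used by extract_schedule_v3() when members are given on the
# command line.  Every requested member starts as "not yet found"; scanning
# each archive's listing flips it to True and selects that archive, and any
# member still False afterwards is reported, as in the new test case above.
# The names below (requested, listings, schedule) are simplified stand-ins
# for member_check_dict / archive_dict / target_set, and GfptarError is
# redefined here only to keep the sketch self-contained.
class GfptarError(Exception):
    pass

def schedule(requested, listings):
    found = {member: False for member in requested}
    targets = []
    for archive, members in listings.items():
        hit = False
        for member in requested:
            if member in members:
                found[member] = True
                hit = True
        if hit:
            targets.append(archive)  # open only the archives that matter
    missing = [m for m, ok in found.items() if not ok]
    if missing:
        raise GfptarError('The specified file is not found'
                          ' in archive files: ' + ', '.join(missing))
    return targets

# schedule(['file1'], {'0001.tar.gz': {'file1'}, '0002.tar.gz': {'file2'}})
#   -> ['0001.tar.gz']
# schedule(['abcde'], {'0001.tar.gz': {'file1'}}) raises GfptarError,
#   matching the expected failure exercised by test-6-extract above.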
# for t2 in self.futures: # if t == t2: # continue From 0648bc2c6c8cedf17d81b325051e8c629af10769 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 17 Jul 2024 02:46:18 +0900 Subject: [PATCH 029/143] gfptar --extract: specified members can match multiple targets --- gftool/gfptar/gfptar | 46 +++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index b5f4be972..e6a24841c 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3618,7 +3618,7 @@ class GfptarCommand(Command): self.extract_main(target_set, directory_set, member_set) def extract_schedule_v3(self): - target_set = DBSet(self.db, StrObj, 'target_set') + target_set = DBSet(self.db_target, StrObj, 'target_set') directory_set = DBSet(self.db, StrObj, 'directory_set') member_set = DBSet(self.db, StrObj, 'member_set') @@ -3663,38 +3663,43 @@ class GfptarCommand(Command): logger.debug(f'directory_set.add: {path}') directory_set.add(path) # all directories if self.search_target: + is_target = False for member in member_check_dict.keys(): - is_checked = member_check_dict.get(member) - if is_checked: - logger.debug(f'from member_check_dict.keys(checked):' - f' {member} ') - continue - found = False file_type = archive_dict.get(member, None) logger.debug(f'from member_check_dict.keys: {member} ' f'[{file_type}]') + found = False if file_type is not None: - logger.debug(f'target_set.add: {arch_url_str}') - target_set.add(arch_url_str) # select this tar file logger.debug(f'member_set.add: {member}') member_set.add(member) found = True if file_type == 'D': logger.debug(f'directory_set.add: {member}') directory_set.add(member) - # else ... specified member may be a parent directory - # ex. dir -> dir/ -> dir/path/to/file - for path, file_type in archive_dict.find_by_prefix( - member + '/'): - logger.debug(f'member_set.add: {path}') - member_set.add(path) - found = True - if file_type == 'D': - logger.debug(f'directory_set.add: {path}') - directory_set.add(path) + is_dir = True + else: + is_dir = False + else: + # specified member may be a parent directory + # ex. 
dir -> dir/* -> dir/path/to/file + is_dir = True + if is_dir: + # find {member}/* files + for path, file_type in archive_dict.find_by_prefix( + member + '/'): + logger.debug(f'member_set.add: {path}') + member_set.add(path) + found = True + if file_type == 'D': + logger.debug(f'directory_set.add: {path}') + directory_set.add(path) if found: member_check_dict[member] = True + is_target = True archive_dict.clear() # re-use for next tar file + if is_target: + logger.debug(f'target_set.add: {arch_url_str}') + target_set.add(arch_url_str) # select this tar file else: target_set.add(arch_url_str) # use all tar files if self.progress_enabled: @@ -3707,7 +3712,6 @@ class GfptarCommand(Command): sys.stdout.write('\n') if self.search_target: self.total_num = len(member_set) # update - archive_dict.clear() for member, found in member_check_dict.items(): logger.debug(f'check member_check_dict: {member}, {found}') @@ -3715,6 +3719,7 @@ class GfptarCommand(Command): raise GfptarError('The specified file is not found' ' in archive files: ' + member) member_check_dict.clear() + archive_dict.clear() return target_set, directory_set, member_set @@ -3897,6 +3902,7 @@ class GfptarCommand(Command): break if members_num > 0: with self.lock(): + # perfect match if tarinfo.name not in member_set: continue member_set.remove(tarinfo.name) From 67cfc4280afc2f82752701fe60b35c8d4f9c4253 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 17 Jul 2024 19:12:50 +0900 Subject: [PATCH 030/143] gfptar: use os.mkdir() istead of os.makedirs() for Python version 3.7 or later --- gftool/gfptar/gfptar | 69 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 16 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index e6a24841c..7085f0795 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -612,6 +612,7 @@ def shutup_stderr(): global DEFAULT_STDERR use_stderr = False DEFAULT_STDERR = subprocess.DEVNULL + logger.debug('shutup_stderr') def execcmd(args, stdin=subprocess.DEVNULL, stderr=subprocess.PIPE, @@ -947,12 +948,12 @@ class GfURL(metaclass=abc.ABCMeta): return GfURL.init(self.root_url_str + parent_path) @property - def parent_list(self): + def parent_iter(self): def parents_func(u): - if u.path != '.' and u.path != '/': + p = u.parent.path + if p != '' and p != '.' 
and p != '/': yield u.parent yield from parents_func(u.parent) - return yield from parents_func(self) @property @@ -1068,7 +1069,11 @@ class GfURL(metaclass=abc.ABCMeta): raise NotImplementedError @abc.abstractmethod - def get_size(self): + def size(self): + raise NotImplementedError + + @abc.abstractmethod + def mode(self): raise NotImplementedError @abc.abstractmethod @@ -1366,11 +1371,16 @@ class GfURLGfarm(GfURL): def is_writable(self): return self.access(self.W_OK) - def get_size(self): - for entry in self.listdir(recursive=False): - logger.debug('get_size: %s, %d', entry.path, entry.size) + def size(self): + for entry in self.listdir(recursive=False, myself_only=True): + logger.debug('size: %s, %d', entry.path, entry.size) return entry.size + def mode(self): + for entry in self.listdir(recursive=False, myself_only=True): + logger.debug('mode: %s, %o', entry.path, entry.mode) + return entry.mode & 0o777 + @classmethod def from_rwx(cls, rwx, highchar): perm = 0 @@ -1417,11 +1427,13 @@ class GfURLGfarm(GfURL): return mode, file_type def listdir(self, path_only=False, recursive=False, first=False, - hardlink_warn=False): + hardlink_warn=False, myself_only=False): dirname = self.url_str gfls_opt = '-ailT' if recursive: gfls_opt += 'R' + if myself_only: + gfls_opt += 'd' for line in execcmd_readline(['gfls', gfls_opt, self.url_str]): logger.debug('listdir: raw line=%s', line) if line is None: @@ -1676,7 +1688,16 @@ class GfURLLocal(GfURL): def mkdir(self, mode=0o700, parents=False): if parents: - os.makedirs(self.url_str, mode=mode, exist_ok=True) + # Reference: https://docs.python.org/3/library/os.html#os.makedirs + # Changed in version 3.7: The mode argument no longer + # affects the file permission bits of newly created + # intermediate-level directories. + # os.makedirs(self.url_str, mode=mode, exist_ok=True) + for p in reversed(list(self.parent_iter)): + if p.exists(): + continue + os.mkdir(p.url_str, mode | 0o700) + os.mkdir(self.url_str, mode) else: os.mkdir(self.url_str, mode) @@ -1721,11 +1742,16 @@ class GfURLLocal(GfURL): def is_writable(self): return os.access(self.url_str, os.W_OK) - def get_size(self): + def size(self): st = os.stat(self.url_str, follow_symlinks=False) - logger.debug('get_size: %s, %d', self.url_str, st.st_size) + logger.debug('size: %s, %d', self.url_str, st.st_size) return st.st_size + def mode(self): + st = os.stat(self.url_str, follow_symlinks=False) + logger.debug('mode: %s, %o', self.url_str, st.st_mode) + return st.st_mode & 0o777 + @classmethod def _readlink(cls, path, is_symlink): if is_symlink: @@ -2645,6 +2671,18 @@ class GfptarCommand(Command): except GfptarError as e: logger.info(f'Expected error, ignored: {str(e)}') + # check mode=0o700 for parents without tarinfo of file#2 + gd1 = GfURL.init(os.path.join(test2_dir_gfarm, 'dir1/readonly')) + ld1 = GfURL.init(os.path.join(test4_dir_local, 'dir1/readonly')) + gd1_mode = gd1.mode() + if gd1_mode != 0o700: + logger.error_exit( + 1, f'{testname} ... FAIL: gd1.mode={gd1_mode:#o}') + ld1_mode = ld1.mode() + if ld1_mode != 0o700: + logger.error_exit( + 1, f'{testname} ... 
FAIL: ld1.mode={ld1_mode:#o}') + for f in files: g_member = GfURL.init(os.path.join(test2_dir_gfarm, f)) l_member = GfURL.init(os.path.join(test4_dir_local, f)) @@ -3546,7 +3584,7 @@ class GfptarCommand(Command): tardb_ok.close() tardb_ok.unlink() - tar_size = outurl.get_size() + tar_size = outurl.size() return tar_size, cannot_be_archived, outurl.url_str, listurl.url_str def create_a_members_list(self, url, filelist, target_host): @@ -3798,7 +3836,7 @@ class GfptarCommand(Command): self.info('prepare_dir: {}', dir_url.path) created_set.add(parent_url.path) self.info('prepare_dir: {}', parent_url.path) - for p in parent_url.parent_list: + for p in parent_url.parent_iter: path = p.path if path == '.' or path == '/': continue @@ -3807,8 +3845,8 @@ class GfptarCommand(Command): def update_stat_for_directories(self, directory_set): logger.debug('update_stat_for_directories') - # process from leaves self.clear_canceled() + # DESC: process from leaves for d in directory_set.iterator(sort='DESC'): if self.is_canceled(): logger.debug('Canceled (extract 3)') @@ -3824,8 +3862,7 @@ class GfptarCommand(Command): user=tarinfo.uname, group=tarinfo.gname) else: dir_url.chmod(tarinfo.mode, mtime=tarinfo.mtime) - logger.debug("update_stat: %s, %s", d, oct(tarinfo.mode)) - self.info('update_stat: {}', d) + self.info('update_stat: {}, mode={}', d, oct(tarinfo.mode)) def extract_from_archives(self, target_set, member_set): self.lock_init(False) From 28d97a9df63907130e0299dcffe7eae94a146cfa Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 18 Jul 2024 11:26:15 +0900 Subject: [PATCH 031/143] gfptar --dummy-num: store tarinfo for directories --- gftool/gfptar/gfptar | 52 +++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 7085f0795..f9d23aa15 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2893,7 +2893,7 @@ class GfptarCommand(Command): k=random.randint( dir_min_length, dir_max_length))) for _ in range(depth)] - return os.path.join(*directories) + return directories def generate_random_filename(): suffix = ''.join(random.choices(string.ascii_lowercase, @@ -2902,18 +2902,29 @@ class GfptarCommand(Command): random.choices(choices, k=random.randint( file_min_length, file_max_length))) + '.' 
+ suffix - def rand_dir(): - dir_path = generate_random_dirname() - path = os.path.join(base_dir, dir_path) - mode = 0o700 + def rand_dir(first_dir_index): + first_dir = f'dir{first_dir_index:04d}' + dir_list = generate_random_dirname() + dir_depth = len(dir_list) + + mode = 0o755 file_type = GfURLEntry.TYPE_DIR uname = 'testuser1' - gname = 'testgroup1' + gname = 'testgroup2' size = 0 mtime = now linkname = '' - return GfURLEntry(path, mode, file_type, uname, gname, - size, mtime, linkname) + # first_dir: to pick up members + # + # path = os.path.join(base_dir, first_dir) + # yield GfURLEntry(path, mode, file_type, uname, gname, + # size, mtime, linkname) + for i in range(dir_depth): + parent = dir_list[:(i+1)] + dir_path = os.path.join(*parent) + path = os.path.join(base_dir, first_dir, dir_path) + yield GfURLEntry(path, mode, file_type, uname, gname, + size, mtime, linkname) def rand_file(dir_path, idx): time.sleep(dummy_sleep_per_entry) @@ -2933,14 +2944,18 @@ class GfptarCommand(Command): dir_num = int(num / files_per_dir) remainder = num % files_per_dir for i in range(dir_num): - dir_ent = rand_dir() - yield dir_ent - for j in range(files_per_dir - 1): + dir_num = 0 + for dir_ent in rand_dir(i): + dir_num += 1 + yield dir_ent + for j in range(files_per_dir - dir_num): yield rand_file(dir_ent.path, j) if remainder > 0: - dir_ent = rand_dir() - yield dir_ent - for j in range(remainder - 1): + dir_num = 0 + for dir_ent in rand_dir(dir_num): + dir_num += 1 + yield dir_ent + for j in range(remainder - dir_num): yield rand_file(dir_ent.path, j) def create(self, outdir, basedir, infiles): @@ -4203,6 +4218,7 @@ Options: -I, --use-compress-program=COMMAND filter data through COMMAND, the command must accept -d option for decompression + (ex. pigz) --same-owner extract files with the same ownership (for euid=0 on local, or gfarmroot on Gfarm) --disable-gfarm-command disable the use of gfreg and gfexport @@ -4215,15 +4231,17 @@ Options: [default: utf-8] --bufsize=BYTES buffer size to copy [default: 1Mi] --workdir=DIR local directory for temporary files + (default: system temporary directory) --max-entries-per-tar=NUM the number of entries per tar file to limit memory usage [default: 100K] - --memory=NUM upper limit of memory size (bytes) - (default: no limit) + --memory=BYTES upper limit of memory size (bytes) + (default: no limit) (ex. 
2Gi) --test test mode (-q option is recommended) --test-workdir-local=DIR local directory for test [default: /tmp] --test-workdir-gfarm=DIR Gfarm directory for test [default: gfarm:/tmp] --dummy-num=NUM the number of dummy (random) files for input - (for -c) (ignore ) (1000 files per dir) + (for -c option) (ignore arguments) + (create dummy 1000 entries per dir) (default: disabled) --dummy-size-min=BYTES minimum size of dummy files [default: 0] --dummy-size-max=BYTES maximum size of dummy files [default: 1Mi] From 595d7e89f42573e14ddde680d1d2519204c50957 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 18 Jul 2024 15:40:46 +0900 Subject: [PATCH 032/143] gfptar: new option: --progress-unit --- gftool/gfptar/gfptar | 88 +++++++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 25 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f9d23aa15..063dd7ec6 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -68,8 +68,8 @@ def humanize_number(num, binary_prefix=False): if binary_prefix: units = ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'] base = Decimal(1024) - else: - units = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'] + else: # SI prefix + units = ['', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'] base = Decimal(1000) if num < base: return str(int(num)) @@ -97,10 +97,11 @@ def unhumanize_number(numstr, binary_prefix=False): return int(numstr) lastchar = numstr[-1] if lastchar == 'i' and numstrlen > 2: + # binary prefix n = int(numstr[:(numstrlen-2)]) prefix = numstr[-2] base = 1024 # even if binary_prefix=False - else: + else: # SI prefix n = int(numstr[:(numstrlen-1)]) prefix = lastchar @@ -2105,16 +2106,18 @@ class TestGfptar(unittest.TestCase): self.assertEqual(humanize_number(1023, binary_prefix=True), '1023') self.assertEqual(humanize_number(1024, binary_prefix=True), '1.0Ki') self.assertEqual(humanize_number(999), '999') - self.assertEqual(humanize_number(1000), '1.0K') - self.assertEqual(humanize_number(99999), '99.9K') - self.assertEqual(humanize_number(100000), '100K') + self.assertEqual(humanize_number(1000), '1.0k') + self.assertEqual(humanize_number(99999), '99.9k') + self.assertEqual(humanize_number(100000), '100k') self.assertEqual(humanize_number(1900000), '1.9M') self.assertEqual(humanize_number(2000000), '2.0M') def test_unhumanize(self): self.assertEqual(unhumanize_number('999'), 999) + self.assertEqual(unhumanize_number('1k'), 1000) self.assertEqual(unhumanize_number('1K'), 1000) - self.assertEqual(unhumanize_number('1K', binary_prefix=True), 1024) + self.assertEqual(unhumanize_number('1k', binary_prefix=True), 1024) + self.assertEqual(unhumanize_number('1ki'), 1024) self.assertEqual(unhumanize_number('1Ki'), 1024) self.assertEqual(unhumanize_number('2Mi'), 2097152) self.assertEqual(unhumanize_number('3Gi'), 3221225472) @@ -2291,6 +2294,16 @@ class GfptarCommand(Command): self.use_fsync = not self.opt['--disable-fsync'] self.workdir = self.opt['--workdir'] + progress_unit_type = self.opt['--progress-unit'] + if progress_unit_type == 'si': + self._humanize = self._humanize_si + elif progress_unit_type == 'bin': + self._humanize = self._humanize_bin + elif progress_unit_type == 'raw': + self._humanize = self._humanize_raw + else: + self._humanize = self._humanize_si + self.memory_limit = self.opt['--memory'] if self.memory_limit is not None: self.set_memory_limit(self.memory_limit) @@ -4032,19 +4045,28 @@ class GfptarCommand(Command): with self.lock(): self.info('extracted(done): {}', arch_url.url_str) + def 
_humanize_si(self, n): + return humanize_number(n) + + def _humanize_bin(self, n): + return humanize_number(n, binary_prefix=True) + + def _humanize_raw(self, n): + return int(n) + def progress_for_listing(self, now): sec = now - self.start_time sec_str = format_seconds(sec, minhour=True) - total_num_str = humanize_number(self.total_num) + total_num_str = self._humanize(self.total_num) if sec > 0: ent_per_sec = self.total_num / sec else: ent_per_sec = 0 - ent_per_sec_str = humanize_number(ent_per_sec) + ent_per_sec_str = self._humanize(ent_per_sec) sys.stdout.write(f'\rlisting: ' f'{total_num_str}Ent ' f'{sec_str} ' - f'{ent_per_sec_str}Ent/s ') + f'{ent_per_sec_str}Ent/s ') # lock required def progress_for_create(self, now): @@ -4069,18 +4091,18 @@ class GfptarCommand(Command): else: bytes_per_sec = 0 ent_per_sec = 0 - stored_num_str = humanize_number(self.stored_num) - total_num_str = humanize_number(self.total_num) - stored_size_str = humanize_number(self.stored_size) - total_size_str = humanize_number(self.total_size) - bytes_per_sec_str = humanize_number(bytes_per_sec) - ent_per_sec_str = humanize_number(ent_per_sec) + stored_num_str = self._humanize(self.stored_num) + total_num_str = self._humanize(self.total_num) + stored_size_str = self._humanize(self.stored_size) + total_size_str = self._humanize(self.total_size) + bytes_per_sec_str = self._humanize(bytes_per_sec) + ent_per_sec_str = self._humanize(ent_per_sec) sys.stdout.write(f'\rcreated: {percent_str}% ' - f'{stored_size_str}B/{total_size_str}B ' + f'{stored_size_str}/{total_size_str}B ' f'{stored_num_str}/{total_num_str}Ent ' f'{sec_str} ' f'{bytes_per_sec_str}B/s ' - f'{ent_per_sec_str}Ent/s ') + f'{ent_per_sec_str}Ent/s ') # lock required def progress_for_extract(self, now): @@ -4096,17 +4118,17 @@ class GfptarCommand(Command): else: bytes_per_sec = 0 ent_per_sec = 0 - extracted_num_str = humanize_number(self.extracted_num) - total_num_str = humanize_number(self.total_num) - extracted_size_str = humanize_number(self.extracted_size) - bytes_per_sec_str = humanize_number(bytes_per_sec) - ent_per_sec_str = humanize_number(ent_per_sec) + extracted_num_str = self._humanize(self.extracted_num) + total_num_str = self._humanize(self.total_num) + extracted_size_str = self._humanize(self.extracted_size) + bytes_per_sec_str = self._humanize(bytes_per_sec) + ent_per_sec_str = self._humanize(ent_per_sec) sys.stdout.write(f'\rextracted: {percent:.0f}% ' f'{extracted_size_str}B ' f'{extracted_num_str}/{total_num_str}Ent ' f'{sec_str} ' f'{bytes_per_sec_str}B/s ' - f'{ent_per_sec_str}Ent/s ') + f'{ent_per_sec_str}Ent/s ') def list_simple(self, indir, quiet=False): self.options_init() @@ -4196,6 +4218,16 @@ Example of --extract (Gfarm to Local): ... /home/user1/out2/dir/test9999.data +SI prefix or Binary prerix: + SI prefix: + - 1k = 10^3 = 1000^1 (kilo) (not K) + - 1M = 10^6 = 1000^2 (mega) + - ... + Binary prefix: + - 1Ki = 2^10 = 1024^1 (kibi) + - 1Mi = 2^20 = 1024^2 (mebi) + - ... + Limitations: - Hard links are not preserved. - File names cannot include newline characters. @@ -4233,7 +4265,12 @@ Options: --workdir=DIR local directory for temporary files (default: system temporary directory) --max-entries-per-tar=NUM the number of entries per tar file - to limit memory usage [default: 100K] + to limit memory usage [default: 100k] + --progress-unit=TYPE unit for progress + - si: SI prefix + - bin: Binary prefix + - raw: no conversion + [default: si] --memory=BYTES upper limit of memory size (bytes) (default: no limit) (ex. 
2Gi) --test test mode (-q option is recommended) @@ -4279,6 +4316,7 @@ _schema = Schema({ '--same-owner': bool, '--workdir': Or(str, None), '--max-entries-per-tar': Use(unhumanize_number), + '--progress-unit': str, '--memory': Or(Use(unhumanize_number), None), '--test': bool, '--test-workdir-local': Or(str, None), From c07f1a2e50ea2940a71bd3296adf634f7b245e4f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 24 Jul 2024 18:39:04 +0900 Subject: [PATCH 033/143] gfptar -v --list: create and use g*_info.db --- gftool/gfptar/gfptar | 396 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 350 insertions(+), 46 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 063dd7ec6..0e95c8c03 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -47,6 +47,7 @@ import tempfile import multiprocessing import queue import inspect +from collections import defaultdict from docopt import docopt from schema import Schema, Use, Or @@ -128,6 +129,7 @@ class DB: check_same_thread=check_same_thread) self.con.execute('PRAGMA synchronous = OFF') self.con.execute('PRAGMA journal_mode = OFF') # disable rollback + # self.con.execute('PRAGMA journal_mode = WAL') def commit(self): self.con.commit() @@ -209,6 +211,29 @@ class FileAttr(DBObj): return cls(array[0], array[1], array[2], array[3]) +class FileAttr2(DBObj): + def __init__(self, mode, mtime, user, group, size, linkname, ftype): + self.mode = mode + self.mtime = mtime + self.user = user + self.group = group + self.size = size + self.linkname = linkname + self.ftype = ftype + + @classmethod + def dumps(cls, obj, for_dict): + array = [obj.mode, obj.mtime, obj.user, obj.group, + obj.size, obj.linkname, obj.ftype] + return json.dumps(array, separators=(',', ':')) + + @classmethod + def loads(cls, key, txt, for_dict): + array = json.loads(txt) + return cls(array[0], array[1], array[2], array[3], + array[4], array[5], array[6]) + + # Abstract class DBCollection: def __init__(self, db, obj_cls, table_name, clear=False): @@ -795,6 +820,9 @@ class GfURLEntry(DBObj): t = cls.type_map_reverse[o[1]] return cls(path, o[0], t, o[2], o[3], o[4], o[5], o[6]) + def url(self): + return GfURL.init(self.path) + def subpath(self, baseurl): return baseurl.subpath(self.path) @@ -1027,6 +1055,14 @@ class GfURL(metaclass=abc.ABCMeta): def makedirs(self, mode=0o700): self.mkdir(mode, parents=True) + @abc.abstractmethod + def rename(self, dest): + raise NotImplementedError + + @abc.abstractmethod + def remove(self): + raise NotImplementedError + @abc.abstractmethod def remove_tree(self, remove_readonly=False): raise NotImplementedError @@ -1093,7 +1129,7 @@ class GfURL(metaclass=abc.ABCMeta): user=None, group=None, use_fsync=True, hostname=None): raise NotImplementedError - def copy_from(self, inf, bufsize, mode=None, mtime=0o600, + def copy_from(self, inf, bufsize, mode=0o600, mtime=None, user=None, group=None, use_fsync=True, hostname=None): readlen = 0 with self.writeopen(mode=mode, mtime=mtime, @@ -1310,6 +1346,12 @@ class GfURLGfarm(GfURL): # slow self.chmod(mode) + def rename(self, dest): + execcmd(['gfmv', self.url_str, dest]) + + def remove(self): + execcmd(['gfrm', self.url_str]) + def remove_tree(self, remove_readonly=False): path = self.url_str if path == '/' or path == '': @@ -1711,6 +1753,12 @@ class GfURLLocal(GfURL): file_path = os.path.join(root, f) os.chmod(file_path, file_mode) + def rename(self, dest): + os.rename(self.path, dest) + + def remove(self): + os.remove(self.path) + def remove_tree(self, remove_readonly=False): 
path = self.path if path == '/' or path == '': @@ -1873,12 +1921,7 @@ class GfURLLocal(GfURL): f.flush() os.fsync(f.fileno()) f.close() - if mode is not None: - self.chmod(mode, mtime=mtime, user=user, group=group) - else: - if mtime is not None: - self.utime(mtime, mtime) - self.chown(user, group) + self.chmod(mode, mtime=mtime, user=user, group=group) class GfTarFile(tarfile.TarFile): @@ -2308,20 +2351,27 @@ class GfptarCommand(Command): if self.memory_limit is not None: self.set_memory_limit(self.memory_limit) + self.compress_type = self.opt['--type'] + self.compress_prog = self.opt['--use-compress-program'] + def set_memory_limit(self, max_memory): resource.setrlimit(resource.RLIMIT_AS, (max_memory, max_memory)) def sig_init(self): def sig_handler(signum, frame): - print('') # new line + logger.warning(f'Interrupt (signal={signum}') self.cancel() + signal.signal(signal.SIGHUP, sig_handler) signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) + signal.signal(signal.SIGPIPE, signal.SIG_IGN) def sig_default(self): + signal.signal(signal.SIGHUP, signal.SIG_DFL) signal.signal(signal.SIGINT, signal.SIG_DFL) signal.signal(signal.SIGTERM, signal.SIG_DFL) + signal.signal(signal.SIGPIPE, signal.SIG_IGN) def getDoc(self) -> str: return __doc__ @@ -2895,7 +2945,7 @@ class GfptarCommand(Command): file_max_length = 30 choices = string.ascii_letters + string.digits + '漢あア()[]-' - other_symbols = '!"#$%&=^~|`@{}+*;:,.<>?_' + r"\'" + other_symbols = ' !"#$%&=^~|`@{}+*;:,.<>?_' + r"\'" choices += other_symbols now = time.time() @@ -2985,8 +3035,6 @@ class GfptarCommand(Command): self.max_entries_per_tar = 1 self.ratio = self.opt['--ratio'] - self.compress_type = self.opt['--type'] - self.compress_prog = self.opt['--use-compress-program'] self.disable_gfarm_command = self.opt['--disable-gfarm-command'] self.use_gfarm_command = not self.disable_gfarm_command self.gfsched_interval = self.opt['--gfsched-interval'] @@ -3649,7 +3697,6 @@ class GfptarCommand(Command): self.specified_members = specified_members self.same_owner = self.opt['--same-owner'] - self.compress_prog = self.opt['--use-compress-program'] self.disable_gfarm_command = self.opt['--disable-gfarm-command'] self.use_gfarm_command = not self.disable_gfarm_command self.gfsched_interval = self.opt['--gfsched-interval'] @@ -3702,13 +3749,16 @@ class GfptarCommand(Command): for ent in self.indir_url.listdir(recursive=False): if self.is_canceled(): raise self.error_canceled() - if ent.path.endswith(self.LIST_SUFFIX): # ignored + if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst, *.tar.lst + continue + base = os.path.basename(ent.path) + if not self.PATT_TAR.match(base): # ignore not *.tar.* continue - subpath = ent.subpath(self.indir_url) - arch_url_str = self.indir_url.url_join(subpath) - list_url = GfURL.init(arch_url_str + self.LIST_SUFFIX) + arch_url_str = ent.path + list_file = arch_url_str + self.LIST_SUFFIX + list_url = GfURL.init(list_file) if not list_url.exists(): - raise self.error_not_a_gfptar_directory(self.indir_url.url_str) + raise self.error_not_a_gfptar_directory(list_file) with list_url.readopen(textmode=True) as f: while True: line = f.readline() @@ -4149,41 +4199,295 @@ class GfptarCommand(Command): if not quiet: print(line.rstrip()) + def decompress_infodb(self, tmpdir, db_path): + db_url = GfURL.init(db_path) + base = os.path.basename(db_path) + tmpdb_path = os.path.join(tmpdir.name, base) + tmpdb_url = GfURL.init(tmpdb_path) + # TODO decompress + with db_url.readopen() as f: + 
tmpdb_url.copy_from(f, self.bufsize, use_fsync=self.use_fsync) + return tmpdb_url + + TABLE_ENTRY = 'entries' + TABLE_USER = 'users' + TABLE_GROUP = 'groups' + def list_verbose(self, indir, quiet=False): self.options_init() - self.compress_prog = self.opt['--use-compress-program'] indir_url = GfURL.init(indir) - archlist = [] + tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-', + dir=self.workdir) + self.gen_infodb_all(indir_url, tmpdir, update=False) + + # TODO .db.gz + db_gen1_pattern = re.compile(r'^(\d+)_.+.db') + # divide by generations : gNN_ (NN >= 2) + db_gen_pattern = re.compile(r'^g(\d+)_(\d+)_.+.db') + + infodb_list = [] + for ent in indir_url.listdir(recursive=False): + path = ent.path # fullpath when ent is Gfarm + base = os.path.basename(path) + g1_match = db_gen1_pattern.match(base) + if g1_match: + id_num = g1_match.group(1) + infodb_list.append((int(id_num), path)) + else: + g_match = db_gen_pattern.match(base) + if g_match: + id_num = g_match.group(2) + infodb_list.append((int(id_num), path)) + + def id_key(id_path): + id_num, path = id_path + return id_num + + infodb_list.sort(key=id_key) + for id_num, db_path in infodb_list: + tmpdb_url = self.decompress_infodb(tmpdir, db_path) + db = DB(tmpdb_url.path) + try: + fattr_dict = DBDict(db, FileAttr2, self.TABLE_ENTRY) + user_dict = DBDict(db, StrObj, self.TABLE_USER) + group_dict = DBDict(db, StrObj, self.TABLE_GROUP) + user_dict_mem = {} + group_dict_mem = {} + # cache in memory + for k, v in user_dict.items(): + user_dict_mem[k] = v + for k, v in group_dict.items(): + group_dict_mem[k] = v + for path, t in fattr_dict.iterator(sort='ASC'): + # id -> name + user = user_dict_mem.get(t.user, '???') + group = user_dict_mem.get(t.group, '???') + if not quiet: + print(f'{t.ftype} {t.mode:04o}' + f' {user:>10}/{group:<10}' + f' {t.size:9d} {t.mtime} {path}') + finally: + db.close() + + # *.tar or *.tar.* + PATT_TAR = re.compile(r'.*\.tar(\.\w{1,5})?$') + + def gen_infodb_all(self, indir_url, tmpdir, update=False): + tar_pattern = self.PATT_TAR + gen1_pattern = re.compile(r'^(\d+)_.+') + # divide by generations : gNN_ (NN >= 2) + gen_pattern = re.compile(r'^g(\d+)_(\d+)_.+') + gen_dict_list = defaultdict(list) + for ent in indir_url.listdir(recursive=False): - if ent.path.endswith(self.LIST_SUFFIX): + path = ent.path + base = os.path.basename(path) + if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst, *.tar.lst continue - archlist.append(ent.path) - archlist.sort() - for path in archlist: - arch_url = GfURL.init(path) - tar = GfTarFile.extract_open(arch_url, self.bufsize, - compress_prog=self.compress_prog) - while True: - try: - t = tar.next() - except MemoryError: - raise - except Exception as e: - logger.warning(f'{path}: SKIPPED: invalid or empty tar:' - f' {str(e)}') - t = None - if t is None: + if not tar_pattern.match(base): # ignore not *.tar.* + continue + g1_match = gen1_pattern.match(base) + if g1_match: + # generation number = 1 + gen_num = '1' + id_num = g1_match.group(1) + else: + g_match = gen_pattern.match(base) + if g_match: + gen_num = g_match.group(1) + id_num = g_match.group(2) + else: + # skip irrelevant file + continue + # gen -> tarlist + gen_dict_list[gen_num].append((id_num, path)) + + if self.jobs >= 1: + max_workers = self.jobs + else: + max_workers = 1 + + save_e = None + cancel = False + # Concurrent execution for each generation + with multiprocessing.Manager() as manager: + lock = manager.Lock() + share_cancel = manager.Value('i', 0) + + def sig_handler(signum, frame): + 
logger.warning(f'Interrupt (signal={signum}') + share_cancel.value = 1 + + signal.signal(signal.SIGHUP, sig_handler) + signal.signal(signal.SIGINT, sig_handler) + signal.signal(signal.SIGTERM, sig_handler) + signal.signal(signal.SIGPIPE, signal.SIG_IGN) + + for gen_num, tarlist in gen_dict_list.items(): + with concurrent.futures.ProcessPoolExecutor( + max_workers=max_workers) as executor: + arglist = [] + for id_num, tar_path in tarlist: + arglist.append((lock, share_cancel, tmpdir, + update, gen_num, id_num, tar_path, + self.bufsize, self.compress_prog, + self.use_fsync)) + # executor.map(gen_infodb_one, arglist) + futures = [executor.submit(gen_infodb_one, arg) + for arg in arglist] + + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except Exception as e: + # logger.error(f'{e}') + self.print_trace(e) + share_cancel.value = 1 + if save_e is None: + save_e = e + if share_cancel.value != 0: + cancel = True + self.sig_default() # no longer be able to access Manager + + if save_e: + raise save_e + if cancel: + raise self.error_canceled() + + +signal_initialized = False + + +def signal_init_for_gen_infodb(share_cancel): + global signal_initialized + + if not signal_initialized: + def sig_handler(signum, frame): + with share_cancel.get_lock(): + if share_cancel.value != 0: + logger.warning(f'Interrupt (signal={signum}') + share_cancel.value = 1 + + signal.signal(signal.SIGHUP, sig_handler) + signal.signal(signal.SIGINT, sig_handler) + signal.signal(signal.SIGTERM, sig_handler) + signal.signal(signal.SIGPIPE, signal.SIG_IGN) + signal_initialized = True + # logger.debug('signal_init_for_gen_infodb') + + +def gen_infodb_name(gen_num, id_num): + # ex. g2_0099_ + return f'g{gen_num}_{id_num:04}_info.db' + + +# ProcessPoolExecutor cannot serialize "self" object of GfptarCommand +def gen_infodb_one(args): + (lock, share_cancel, tmpdir, update, gen_num, id_num, + tar_path, bufsize, compress_prog, use_fsync) = args + if share_cancel.value != 0: + # logger.debug('Canceled (2)') + return + tar_url = GfURL.init(tar_path) + indir_url = tar_url.parent + db_name = gen_infodb_name(gen_num, id_num) + db_path = indir_url.url_join(db_name) + db_url = GfURL.init(db_path) + if not update and db_url.exists(): + logger.debug(f'not update: {db_path}') + return + signal_init_for_gen_infodb(share_cancel) + + # Local file + tmpdb_path = os.path.join(tmpdir.name, str(id_num) + '_info.db') + tmpdb = DB(tmpdb_path) + fattr_dict = DBDict(tmpdb, FileAttr2, GfptarCommand.TABLE_ENTRY) + + user_dict = {} # use memory for speed + group_dict = {} # use memory for speed + + tar = GfTarFile.extract_open(tar_url, bufsize, + compress_prog=compress_prog) + db_close = False + try: + interval = 1 # sec. 
| for interrupt + next_check = time.time() + interval + while True: + now = time.time() + if now >= next_check: + # access manager.Value(): very high cost + if share_cancel.value != 0: + logger.info('Canceled') break - name = t.name - if t.isdir(): - name = name + '/' - elif t.issym(): - name = name + ' -> ' + t.linkname - info = (f'{t.mode:04o} {t.uname:>10}/{t.gname:<10}' - f' {t.size:9d} {t.mtime} {name}') - # logger.debug(info) - if not quiet: - print(info) + next_check = now + interval + try: + t = tar.next() + except MemoryError: + raise + except Exception as e: + logger.warning(f'{tar_path}: SKIPPED: invalid or empty tar:' + f' {str(e)}') + t = None + if t is None: + break + # name = t.name + # if t.isdir(): + # name = name + '/' + # elif t.issym(): + # name = name + ' -> ' + t.linkname + # info = (f'{gen_num}:{id_num} {t.mode:04o}' + # f' {t.uname:>10}/{t.gname:<10}' + # f' {t.size:9d} {t.mtime} {name}') + # logger.debug(info) + # logger.debug(f'add to DB: {t.name}') + if t.isfile(): + ftype = 'F' + elif t.isdir(): + ftype = 'D' + elif t.issym(): + ftype = 'L' + else: + ftype = '?' + user_id = user_dict.get(t.uname, None) # not uid + if user_id is None: + user_id = str(len(user_dict)) + user_dict[t.uname] = user_id + group_id = group_dict.get(t.gname, None) # not gid + if group_id is None: + group_id = str(len(group_dict)) + group_dict[t.gname] = group_id + fattr = FileAttr2(t.mode, int(t.mtime), user_id, group_id, + t.size, t.linkname, ftype) + fattr_dict[t.name] = fattr + # success + + # id -> name + rev_user_dict = DBDict(tmpdb, StrObj, GfptarCommand.TABLE_USER) + rev_group_dict = DBDict(tmpdb, StrObj, GfptarCommand.TABLE_GROUP) + for k, v in user_dict.items(): + rev_user_dict[v] = k + for k, v in group_dict.items(): + rev_group_dict[v] = k + tmpdb.commit() + tmpdb.close() + db_close = True + + db_path0 = indir_url.url_join(db_name + '.tmp') + db_url0 = GfURL.init(db_path0) + tmpdb_url = GfURL.init(tmpdb_path) + if db_url0.exists(): + db_url0.remove() + # TODO convert .gz : gzip_writeopen(), copy_from(, gzip=True) + # copy infodb from tmpdir to indir + with tmpdb_url.readopen() as f: + db_url0.copy_from(f, bufsize, use_fsync=use_fsync) + # atomic operation to avoid leaving junk files + db_url0.rename(db_path) + finally: + tar.close() + if not db_close: + tmpdb.commit() + tmpdb.close() progname = os.path.basename(__file__) From d4f2b37ca9e6747bffd7b28d1cfd723da320a999 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 24 Jul 2024 18:41:43 +0900 Subject: [PATCH 034/143] sequel to "gfptar -v --list: create and use g*_info.db" --- gftool/gfptar/gfptar | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 0e95c8c03..422346f42 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4261,7 +4261,7 @@ class GfptarCommand(Command): for path, t in fattr_dict.iterator(sort='ASC'): # id -> name user = user_dict_mem.get(t.user, '???') - group = user_dict_mem.get(t.group, '???') + group = group_dict_mem.get(t.group, '???') if not quiet: print(f'{t.ftype} {t.mode:04o}' f' {user:>10}/{group:<10}' From b72a337a4d95a3da17671e47a52838fdfc3c6f8a Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 25 Jul 2024 13:32:50 +0900 Subject: [PATCH 035/143] gfptar: create g*_info.db.gz gfptar: new option: --gzip-program,--bzip2-program,--xz-program gfptar: new option: --test-long --- gftool/gfptar/gfptar | 231 ++++++++++++++++++++++++++++++------------- 1 file changed, 161 insertions(+), 70 deletions(-) diff --git 
a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 422346f42..326da6b87 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -1924,8 +1924,58 @@ class GfURLLocal(GfURL): self.chmod(mode, mtime=mtime, user=user, group=group) +class Compress: + TYPE_NO = 'no' + TYPE_GZIP = 'gz' + TYPE_BZIP2 = 'bz2' + TYPE_XZ = 'xz' + + @classmethod + def set_default(cls, compress_type, compress_prog, + gzip_prog, bzip2_prog, xz_prog): + if gzip_prog is not None: + cls.gzip_prog = shutil.which(gzip_prog) + else: + cls.gzip_prog = None + if bzip2_prog is not None: + cls.bzip2_prog = shutil.which(bzip2_prog) + else: + cls.bzip2_prog = None + if xz_prog is not None: + cls.xz_prog = shutil.which(xz_prog) + else: + cls.xz_prog = None + + cls.compress_type = compress_type + if compress_prog is None: + if compress_type == cls.TYPE_GZIP: + compress_prog = cls.gzip_prog + elif compress_type == cls.TYPE_BZIP2: + compress_prog = cls.bzip2_prog + elif compress_type == cls.TYPE_XZ: + compress_prog = cls.xz_prog + cls.compress_prog = compress_prog # may be None + + @classmethod + def compress(cls, compress_prog, outf): + args = [compress_prog] + # binary mode + return subprocess.Popen( + args, shell=False, close_fds=True, + stdin=subprocess.PIPE, stdout=outf, + stderr=DEFAULT_STDERR) + + @classmethod + def decompress(cls, compress_prog, inf): + args = [compress_prog, '-d'] + # binary mode + return subprocess.Popen( + args, shell=False, close_fds=True, + stdin=inf, stdout=subprocess.PIPE, + stderr=DEFAULT_STDERR) + + class GfTarFile(tarfile.TarFile): - COMPRESS_TYPE_NO = 'no' ATTR_PROC_LIST = '_gfptar_proc_list' # [(proc, fileobj, fileobj), ...] ATTR_USE_FSYNC = 'use_fsync' ATTR_DUMMY_SLEEP = 'dummy_sleep' @@ -1933,6 +1983,8 @@ class GfTarFile(tarfile.TarFile): @classmethod def extract_open(cls, gfurl, copybufsize, compress_prog=None): + if compress_prog is None: + compress_prog = Compress.compress_prog # use Stream (not seekable) if compress_prog is not None: openmode = 'r|' @@ -1946,8 +1998,8 @@ class GfTarFile(tarfile.TarFile): if gfurl.is_gfarm(): if compress_prog: gfexport_proc = gfurl.gfexport() - decompress_proc = cls.decompress(compress_prog, - gfexport_proc.stdout) + decompress_proc = Compress.decompress(compress_prog, + gfexport_proc.stdout) tar = cls.open(None, mode=openmode, fileobj=decompress_proc.stdout, copybufsize=copybufsize) @@ -1965,7 +2017,7 @@ class GfTarFile(tarfile.TarFile): else: if compress_prog: inf = open(gfurl.url_str, 'rb') - decompress_proc = cls.decompress(compress_prog, inf) + decompress_proc = Compress.decompress(compress_prog, inf) tar = cls.open(None, mode=openmode, fileobj=decompress_proc.stdout, copybufsize=copybufsize) @@ -1982,11 +2034,14 @@ class GfTarFile(tarfile.TarFile): def create_open(cls, gfurl, compress_type, copybufsize, compress_prog=None, use_fsync=True, target_host=None, dummy_input=False, dummy_sleep=0): + if compress_prog is None: + compress_prog = Compress.compress_prog # use Stream (not seekable) - openmode = 'w|' if compress_prog is None \ - and compress_type != cls.COMPRESS_TYPE_NO: + and compress_type != Compress.TYPE_NO: openmode = 'w|' + compress_type + else: + openmode = 'w|' if gfurl.exists(): raise FileExistsError(gfurl.url_str) # list of tuple(proc, closeable obj, synchronizable obj) @@ -1994,8 +2049,8 @@ class GfTarFile(tarfile.TarFile): if gfurl.is_gfarm(): if compress_prog: gfreg_obj = gfurl.gfreg(mode=0o600, hostname=target_host) - compress_proc = cls.compress(compress_prog, - gfreg_obj.stdin) + compress_proc = Compress.compress(compress_prog, + 
gfreg_obj.stdin) tar = cls.open(None, mode=openmode, fileobj=compress_proc.stdin, copybufsize=copybufsize) @@ -2011,7 +2066,7 @@ class GfTarFile(tarfile.TarFile): else: # Local if compress_prog: outf = open(gfurl.url_str, 'wb') - compress_proc = cls.compress(compress_prog, outf) + compress_proc = Compress.compress(compress_prog, outf) tar = cls.open(None, mode=openmode, fileobj=compress_proc.stdin, copybufsize=copybufsize) @@ -2034,24 +2089,6 @@ class GfTarFile(tarfile.TarFile): setattr(tar, cls.METHOD_add_entry, tar._add_entry) return tar - @classmethod - def compress(cls, compress_prog, outf): - args = [compress_prog] - # binary mode - return subprocess.Popen( - args, shell=False, close_fds=True, - stdin=subprocess.PIPE, stdout=outf, - stderr=DEFAULT_STDERR) - - @classmethod - def decompress(cls, compress_prog, inf): - args = [compress_prog, '-d'] - # binary mode - return subprocess.Popen( - args, shell=False, close_fds=True, - stdin=inf, stdout=subprocess.PIPE, - stderr=DEFAULT_STDERR) - # override def close(self): super().close() @@ -2353,6 +2390,13 @@ class GfptarCommand(Command): self.compress_type = self.opt['--type'] self.compress_prog = self.opt['--use-compress-program'] + self.gzip_prog = self.opt['--gzip-program'] + self.xz_prog = self.opt['--xz-program'] + self.bzip2_prog = self.opt['--bzip2-program'] + Compress.set_default(self.compress_type, self.compress_prog, + self.gzip_prog, self.bzip2_prog, self.xz_prog) + if Compress.gzip_prog is None: + raise GfException('gzip: command not found. Please install gzip.') def set_memory_limit(self, max_memory): resource.setrlimit(resource.RLIMIT_AS, (max_memory, max_memory)) @@ -2408,7 +2452,10 @@ class GfptarCommand(Command): return if self.opt['--test']: - self.test_main() + self.test_main_short() + return + if self.opt['--test-long']: + self.test_main_long() return except Exception as e: self.cancel() @@ -2432,7 +2479,7 @@ class GfptarCommand(Command): self.sig_default() logger.debug('exit') - def test_main(self): + def test_init(self): self.am_I_gfarmroot = am_I_gfarmroot() if self.am_I_gfarmroot: logger.error('warning: gfarmroot is enabled: ' @@ -2444,6 +2491,13 @@ class GfptarCommand(Command): out = gfwhoami() self.gfarm_user = out.strip() + def test_main_short(self): + self.test_init() + self.test_opt_pattern(full=False) + self.test_specified_dir() + + def test_main_long(self): + self.test_init() self.test_unit() self.test_invalid('url', 'gfarm:/tmp', 'dst', True) self.test_invalid('dot1', '.', 'dst', True) @@ -2455,8 +2509,7 @@ class GfptarCommand(Command): self.test_invalid('dotdot3', '../abc', 'dst', False) self.test_invalid('dotdot4', './..', 'dst', False) self.test_member() - self.test_opt_pattern() - self.test_specified_dir() + self.test_opt_pattern(full=True) def test_unit(self): verbosity = 2 @@ -2466,7 +2519,7 @@ class GfptarCommand(Command): logger.error_exit(1, 'unittest error') print('unittest ... 
PASS') - def test_opt_pattern(self): + def test_opt_pattern(self, full=False): save_opt_size = self.opt['--size'] save_opt_jobs = self.opt['--jobs'] save_opt_type = self.opt['--type'] @@ -2474,7 +2527,10 @@ class GfptarCommand(Command): # create tar per one entry self.opt['--size'] = 0 - pattern_jobs = [0, 10] + if full: + pattern_jobs = [0, 10] + else: + pattern_jobs = [16] for jobs in pattern_jobs: self.opt['--jobs'] = jobs self.test_simple('jobs_' + str(jobs), use_all_files=True) @@ -2482,24 +2538,35 @@ class GfptarCommand(Command): # create one tar self.opt['--size'] = unhumanize_number('100M') - pattern_type = [ - 'gz', - # 'bz2', - # 'xz', - 'no'] + if full: + pattern_type = [ + 'gz', + 'bz2', + 'xz', + 'no'] + else: + pattern_type = ['gz'] for t in pattern_type: self.opt['--type'] = t self.test_simple('type_' + t) self.opt['--type'] = save_opt_type - pattern_compress_prog = { - # 'gz': 'gzip', - # 'bz2': 'bzip2', - 'xz': 'xz', - # 'lzip': 'lzip', - # 'lzop': 'lzop', - } - for t, prog in pattern_compress_prog.items(): + if full: + pattern_compress_prog = { + 'gzip': 'gz', + 'pigz': 'gz', + 'bzip2': 'bz2', + 'xz': 'xz', + # 'pbzip2': 'bz2', + # 'lzip': 'lz', + # 'lzop': 'lzo', + # 'lz4': 'lz4', + } + else: + pattern_compress_prog = { + 'xz': 'xz', + } + for prog, t in pattern_compress_prog.items(): w = shutil.which(prog) if not w: logger.error('SKIPPED: No such command: %s', prog) @@ -3047,7 +3114,8 @@ class GfptarCommand(Command): self.dummy_size_min = self.opt['--dummy-size-min'] self.dummy_size_max = self.opt['--dummy-size-max'] self.dummy_sleep = self.opt['--dummy-sleep'] - if self.compress_type == GfTarFile.COMPRESS_TYPE_NO: + + if self.compress_type == Compress.TYPE_NO: self.split_size = self.assumed_size self.suffix = '.tar' else: @@ -3611,7 +3679,6 @@ class GfptarCommand(Command): use_gfarm_command=self.use_gfarm_command) target_host = self.select_a_target_host(outurl, serial) tar = GfTarFile.create_open(outurl, self.compress_type, self.bufsize, - compress_prog=self.compress_prog, use_fsync=self.use_fsync, target_host=target_host, dummy_input=self.dummy_input, @@ -3996,8 +4063,7 @@ class GfptarCommand(Command): return arch_url = GfURL.init(target, use_gfarm_command=self.use_gfarm_command) - tar = GfTarFile.extract_open(arch_url, self.bufsize, - compress_prog=self.compress_prog) + tar = GfTarFile.extract_open(arch_url, self.bufsize) with self.lock(): members_num = len(member_set) index = serial @@ -4199,15 +4265,21 @@ class GfptarCommand(Command): if not quiet: print(line.rstrip()) - def decompress_infodb(self, tmpdir, db_path): + def decompress_infodb(self, outdir, in_dbgz_path): + dbgz_url = GfURL.init(in_dbgz_path) + base = os.path.basename(in_dbgz_path) + db_path = os.path.join(outdir.name, base) db_url = GfURL.init(db_path) - base = os.path.basename(db_path) - tmpdb_path = os.path.join(tmpdir.name, base) - tmpdb_url = GfURL.init(tmpdb_path) - # TODO decompress - with db_url.readopen() as f: - tmpdb_url.copy_from(f, self.bufsize, use_fsync=self.use_fsync) - return tmpdb_url + with dbgz_url.readopen() as inf: + proc = Compress.decompress(Compress.gzip_prog, inf) + with db_url.writeopen(use_fsync=self.use_fsync) as outf: + shutil.copyfileobj(proc.stdout, outf, self.bufsize) + proc.stdout.close() + ret = proc.wait() + if ret != 0: + raise GfException('{}: returncode={}'.format( + ' '.join(proc.args), ret)) + return db_url TABLE_ENTRY = 'entries' TABLE_USER = 'users' @@ -4330,8 +4402,7 @@ class GfptarCommand(Command): for id_num, tar_path in tarlist: arglist.append((lock, 
share_cancel, tmpdir, update, gen_num, id_num, tar_path, - self.bufsize, self.compress_prog, - self.use_fsync)) + self.bufsize, self.use_fsync)) # executor.map(gen_infodb_one, arglist) futures = [executor.submit(gen_infodb_one, arg) for arg in arglist] @@ -4378,13 +4449,13 @@ def signal_init_for_gen_infodb(share_cancel): def gen_infodb_name(gen_num, id_num): # ex. g2_0099_ - return f'g{gen_num}_{id_num:04}_info.db' + return f'g{gen_num}_{id_num}_info.db.gz' # ProcessPoolExecutor cannot serialize "self" object of GfptarCommand def gen_infodb_one(args): (lock, share_cancel, tmpdir, update, gen_num, id_num, - tar_path, bufsize, compress_prog, use_fsync) = args + tar_path, bufsize, use_fsync) = args if share_cancel.value != 0: # logger.debug('Canceled (2)') return @@ -4406,8 +4477,7 @@ def gen_infodb_one(args): user_dict = {} # use memory for speed group_dict = {} # use memory for speed - tar = GfTarFile.extract_open(tar_url, bufsize, - compress_prog=compress_prog) + tar = GfTarFile.extract_open(tar_url, bufsize) db_close = False try: interval = 1 # sec. | for interrupt @@ -4477,10 +4547,17 @@ def gen_infodb_one(args): tmpdb_url = GfURL.init(tmpdb_path) if db_url0.exists(): db_url0.remove() - # TODO convert .gz : gzip_writeopen(), copy_from(, gzip=True) # copy infodb from tmpdir to indir - with tmpdb_url.readopen() as f: - db_url0.copy_from(f, bufsize, use_fsync=use_fsync) + with db_url0.writeopen(use_fsync=use_fsync) as outf: + proc = Compress.compress(Compress.gzip_prog, outf) + with tmpdb_url.readopen() as inf: + shutil.copyfileobj(inf, proc.stdin, bufsize) + proc.stdin.close() + ret = proc.wait() + if ret != 0: + raise GfException('{}: returncode={}'.format( + ' '.join(proc.args), ret)) + # atomic operation to avoid leaving junk files db_url0.rename(db_path) finally: @@ -4549,12 +4626,20 @@ Options: -j, --jobs=NUM the number of jobs to copy per tar file in parallel [default: 4] -s, --size=BYTES assumed bytes per output file [default: 200Mi] - -T, --type=TYPE compress type (gz,bz2,xz,no) [default: gz] + -T, --type=TYPE compression type (and tar archive suffix) + - gz : use gzip (*.tar.gz) + - bz2: use bzip2 (*.tar.bz2) + - xz : use xz (*.tar.xz) + - no : no compression (*.tar) + [default: gz] -r, --ratio=RATIO assumed compression ratio (%) [default: 50] -I, --use-compress-program=COMMAND filter data through COMMAND, the command must accept -d option for decompression - (ex. pigz) + (ex. lz4, lzip, lzop) + --gzip-program=COMMAND gzip command (ex. pigz) [default: gzip] + --bzip2-program=COMMAND bzip2 command (ex. pbzip2) [default: bzip2] + --xz-program=COMMAND xz command [default: xz] --same-owner extract files with the same ownership (for euid=0 on local, or gfarmroot on Gfarm) --disable-gfarm-command disable the use of gfreg and gfexport @@ -4577,7 +4662,8 @@ Options: [default: si] --memory=BYTES upper limit of memory size (bytes) (default: no limit) (ex. 2Gi) - --test test mode (-q option is recommended) + --test run short tests (-q option is recommended) + --test-long run long tests (-q option is recommended) --test-workdir-local=DIR local directory for test [default: /tmp] --test-workdir-gfarm=DIR Gfarm directory for test [default: gfarm:/tmp] --dummy-num=NUM the number of dummy (random) files for input @@ -4598,6 +4684,7 @@ Usage: {f} [options] -t {f} [options] --test {f} [options] --test -C ... 
+ {f} [options] --test-long {f} -h | --help """.format(f=progname) @@ -4614,6 +4701,9 @@ _schema = Schema({ '--ratio': Use(int), '--jobs': Use(int), '--use-compress-program': Or(str, None), + '--gzip-program': Or(str, None), + '--bzip2-program': Or(str, None), + '--xz-program': Or(str, None), '--disable-gfarm-command': bool, '--disable-fsync': bool, '--gfsched-interval': Use(int), @@ -4623,6 +4713,7 @@ _schema = Schema({ '--progress-unit': str, '--memory': Or(Use(unhumanize_number), None), '--test': bool, + '--test-long': bool, '--test-workdir-local': Or(str, None), '--test-workdir-gfarm': Or(str, None), '--dummy-num': Or(Use(unhumanize_number), None), From 104747808998ede1e2de508f54af65fcef711362 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 25 Jul 2024 16:48:18 +0900 Subject: [PATCH 036/143] gfptar --create: create g*_info.db.gz --- gftool/gfptar/gfptar | 265 ++++++++++++++++++++++++------------------- 1 file changed, 150 insertions(+), 115 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 326da6b87..b355736d8 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -193,7 +193,7 @@ class StrObj(DBObj): return txt -class FileAttr(DBObj): +class FileAttr1(DBObj): def __init__(self, mode, mtime, user, group): self.mode = mode self.mtime = mtime @@ -797,6 +797,7 @@ class GfURLEntry(DBObj): return (f'Entry(path={self.path},mode={oct(self.mode)},' f'user={self.uname},group={self.gname})') + # TODO remove # only path must be specified for key when using DBDict @classmethod def dumps(cls, obj, for_dict): @@ -810,6 +811,7 @@ class GfURLEntry(DBObj): array.append(obj.path) # [7] return json.dumps(array, separators=(',', ':')) + # TODO remove @classmethod def loads(cls, key, txt, for_dict): o = json.loads(txt) @@ -2124,6 +2126,7 @@ class GfTarFile(tarfile.TarFile): self.addfile(tarinfo, fileobj=f) else: self.addfile(tarinfo) + return tarinfo # NOTE: add() is not expected behavior. Use addfile() instead. # - add() can copy a hard link, @@ -2142,6 +2145,7 @@ class GfTarFile(tarfile.TarFile): self.addfile(tarinfo, fileobj=f) else: self.addfile(tarinfo) + return tarinfo class RandomStream: @@ -3183,6 +3187,7 @@ class GfptarCommand(Command): tardb_prefix = os.path.join(tmpdir.name, 'list_for_create') tardb_fmt = f'_{self.SERIAL_FORMAT}.db' serial = 1 + gen = 1 # TODO # to reduce memory usage tarlist_db = DB(tardb_prefix + tardb_fmt % serial) tarlist = DBList(tarlist_db, GfURLEntry, 'tarlist') @@ -3217,7 +3222,7 @@ class GfptarCommand(Command): tarlist.close() # DO NOT share a DBList with children, # share the filename instead. 
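# (A minimal, self-contained sketch of the hand-off noted in the comment
# above: the sqlite3 connection behind a DBList cannot be shared with a
# child process, so only the database file name is passed and the child
# reopens it.  All names below are illustrative, not gfptar APIs.)
import multiprocessing
import sqlite3

def count_rows(db_path):
    # the worker reopens the database by path instead of receiving an object
    conn = sqlite3.connect(db_path)
    try:
        return conn.execute('SELECT COUNT(*) FROM filelist').fetchone()[0]
    finally:
        conn.close()

if __name__ == '__main__':
    conn = sqlite3.connect('filelist.db')
    conn.execute('CREATE TABLE IF NOT EXISTS filelist (path TEXT)')
    conn.execute("INSERT INTO filelist VALUES ('a'), ('b')")
    conn.commit()
    conn.close()  # commit and close before handing the file name to a child
    with multiprocessing.Pool(1) as pool:
        print(pool.apply(count_rows, ('filelist.db',)))  # -> 2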
- self.create_job_execute(serial, + self.create_job_execute(gen, serial, tarlist.filename()) except Exception as e1: self.cancel() @@ -3262,7 +3267,7 @@ class GfptarCommand(Command): try: tarlist.commit() tarlist.close() - self.create_job_execute(serial, tarlist.filename()) + self.create_job_execute(gen, serial, tarlist.filename()) except Exception as e: self.cancel() if has_error is None: @@ -3327,11 +3332,11 @@ class GfptarCommand(Command): started.status = False self.worker_list.append((started, process, inq, outq)) - def _create_job_execute(self, serial, arg): - self.create_a_tar_thread(serial, arg) + def _create_job_execute(self, gen, serial, arg): + self.create_a_tar_thread(gen, serial, arg) - def _create_job_execute_MT(self, serial, arg): - t = self.thread_pool.submit(self.create_a_tar_thread, serial, arg) + def _create_job_execute_MT(self, gen, serial, arg): + t = self.thread_pool.submit(self.create_a_tar_thread, gen, serial, arg) self.futures[t] = serial has_error = None try: @@ -3492,17 +3497,17 @@ class GfptarCommand(Command): logger.debug("selected target_host: %s", target_host) return target_host - def create_a_tar_thread(self, serial, dbfile): + def create_a_tar_thread(self, gen, serial, dbfile): try: - self.create_a_tar_thread0(serial, dbfile) + self.create_a_tar_thread0(gen, serial, dbfile) except Exception: if self.is_canceled(): raise self.error_canceled() else: raise - def create_a_tar_thread0(self, serial, dbfile): - logger.debug(f'create_a_tar: start (serial={serial})') + def create_a_tar_thread0(self, gen, serial, dbfile): + logger.debug(f'create_a_tar: start (gen={gen}, serial={serial})') if self.is_canceled(): logger.debug(f'Canceled (create 1): serial={serial}') return @@ -3522,7 +3527,7 @@ class GfptarCommand(Command): process.start() # high cost to start started.status = True - inq.put(('START', serial, dbfile)) + inq.put(('START', gen, serial, dbfile)) response = outq.get() if response != 'READY': logger.error(f'Unexpected child response: {response}') @@ -3563,11 +3568,11 @@ class GfptarCommand(Command): self.progress_for_create(now) elif result[0] == 'DONE': (status, tar_size, cannot_be_archived, - outurl, listurl) = result + outurl, infodb_url) = result with self.lock(): self.archived_size += tar_size self.info(f'created(.tar): {outurl}') - self.info(f'created(.lst): {listurl}') + self.info(f'created(.lst): {infodb_url}') break elif result[0] == 'ERR': (status, exc_type_name, exc_value_str, @@ -3604,14 +3609,14 @@ class GfptarCommand(Command): else: logger.error(f'Unexpected request for child: {request}') return # exit - op, serial, dbfile = request + op, gen, serial, dbfile = request logger.debug(f'create_a_tar_process0: start (serial={serial})') try: result = self.create_a_tar_process1(input_queue, output_queue, - serial, dbfile) - tar_size, cannot_be_archived, outurl, listurl = result + gen, serial, dbfile) + tar_size, cannot_be_archived, outurl, infodb_url = result output_queue.put(('DONE', tar_size, cannot_be_archived, - outurl, listurl)) + outurl, infodb_url)) logger.debug(f'subprocess exits: serial={serial}') except KeyboardInterrupt: pass @@ -3631,7 +3636,8 @@ class GfptarCommand(Command): except queue.Empty: pass - def create_a_tar_process1(self, input_queue, output_queue, serial, dbfile): + def create_a_tar_process1(self, input_queue, output_queue, + gen, serial, dbfile): logger.debug(f'create_a_tar_process1: start (serial={serial})') tardb1 = DB(dbfile) filelist = DBList(tardb1, GfURLEntry, 'tarlist') @@ -3662,8 +3668,12 @@ class 
GfptarCommand(Command): outname = '%s..%s%s' % (firstpath, lastpath, self.suffix) serial_str = f'{self.SERIAL_FORMAT}_' % serial + if gen >= 2: + prefix_str = f'g{gen}_{serial_str}' + else: + prefix_str = serial_str outname_max = self.outdir_url.MAXNAMLEN \ - - len(serial_str) - len(self.LIST_SUFFIX) + - len(prefix_str) - len(self.LIST_SUFFIX) outname_len = len(outname.encode()) offset = 0 while outname_len > outname_max: @@ -3674,7 +3684,7 @@ class GfptarCommand(Command): # loop for multibyte characters offset += 1 # ex.: home/user1/dir -> home_user1_dir - outname = serial_str + outname.replace('/', '_') + outname = prefix_str + outname.replace('/', '_') outurl = GfURL.init(self.outdir_url.url_join(outname), use_gfarm_command=self.use_gfarm_command) target_host = self.select_a_target_host(outurl, serial) @@ -3683,53 +3693,68 @@ class GfptarCommand(Command): target_host=target_host, dummy_input=self.dummy_input, dummy_sleep=self.dummy_sleep) - # to reduce memory usage + # SEE ALSO: gen_infodb_one() + tmpdb_path = tardb1.filename + '_info.db' + infodb = InfoDB(tmpdb_path) + + # TODO remove tardb_ok = DB(tardb1.filename + '_ok.db') filelist_ok = DBList(tardb_ok, GfURLEntry, 'filelist_ok') + cannot_be_archived = 0 - for entry in filelist: - logger.debug(f'subprocess(serial={serial}): {entry.path}') - while not input_queue.empty(): + try: + for entry in filelist: + logger.debug(f'subprocess(serial={serial}): {entry.path}') + while not input_queue.empty(): + try: + qdata = input_queue.get(timeout=1) + except queue.Empty: + qdata = None + if qdata == 'CANCEL': + logger.debug('receive CANCEL from parent') + break + else: + logger.error('unexpected message from parent') + break + subpath = entry.subpath(self.basedir_url) try: - qdata = input_queue.get(timeout=1) - except queue.Empty: - qdata = None - if qdata == 'CANCEL': - logger.debug('receive CANCEL from parent') - break - else: - logger.error('unexpected message from parent') - break - subpath = entry.subpath(self.basedir_url) - try: - logger.debug(f'tar.add_entry: {subpath}') - tar.add_entry(subpath, entry) - filelist_ok.append(entry) - size_all = entry.size_all() - output_queue.put(('ADD', subpath, size_all)) - except MemoryError: - tar.close() - raise - except Exception as e: - cannot_be_archived += 1 - logger.warning(convert_message(e)) - continue + logger.debug(f'tar.add_entry: {subpath}') + tarinfo = tar.add_entry(subpath, entry) + infodb.add(tarinfo) + filelist_ok.append(entry) # TODO remove + size_all = entry.size_all() + output_queue.put(('ADD', subpath, size_all)) + except MemoryError: + tar.close() + raise + except Exception as e: + cannot_be_archived += 1 + logger.warning(convert_message(e)) + continue + finally: + tar.close() + tardb1.close() + tardb1.unlink() + infodb.close() - tar.close() - tardb1.close() - tardb1.unlink() + # SEE ALSO: gen_infodb_name() + db_name = f'g{gen}_{serial_str}info.db.gz' + db_url_str = self.outdir_url.url_join(db_name) + infodb.compress_copy(db_url_str, self.bufsize, self.use_fsync) # for DEBUG # raise Exception('unexpected raise') - listurl = self.create_a_members_list(outurl, filelist_ok, target_host) + # TODO remove + self.create_a_members_list(outurl, filelist_ok, target_host) tardb_ok.close() tardb_ok.unlink() tar_size = outurl.size() - return tar_size, cannot_be_archived, outurl.url_str, listurl.url_str + return tar_size, cannot_be_archived, outurl.url_str, db_url_str + # TODO remove def create_a_members_list(self, url, filelist, target_host): outurl = GfURL.init(url.url_str + self.LIST_SUFFIX) 
with outurl.writeopen(textmode=True, use_fsync=self.use_fsync, @@ -3918,7 +3943,7 @@ class GfptarCommand(Command): self.extracted_size = 0 self.start_time = time.time() self.next_time = self.start_time + 1 - self.dirstat_dict = DBDict(self.db, FileAttr, 'dirstat_dict') + self.dirstat_dict = DBDict(self.db, FileAttr1, 'dirstat_dict') self.gfsched_lock = None self.gfsched_next = 0 @@ -4127,8 +4152,8 @@ class GfptarCommand(Command): elif tarinfo.isdir(): # NOTE: already created logger.debug('extract,dir: %s', outfile) - fattr = FileAttr(tarinfo.mode, tarinfo.mtime, - tarinfo.uname, tarinfo.gname) + fattr = FileAttr1(tarinfo.mode, tarinfo.mtime, + tarinfo.uname, tarinfo.gname) with self.lock(): self.dirstat_dict[outfile] = fattr elif tarinfo.issym(): @@ -4434,10 +4459,9 @@ def signal_init_for_gen_infodb(share_cancel): if not signal_initialized: def sig_handler(signum, frame): - with share_cancel.get_lock(): - if share_cancel.value != 0: - logger.warning(f'Interrupt (signal={signum}') - share_cancel.value = 1 + if share_cancel.value != 0: + logger.warning(f'Interrupt (signal={signum}') + share_cancel.value = 1 signal.signal(signal.SIGHUP, sig_handler) signal.signal(signal.SIGINT, sig_handler) @@ -4452,6 +4476,67 @@ def gen_infodb_name(gen_num, id_num): return f'g{gen_num}_{id_num}_info.db.gz' +class InfoDB: + def __init__(self, dbfile_path): + self.db = DB(dbfile_path) + self.fattr_dict = DBDict(self.db, FileAttr2, GfptarCommand.TABLE_ENTRY) + self.user_dict = {} # use memory for speed + self.group_dict = {} # use memory for speed + + def add(self, tarinfo): + t = tarinfo + if t.isfile(): + ftype = 'F' + elif t.isdir(): + ftype = 'D' + elif t.issym(): + ftype = 'L' + else: + ftype = '?' + user_id = self.user_dict.get(t.uname, None) # not uid + if user_id is None: + user_id = str(len(self.user_dict)) + self.user_dict[t.uname] = user_id + group_id = self.group_dict.get(t.gname, None) # not gid + if group_id is None: + group_id = str(len(self.group_dict)) + self.group_dict[t.gname] = group_id + fattr = FileAttr2(t.mode, int(t.mtime), user_id, group_id, + t.size, t.linkname, ftype) + self.fattr_dict[t.name] = fattr + + def close(self): + # id -> name + rev_user_dict = DBDict(self.db, StrObj, GfptarCommand.TABLE_USER) + rev_group_dict = DBDict(self.db, StrObj, GfptarCommand.TABLE_GROUP) + for k, v in self.user_dict.items(): + rev_user_dict[v] = k + for k, v in self.group_dict.items(): + rev_group_dict[v] = k + self.db.commit() + self.db.close() + + def compress_copy(self, db_path, bufsize, use_fsync): + db_path_tmp = db_path + '.tmp' + db_url_tmp = GfURL.init(db_path_tmp) + tmpdb_url = GfURL.init(self.db.filename) + if db_url_tmp.exists(): + db_url_tmp.remove() + # copy infodb from tmpdir to indir + with db_url_tmp.writeopen(use_fsync=use_fsync) as outf: + proc = Compress.compress(Compress.gzip_prog, outf) + with tmpdb_url.readopen() as inf: + shutil.copyfileobj(inf, proc.stdin, bufsize) + proc.stdin.close() + ret = proc.wait() + if ret != 0: + raise GfException('{}: returncode={}'.format( + ' '.join(proc.args), ret)) + + # atomic operation to avoid leaving junk files + db_url_tmp.rename(db_path) + + # ProcessPoolExecutor cannot serialize "self" object of GfptarCommand def gen_infodb_one(args): (lock, share_cancel, tmpdir, update, gen_num, id_num, @@ -4471,12 +4556,7 @@ def gen_infodb_one(args): # Local file tmpdb_path = os.path.join(tmpdir.name, str(id_num) + '_info.db') - tmpdb = DB(tmpdb_path) - fattr_dict = DBDict(tmpdb, FileAttr2, GfptarCommand.TABLE_ENTRY) - - user_dict = {} # use memory for 
speed - group_dict = {} # use memory for speed - + infodb = InfoDB(tmpdb_path) tar = GfTarFile.extract_open(tar_url, bufsize) db_close = False try: @@ -4510,61 +4590,16 @@ def gen_infodb_one(args): # f' {t.size:9d} {t.mtime} {name}') # logger.debug(info) # logger.debug(f'add to DB: {t.name}') - if t.isfile(): - ftype = 'F' - elif t.isdir(): - ftype = 'D' - elif t.issym(): - ftype = 'L' - else: - ftype = '?' - user_id = user_dict.get(t.uname, None) # not uid - if user_id is None: - user_id = str(len(user_dict)) - user_dict[t.uname] = user_id - group_id = group_dict.get(t.gname, None) # not gid - if group_id is None: - group_id = str(len(group_dict)) - group_dict[t.gname] = group_id - fattr = FileAttr2(t.mode, int(t.mtime), user_id, group_id, - t.size, t.linkname, ftype) - fattr_dict[t.name] = fattr + infodb.add(t) # success - # id -> name - rev_user_dict = DBDict(tmpdb, StrObj, GfptarCommand.TABLE_USER) - rev_group_dict = DBDict(tmpdb, StrObj, GfptarCommand.TABLE_GROUP) - for k, v in user_dict.items(): - rev_user_dict[v] = k - for k, v in group_dict.items(): - rev_group_dict[v] = k - tmpdb.commit() - tmpdb.close() + infodb.close() db_close = True - - db_path0 = indir_url.url_join(db_name + '.tmp') - db_url0 = GfURL.init(db_path0) - tmpdb_url = GfURL.init(tmpdb_path) - if db_url0.exists(): - db_url0.remove() - # copy infodb from tmpdir to indir - with db_url0.writeopen(use_fsync=use_fsync) as outf: - proc = Compress.compress(Compress.gzip_prog, outf) - with tmpdb_url.readopen() as inf: - shutil.copyfileobj(inf, proc.stdin, bufsize) - proc.stdin.close() - ret = proc.wait() - if ret != 0: - raise GfException('{}: returncode={}'.format( - ' '.join(proc.args), ret)) - - # atomic operation to avoid leaving junk files - db_url0.rename(db_path) + infodb.compress_copy(db_path, bufsize, use_fsync) finally: tar.close() if not db_close: - tmpdb.commit() - tmpdb.close() + infodb.db.close() progname = os.path.basename(__file__) From a4cd5c11e2686bb85e4e03b611be6acdd73ecc7d Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 26 Jul 2024 16:56:19 +0900 Subject: [PATCH 037/143] gfptar --list (simple): use gfptar*_info.db.gz --- gftool/gfptar/gfptar | 238 +++++++++++++++++++++++-------------------- 1 file changed, 126 insertions(+), 112 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index b355736d8..4b317dd2e 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2239,7 +2239,8 @@ class TestGfptar(unittest.TestCase): (None, None, 'abc:/def://')) def test_DBDict(self): - tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-test-', + euid = os.geteuid() + tmpdir = tempfile.TemporaryDirectory(prefix=f'gfptar-test-{euid}-', dir=None) testdb = DB(os.path.join(tmpdir.name, 'test.db')) d = DBDict(testdb, IntObj, 'test_dict') @@ -2280,7 +2281,8 @@ class TestGfptar(unittest.TestCase): d.unlink() def test_DBSet(self): - tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-test-', + euid = os.geteuid() + tmpdir = tempfile.TemporaryDirectory(prefix=f'gfptar-test-{euid}-', dir=None) testdb = DB(os.path.join(tmpdir.name, 'test.db')) s = DBSet(testdb, IntObj, 'test_set') @@ -2310,7 +2312,8 @@ class TestGfptar(unittest.TestCase): s.unlink() def test_DBList(self): - tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-test-', + euid = os.geteuid() + tmpdir = tempfile.TemporaryDirectory(prefix=f'gfptar-test-{euid}', dir=None) testdb = DB(os.path.join(tmpdir.name, 'test.db')) lst = DBList(testdb, IntObj, 'test_list') @@ -2376,7 +2379,13 @@ class GfptarCommand(Command): self.bufsize = 
self.opt['--bufsize'] self.progress_enabled = self._progress_enabled() self.use_fsync = not self.opt['--disable-fsync'] + self.workdir = self.opt['--workdir'] + euid = os.geteuid() + # Temporary files are removed when the process exits. + # dir=None: system default + self.tmpdir = tempfile.TemporaryDirectory(prefix=f'gfptar-{euid}-', + dir=self.workdir) progress_unit_type = self.opt['--progress-unit'] if progress_unit_type == 'si': @@ -3180,10 +3189,7 @@ class GfptarCommand(Command): return gfurl.listdir(recursive=True, first=True, hardlink_warn=self.hardlink_warn) - # Temporary files are removed when the process exits. - # dir=None: system default - tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-', - dir=self.workdir) + tmpdir = self.tmpdir tardb_prefix = os.path.join(tmpdir.name, 'list_for_create') tardb_fmt = f'_{self.SERIAL_FORMAT}.db' serial = 1 @@ -3568,11 +3574,11 @@ class GfptarCommand(Command): self.progress_for_create(now) elif result[0] == 'DONE': (status, tar_size, cannot_be_archived, - outurl, infodb_url) = result + out_tar_path, out_dbgz_path) = result with self.lock(): self.archived_size += tar_size - self.info(f'created(.tar): {outurl}') - self.info(f'created(.lst): {infodb_url}') + self.info(f'created(.tar): {out_tar_path}') + self.info(f'created(.db.gz): {out_dbgz_path}') break elif result[0] == 'ERR': (status, exc_type_name, exc_value_str, @@ -3669,6 +3675,7 @@ class GfptarCommand(Command): serial_str = f'{self.SERIAL_FORMAT}_' % serial if gen >= 2: + # SEE ALSO: list_infodb() prefix_str = f'g{gen}_{serial_str}' else: prefix_str = serial_str @@ -3695,7 +3702,8 @@ class GfptarCommand(Command): dummy_sleep=self.dummy_sleep) # to reduce memory usage # SEE ALSO: gen_infodb_one() - tmpdb_path = tardb1.filename + '_info.db' + db_name = InfoDB.infodb_filename(gen, serial) + tmpdb_path = os.path.join(self.tmpdir.name, db_name + '.tmp') infodb = InfoDB(tmpdb_path) # TODO remove @@ -3703,6 +3711,7 @@ class GfptarCommand(Command): filelist_ok = DBList(tardb_ok, GfURLEntry, 'filelist_ok') cannot_be_archived = 0 + db_close = False try: for entry in filelist: logger.debug(f'subprocess(serial={serial}): {entry.path}') @@ -3732,16 +3741,19 @@ class GfptarCommand(Command): cannot_be_archived += 1 logger.warning(convert_message(e)) continue + infodb.commit_close() + db_close = True finally: tar.close() tardb1.close() tardb1.unlink() - infodb.close() + if not db_close: + infodb.db.close() # SEE ALSO: gen_infodb_name() - db_name = f'g{gen}_{serial_str}info.db.gz' - db_url_str = self.outdir_url.url_join(db_name) - infodb.compress_copy(db_url_str, self.bufsize, self.use_fsync) + out_db_path = self.outdir_url.url_join(db_name) + InfoDB.compress_copy(infodb.db.filename, out_db_path, + self.bufsize, self.use_fsync) # for DEBUG # raise Exception('unexpected raise') @@ -3752,7 +3764,7 @@ class GfptarCommand(Command): tardb_ok.unlink() tar_size = outurl.size() - return tar_size, cannot_be_archived, outurl.url_str, db_url_str + return tar_size, cannot_be_archived, outurl.url_str, out_db_path # TODO remove def create_a_members_list(self, url, filelist, target_host): @@ -3801,10 +3813,7 @@ class GfptarCommand(Command): self.search_target = len(specified_members) > 0 - # Temporary files are removed when the process exits. 
- # dir=None: system default - tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-', - dir=self.workdir) + tmpdir = self.tmpdir db_file = os.path.join(tmpdir.name, 'extract.db') db_file_target = os.path.join(tmpdir.name, 'target.db') logger.debug(f'db_file={db_file}') @@ -3821,6 +3830,7 @@ class GfptarCommand(Command): self.sig_init() target_set, directory_set, member_set = self.extract_schedule_v3() self.extract_main(target_set, directory_set, member_set) + tmpdir.cleanup() def extract_schedule_v3(self): target_set = DBSet(self.db_target, StrObj, 'target_set') @@ -3838,13 +3848,14 @@ class GfptarCommand(Command): path = path.lstrip('/') member_check_dict[path] = False # initialize + tar_pattern = re.compile(self.PATT_TAR) for ent in self.indir_url.listdir(recursive=False): if self.is_canceled(): raise self.error_canceled() if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst, *.tar.lst continue base = os.path.basename(ent.path) - if not self.PATT_TAR.match(base): # ignore not *.tar.* + if not tar_pattern.match(base): # ignore not *.tar.* continue arch_url_str = ent.path list_file = arch_url_str + self.LIST_SUFFIX @@ -4274,106 +4285,85 @@ class GfptarCommand(Command): def list_simple(self, indir, quiet=False): self.options_init() indir_url = GfURL.init(indir) - filelistlist = [] - for ent in indir_url.listdir(recursive=False): - if not ent.path.endswith(self.LIST_SUFFIX): - continue - filelistlist.append(ent.path) - filelistlist.sort() - for filelist in filelistlist: - list_url = GfURL.init(filelist) - with list_url.readopen(textmode=True) as f: - while True: - line = f.readline() - if not line: - break - if not quiet: - print(line.rstrip()) - - def decompress_infodb(self, outdir, in_dbgz_path): - dbgz_url = GfURL.init(in_dbgz_path) - base = os.path.basename(in_dbgz_path) - db_path = os.path.join(outdir.name, base) - db_url = GfURL.init(db_path) - with dbgz_url.readopen() as inf: - proc = Compress.decompress(Compress.gzip_prog, inf) - with db_url.writeopen(use_fsync=self.use_fsync) as outf: - shutil.copyfileobj(proc.stdout, outf, self.bufsize) - proc.stdout.close() - ret = proc.wait() - if ret != 0: - raise GfException('{}: returncode={}'.format( - ' '.join(proc.args), ret)) - return db_url + self.gen_infodb_all(indir_url, self.tmpdir, update=False) - TABLE_ENTRY = 'entries' - TABLE_USER = 'users' - TABLE_GROUP = 'groups' + for path, fattr in self.list_infodb(indir_url, resolve_ugmap=False): + if not quiet: + print(f'{fattr.ftype} {path}') + self.tmpdir.cleanup() def list_verbose(self, indir, quiet=False): self.options_init() indir_url = GfURL.init(indir) - tmpdir = tempfile.TemporaryDirectory(prefix='gfptar-', - dir=self.workdir) - self.gen_infodb_all(indir_url, tmpdir, update=False) + self.gen_infodb_all(indir_url, self.tmpdir, update=False) - # TODO .db.gz - db_gen1_pattern = re.compile(r'^(\d+)_.+.db') + for path, fattr in self.list_infodb(indir_url, resolve_ugmap=True): + if not quiet: + print(f'{fattr.ftype} {fattr.mode:04o}' + f' {fattr.user:>10}/{fattr.group:<10}' + f' {fattr.size:9d} {fattr.mtime} {path}') + self.tmpdir.cleanup() + + TABLE_ENTRY = 'path_entry' + TABLE_USER = 'user_map' + TABLE_GROUP = 'group_map' + + def list_infodb(self, indir_url, resolve_ugmap=False): # divide by generations : gNN_ (NN >= 2) - db_gen_pattern = re.compile(r'^g(\d+)_(\d+)_.+.db') + db_gen_pattern = re.compile(InfoDB.PATT_INFODB) infodb_list = [] for ent in indir_url.listdir(recursive=False): path = ent.path # fullpath when ent is Gfarm base = os.path.basename(path) - g1_match = 
db_gen1_pattern.match(base) - if g1_match: - id_num = g1_match.group(1) + g_match = db_gen_pattern.match(base) + if g_match: + # int("0001") -> 1 + id_num = g_match.group(2) infodb_list.append((int(id_num), path)) - else: - g_match = db_gen_pattern.match(base) - if g_match: - id_num = g_match.group(2) - infodb_list.append((int(id_num), path)) def id_key(id_path): id_num, path = id_path return id_num infodb_list.sort(key=id_key) - for id_num, db_path in infodb_list: - tmpdb_url = self.decompress_infodb(tmpdir, db_path) + for id_num, in_dbgz_path in infodb_list: + base = os.path.basename(in_dbgz_path) + out_db_path = os.path.join(self.tmpdir.name, base + '.tmp.db') + tmpdb_url = InfoDB.decompress_copy(in_dbgz_path, out_db_path, + self.bufsize, self.use_fsync) db = DB(tmpdb_url.path) try: fattr_dict = DBDict(db, FileAttr2, self.TABLE_ENTRY) - user_dict = DBDict(db, StrObj, self.TABLE_USER) - group_dict = DBDict(db, StrObj, self.TABLE_GROUP) - user_dict_mem = {} - group_dict_mem = {} - # cache in memory - for k, v in user_dict.items(): - user_dict_mem[k] = v - for k, v in group_dict.items(): - group_dict_mem[k] = v - for path, t in fattr_dict.iterator(sort='ASC'): - # id -> name - user = user_dict_mem.get(t.user, '???') - group = group_dict_mem.get(t.group, '???') - if not quiet: - print(f'{t.ftype} {t.mode:04o}' - f' {user:>10}/{group:<10}' - f' {t.size:9d} {t.mtime} {path}') + if resolve_ugmap: + user_dict = DBDict(db, StrObj, self.TABLE_USER) + group_dict = DBDict(db, StrObj, self.TABLE_GROUP) + user_dict_mem = {} + group_dict_mem = {} + # cache in memory + for k, v in user_dict.items(): + user_dict_mem[k] = v + for k, v in group_dict.items(): + group_dict_mem[k] = v + for path, fattr in fattr_dict.iterator(sort='ASC'): + if resolve_ugmap: + # id -> name + fattr.user = user_dict_mem.get(fattr.user, '???') + fattr.group = group_dict_mem.get(fattr.group, '???') + yield path, fattr finally: db.close() # *.tar or *.tar.* - PATT_TAR = re.compile(r'.*\.tar(\.\w{1,5})?$') + PATT_TAR = r'.*\.tar(\.\w{1,5})?$' def gen_infodb_all(self, indir_url, tmpdir, update=False): - tar_pattern = self.PATT_TAR - gen1_pattern = re.compile(r'^(\d+)_.+') + PATT_TAR_GEN1 = r'^(\d+)_.+' + PATT_TAR_GEN = r'^g(\d+)_(\d+)_.+' + tar_pattern = re.compile(self.PATT_TAR) + gen1_pattern = re.compile(PATT_TAR_GEN1) # divide by generations : gNN_ (NN >= 2) - gen_pattern = re.compile(r'^g(\d+)_(\d+)_.+') + gen_pattern = re.compile(PATT_TAR_GEN) gen_dict_list = defaultdict(list) for ent in indir_url.listdir(recursive=False): @@ -4387,14 +4377,16 @@ class GfptarCommand(Command): if g1_match: # generation number = 1 gen_num = '1' + # ex. 0001 (str) id_num = g1_match.group(1) else: g_match = gen_pattern.match(base) if g_match: gen_num = g_match.group(1) + # ex. 0001 (str) id_num = g_match.group(2) else: - # skip irrelevant file + # ignore irrelevant file continue # gen -> tarlist gen_dict_list[gen_num].append((id_num, path)) @@ -4471,11 +4463,6 @@ def signal_init_for_gen_infodb(share_cancel): # logger.debug('signal_init_for_gen_infodb') -def gen_infodb_name(gen_num, id_num): - # ex. g2_0099_ - return f'g{gen_num}_{id_num}_info.db.gz' - - class InfoDB: def __init__(self, dbfile_path): self.db = DB(dbfile_path) @@ -4483,6 +4470,13 @@ class InfoDB: self.user_dict = {} # use memory for speed self.group_dict = {} # use memory for speed + PATT_INFODB = r'^gfptar(\d+)_(\d+)_.+.db.gz' + + @classmethod + def infodb_filename(cls, gen_num, id_num): + # ex. 
gfptar2_0099_info.db.gz + return f'gfptar{gen_num}_{int(id_num):04}_info.db.gz' + def add(self, tarinfo): t = tarinfo if t.isfile(): @@ -4505,7 +4499,7 @@ class InfoDB: t.size, t.linkname, ftype) self.fattr_dict[t.name] = fattr - def close(self): + def commit_close(self): # id -> name rev_user_dict = DBDict(self.db, StrObj, GfptarCommand.TABLE_USER) rev_group_dict = DBDict(self.db, StrObj, GfptarCommand.TABLE_GROUP) @@ -4516,16 +4510,16 @@ class InfoDB: self.db.commit() self.db.close() - def compress_copy(self, db_path, bufsize, use_fsync): - db_path_tmp = db_path + '.tmp' - db_url_tmp = GfURL.init(db_path_tmp) - tmpdb_url = GfURL.init(self.db.filename) - if db_url_tmp.exists(): - db_url_tmp.remove() - # copy infodb from tmpdir to indir - with db_url_tmp.writeopen(use_fsync=use_fsync) as outf: + @staticmethod + def compress_copy(in_db_path, out_dbgz_path, bufsize, use_fsync): + dbgz_path_tmp = out_dbgz_path + '.tmp' + dbgz_url_tmp = GfURL.init(dbgz_path_tmp) + db_url = GfURL.init(in_db_path) + if dbgz_url_tmp.exists(): + dbgz_url_tmp.remove() + with dbgz_url_tmp.writeopen(use_fsync=use_fsync) as outf: proc = Compress.compress(Compress.gzip_prog, outf) - with tmpdb_url.readopen() as inf: + with db_url.readopen() as inf: shutil.copyfileobj(inf, proc.stdin, bufsize) proc.stdin.close() ret = proc.wait() @@ -4534,7 +4528,26 @@ class InfoDB: ' '.join(proc.args), ret)) # atomic operation to avoid leaving junk files - db_url_tmp.rename(db_path) + dbgz_url_tmp.rename(out_dbgz_path) + logger.debug(f'created(.db.gz): {out_dbgz_path}') + + @staticmethod + def decompress_copy(in_dbgz_path, out_db_path, bufsize, use_fsync): + # out_db_path is stored in tempfile.TemporaryDirectory + dbgz_url = GfURL.init(in_dbgz_path) + db_url = GfURL.init(out_db_path) + if db_url.exists(): + db_url.remove() + with dbgz_url.readopen() as inf: + proc = Compress.decompress(Compress.gzip_prog, inf) + with db_url.writeopen(use_fsync=use_fsync) as outf: + shutil.copyfileobj(proc.stdout, outf, bufsize) + proc.stdout.close() + ret = proc.wait() + if ret != 0: + raise GfException('{}: returncode={}'.format( + ' '.join(proc.args), ret)) + return db_url # ProcessPoolExecutor cannot serialize "self" object of GfptarCommand @@ -4546,7 +4559,7 @@ def gen_infodb_one(args): return tar_url = GfURL.init(tar_path) indir_url = tar_url.parent - db_name = gen_infodb_name(gen_num, id_num) + db_name = InfoDB.infodb_filename(gen_num, id_num) db_path = indir_url.url_join(db_name) db_url = GfURL.init(db_path) if not update and db_url.exists(): @@ -4593,9 +4606,10 @@ def gen_infodb_one(args): infodb.add(t) # success - infodb.close() + infodb.commit_close() db_close = True - infodb.compress_copy(db_path, bufsize, use_fsync) + InfoDB.compress_copy(infodb.db.filename, db_path, bufsize, use_fsync) + # TODO progress finally: tar.close() if not db_close: From b765a870d20f8fb4ff46a9b2b23472b052cafd1f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 27 Jul 2024 02:47:25 +0900 Subject: [PATCH 038/143] gfptar: new option: --sync-db gfptar --extract: use gfptar*_info.db.gz --- gftool/gfptar/gfptar | 317 +++++++++++++++++++++++++++---------------- 1 file changed, 199 insertions(+), 118 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 4b317dd2e..b7077df98 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -545,14 +545,14 @@ class GfLogger(logging.getLoggerClass()): self._mylog(logging.WARNING, msg, *args) def error(self, msg, *args, **kwargs): - self._mylog(logging.ERROR, msg, *args) + 
self._mylog(logging.ERROR, 'Error: ' + msg, *args) def error_exit(self, exit_code, msg, *args): - self._mylog(logging.ERROR, msg, *args) + self._mylog(logging.ERROR, 'Error: ' + msg, *args) sys.exit(exit_code) # def fatal(self, msg, *args, **kwargs): - # self._mylog(logging.ERROR, msg, args) + # self._mylog(logging.ERROR, 'Error: ' + msg, args) # if 'exit_code' in kwargs: # raise GfException('exit_code={}'.format( # kwargs['exit_code'])) @@ -762,11 +762,12 @@ class Command(metaclass=abc.ABCMeta): class GfURLEntry(DBObj): - TYPE_FILE = 'FILE' - TYPE_DIR = 'DIR' - TYPE_SYMLINK = 'SYM' - TYPE_OTHER = 'OTHER' + TYPE_FILE = 'F' + TYPE_DIR = 'D' + TYPE_SYMLINK = 'S' + TYPE_OTHER = '?' + # TODO remove type_map = { TYPE_FILE: 1, TYPE_DIR: 2, @@ -2379,6 +2380,7 @@ class GfptarCommand(Command): self.bufsize = self.opt['--bufsize'] self.progress_enabled = self._progress_enabled() self.use_fsync = not self.opt['--disable-fsync'] + self.sync_infodb = self.opt['--sync-db'] self.workdir = self.opt['--workdir'] euid = os.geteuid() @@ -2537,13 +2539,14 @@ class GfptarCommand(Command): save_opt_jobs = self.opt['--jobs'] save_opt_type = self.opt['--type'] save_opt_compress_prog = self.opt['--use-compress-program'] + save_opt_syncdb = self.opt['--sync-db'] # create tar per one entry self.opt['--size'] = 0 if full: - pattern_jobs = [0, 10] + pattern_jobs = [0, 16] else: - pattern_jobs = [16] + pattern_jobs = [] # skip for jobs in pattern_jobs: self.opt['--jobs'] = jobs self.test_simple('jobs_' + str(jobs), use_all_files=True) @@ -2551,6 +2554,11 @@ class GfptarCommand(Command): # create one tar self.opt['--size'] = unhumanize_number('100M') + + self.opt['--sync-db'] = True + self.test_simple('syncdb') + self.opt['--sync-db'] = save_opt_syncdb + if full: pattern_type = [ 'gz', @@ -2587,7 +2595,6 @@ class GfptarCommand(Command): self.opt['--type'] = t self.opt['--use-compress-program'] = prog self.test_simple('compress_prog_' + prog) - # self.opt['--use-compress-program'] = save_opt_compress_prog self.opt['--size'] = save_opt_size self.opt['--jobs'] = save_opt_jobs @@ -3828,11 +3835,10 @@ class GfptarCommand(Command): self.next_time = self.start_time + 1 self.sig_init() - target_set, directory_set, member_set = self.extract_schedule_v3() - self.extract_main(target_set, directory_set, member_set) + self.extract_main() tmpdir.cleanup() - def extract_schedule_v3(self): + def extract_schedule_v4(self): target_set = DBSet(self.db_target, StrObj, 'target_set') directory_set = DBSet(self.db, StrObj, 'directory_set') member_set = DBSet(self.db, StrObj, 'member_set') @@ -3848,39 +3854,55 @@ class GfptarCommand(Command): path = path.lstrip('/') member_check_dict[path] = False # initialize - tar_pattern = re.compile(self.PATT_TAR) - for ent in self.indir_url.listdir(recursive=False): + tar_list, infodb_list = self.list_tar_and_infodb_files(self.indir_url) + infodb_list2 = self.check_and_sync_infodb(self.indir_url, + tar_list=tar_list) + if infodb_list2 is not None: + infodb_list = infodb_list2 + self.sort_tar_or_infodb_list(infodb_list) + + infodb_dict = {} + for id_num, gen_num, infodb_path in infodb_list: + infodb_dict[id_num] = infodb_path + + error_num = 0 + tar_dict = {} + for id_num, gen_num, tar_path in tar_list: + tar_dict[id_num] = tar_path + infodb = infodb_dict.get(id_num, None) + if infodb is None: + logger.error(f'lost *_info.db.gz tied to {tar_path}') + error_num += 1 + del infodb_dict + + if self.debug: + infodb_list3 = self.list_infodb_files(self.indir_url) + self.sort_tar_or_infodb_list(infodb_list3) + if 
infodb_list != infodb_list3: + raise AssertionError(f'infodb_list{str(infodb_list)} != ' + f'infodb_list3{str(infodb_list3)}') + + for id_num, gen_num, in_dbgz_path in infodb_list: if self.is_canceled(): raise self.error_canceled() - if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst, *.tar.lst + tar_path = tar_dict.get(id_num, None) + if tar_path is None: + logger.error(f'lost *.tar.* tied to {in_dbgz_path}') + error_num += 1 continue - base = os.path.basename(ent.path) - if not tar_pattern.match(base): # ignore not *.tar.* - continue - arch_url_str = ent.path - list_file = arch_url_str + self.LIST_SUFFIX - list_url = GfURL.init(list_file) - if not list_url.exists(): - raise self.error_not_a_gfptar_directory(list_file) - with list_url.readopen(textmode=True) as f: - while True: - line = f.readline() - if not line: - break - if self.is_canceled(): - raise self.error_canceled() - self.total_num += 1 - line = line.rstrip() - # ex. "D /path/to/dir" - file_type = line[:1] - path = line[2:].lstrip('/') - if self.search_target: - logger.debug(f'archive_dict[{path}]: {file_type}') - archive_dict[path] = file_type - else: - if file_type == 'D': - logger.debug(f'directory_set.add: {path}') - directory_set.add(path) # all directories + for path, fattr in self.list_entries_from_one( + in_dbgz_path, resolve_ugmap=False): + if self.is_canceled(): + raise self.error_canceled() + self.total_num += 1 + file_type = fattr.ftype + if self.search_target: + logger.debug(f'archive_dict[{path}]: {file_type}') + archive_dict[path] = file_type + else: + if file_type == InfoDB.TYPE_DIR: + logger.debug(f'directory_set.add: {path}') + directory_set.add(path) # all directories if self.search_target: is_target = False for member in member_check_dict.keys(): @@ -3892,7 +3914,7 @@ class GfptarCommand(Command): logger.debug(f'member_set.add: {member}') member_set.add(member) found = True - if file_type == 'D': + if file_type == InfoDB.TYPE_DIR: logger.debug(f'directory_set.add: {member}') directory_set.add(member) is_dir = True @@ -3909,7 +3931,7 @@ class GfptarCommand(Command): logger.debug(f'member_set.add: {path}') member_set.add(path) found = True - if file_type == 'D': + if file_type == InfoDB.TYPE_DIR: logger.debug(f'directory_set.add: {path}') directory_set.add(path) if found: @@ -3917,10 +3939,10 @@ class GfptarCommand(Command): is_target = True archive_dict.clear() # re-use for next tar file if is_target: - logger.debug(f'target_set.add: {arch_url_str}') - target_set.add(arch_url_str) # select this tar file + logger.debug(f'target_set.add: {tar_path}') + target_set.add(tar_path) # select this tar file else: - target_set.add(arch_url_str) # use all tar files + target_set.add(tar_path) # use all tar files if self.progress_enabled: now = time.time() if now >= self.next_time: @@ -3939,10 +3961,11 @@ class GfptarCommand(Command): ' in archive files: ' + member) member_check_dict.clear() archive_dict.clear() + return target_set, directory_set, member_set, error_num - return target_set, directory_set, member_set - - def extract_main(self, target_set, directory_set, member_set): + def extract_main(self): + (target_set, directory_set, + member_set, error_num) = self.extract_schedule_v4() self.outdir_url.create_new_dir() self.created_directory_set = DBSet(self.db, StrObj, @@ -3973,6 +3996,8 @@ class GfptarCommand(Command): self.db.unlink() self.db_target.close() self.db_target.unlink() + if error_num > 0: + raise GfptarError(f'Total errors encountered: {error_num}') def extract_directories(self, directory_set): 
logger.debug('extract_directories') @@ -4285,9 +4310,10 @@ class GfptarCommand(Command): def list_simple(self, indir, quiet=False): self.options_init() indir_url = GfURL.init(indir) - self.gen_infodb_all(indir_url, self.tmpdir, update=False) + infodb_list = self.check_and_sync_infodb(indir_url) - for path, fattr in self.list_infodb(indir_url, resolve_ugmap=False): + for path, fattr in self.list_entries_from_all( + indir_url, infodb_list=infodb_list, resolve_ugmap=True): if not quiet: print(f'{fattr.ftype} {path}') self.tmpdir.cleanup() @@ -4295,23 +4321,34 @@ class GfptarCommand(Command): def list_verbose(self, indir, quiet=False): self.options_init() indir_url = GfURL.init(indir) - self.gen_infodb_all(indir_url, self.tmpdir, update=False) + infodb_list = self.check_and_sync_infodb(indir_url) - for path, fattr in self.list_infodb(indir_url, resolve_ugmap=True): + for path, fattr in self.list_entries_from_all( + indir_url, infodb_list=infodb_list, resolve_ugmap=True): if not quiet: print(f'{fattr.ftype} {fattr.mode:04o}' f' {fattr.user:>10}/{fattr.group:<10}' f' {fattr.size:9d} {fattr.mtime} {path}') self.tmpdir.cleanup() - TABLE_ENTRY = 'path_entry' - TABLE_USER = 'user_map' - TABLE_GROUP = 'group_map' + def list_entries_from_all(self, indir_url, infodb_list=None, + resolve_ugmap=False): + if infodb_list is None: + infodb_list = self.list_infodb_files(indir_url) + self.sort_tar_or_infodb_list(infodb_list) + for id_num, gen_num, in_dbgz_path in infodb_list: + yield from self.list_entries_from_one(in_dbgz_path, resolve_ugmap) + + def sort_tar_or_infodb_list(self, tar_or_infodb_list): + def id_key(value): + id_num, gen_num, path = value + return id_num - def list_infodb(self, indir_url, resolve_ugmap=False): - # divide by generations : gNN_ (NN >= 2) - db_gen_pattern = re.compile(InfoDB.PATT_INFODB) + tar_or_infodb_list.sort(key=id_key) + return tar_or_infodb_list + def list_infodb_files(self, indir_url): + db_gen_pattern = re.compile(InfoDB.PATT_INFODB) infodb_list = [] for ent in indir_url.listdir(recursive=False): path = ent.path # fullpath when ent is Gfarm @@ -4319,68 +4356,50 @@ class GfptarCommand(Command): g_match = db_gen_pattern.match(base) if g_match: # int("0001") -> 1 + gen_num = g_match.group(1) id_num = g_match.group(2) - infodb_list.append((int(id_num), path)) + infodb_list.append((int(id_num), int(gen_num), path)) + return infodb_list - def id_key(id_path): - id_num, path = id_path - return id_num + def list_tar_and_infodb_files(self, indir_url): + return self.list_tar_files(indir_url, infodb=True) - infodb_list.sort(key=id_key) - for id_num, in_dbgz_path in infodb_list: - base = os.path.basename(in_dbgz_path) - out_db_path = os.path.join(self.tmpdir.name, base + '.tmp.db') - tmpdb_url = InfoDB.decompress_copy(in_dbgz_path, out_db_path, - self.bufsize, self.use_fsync) - db = DB(tmpdb_url.path) - try: - fattr_dict = DBDict(db, FileAttr2, self.TABLE_ENTRY) - if resolve_ugmap: - user_dict = DBDict(db, StrObj, self.TABLE_USER) - group_dict = DBDict(db, StrObj, self.TABLE_GROUP) - user_dict_mem = {} - group_dict_mem = {} - # cache in memory - for k, v in user_dict.items(): - user_dict_mem[k] = v - for k, v in group_dict.items(): - group_dict_mem[k] = v - for path, fattr in fattr_dict.iterator(sort='ASC'): - if resolve_ugmap: - # id -> name - fattr.user = user_dict_mem.get(fattr.user, '???') - fattr.group = group_dict_mem.get(fattr.group, '???') - yield path, fattr - finally: - db.close() - - # *.tar or *.tar.* - PATT_TAR = r'.*\.tar(\.\w{1,5})?$' - - def gen_infodb_all(self, 
indir_url, tmpdir, update=False): + def list_tar_files(self, indir_url, infodb=False): + # *.tar or *.tar.* + PATT_TAR = r'.*\.tar(\.\w{1,5})?$' PATT_TAR_GEN1 = r'^(\d+)_.+' PATT_TAR_GEN = r'^g(\d+)_(\d+)_.+' - tar_pattern = re.compile(self.PATT_TAR) - gen1_pattern = re.compile(PATT_TAR_GEN1) - # divide by generations : gNN_ (NN >= 2) - gen_pattern = re.compile(PATT_TAR_GEN) - gen_dict_list = defaultdict(list) - + tar_pattern = re.compile(PATT_TAR) + tar_gen1_pattern = re.compile(PATT_TAR_GEN1) + # divide by generations : gN_* (N >= 2) + tar_gen_pattern = re.compile(PATT_TAR_GEN) + tar_list = [] + if infodb: + db_gen_pattern = re.compile(InfoDB.PATT_INFODB) + infodb_list = [] for ent in indir_url.listdir(recursive=False): path = ent.path base = os.path.basename(path) + if infodb: + db_match = db_gen_pattern.match(base) + if db_match: + # int("0001") -> 1 + gen_num = db_match.group(1) + id_num = db_match.group(2) + infodb_list.append((int(id_num), int(gen_num), path)) + continue if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst, *.tar.lst continue if not tar_pattern.match(base): # ignore not *.tar.* continue - g1_match = gen1_pattern.match(base) + g1_match = tar_gen1_pattern.match(base) if g1_match: # generation number = 1 gen_num = '1' # ex. 0001 (str) id_num = g1_match.group(1) else: - g_match = gen_pattern.match(base) + g_match = tar_gen_pattern.match(base) if g_match: gen_num = g_match.group(1) # ex. 0001 (str) @@ -4388,8 +4407,53 @@ class GfptarCommand(Command): else: # ignore irrelevant file continue - # gen -> tarlist - gen_dict_list[gen_num].append((id_num, path)) + tar_list.append((int(id_num), int(gen_num), path)) + if infodb: + return tar_list, infodb_list + else: + return tar_list + + # TODO InfoDB + def list_entries_from_one(self, in_dbgz_path, resolve_ugmap=False): + base = os.path.basename(in_dbgz_path) + out_db_path = os.path.join(self.tmpdir.name, base + '.tmp.db') + tmpdb_url = InfoDB.decompress_copy(in_dbgz_path, out_db_path, + self.bufsize, self.use_fsync) + db = DB(tmpdb_url.path) + try: + fattr_dict = DBDict(db, FileAttr2, InfoDB.TABLE_ENTRY) + if resolve_ugmap: + user_dict = DBDict(db, StrObj, InfoDB.TABLE_USER) + group_dict = DBDict(db, StrObj, InfoDB.TABLE_GROUP) + user_dict_mem = {} + group_dict_mem = {} + # cache in memory + for k, v in user_dict.items(): + user_dict_mem[k] = v + for k, v in group_dict.items(): + group_dict_mem[k] = v + for path, fattr in fattr_dict.iterator(sort='ASC'): + if resolve_ugmap: + # id -> name + fattr.user = user_dict_mem.get(fattr.user, '???') + fattr.group = group_dict_mem.get(fattr.group, '???') + yield path, fattr + finally: + db.close() + + def check_and_sync_infodb(self, indir_url, tar_list=None): + if not self.sync_infodb: + logger.debug('check_and_sync_infodb: disable') + return None + logger.debug('check_and_sync_infodb: enable') + if tar_list is None: + tar_list = self.list_tar_files(indir_url) + update = True + + # generation -> tar files + gen_to_tar_list = defaultdict(list) + for id_num, gen_num, path in tar_list: + gen_to_tar_list[gen_num].append((id_num, path)) if self.jobs >= 1: max_workers = self.jobs @@ -4412,15 +4476,14 @@ class GfptarCommand(Command): signal.signal(signal.SIGTERM, sig_handler) signal.signal(signal.SIGPIPE, signal.SIG_IGN) - for gen_num, tarlist in gen_dict_list.items(): + for gen_num, tarlist in gen_to_tar_list.items(): with concurrent.futures.ProcessPoolExecutor( max_workers=max_workers) as executor: arglist = [] for id_num, tar_path in tarlist: - arglist.append((lock, share_cancel, tmpdir, + 
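# (check_and_sync_infodb() submits one gen_infodb_one job per tar file to a
# ProcessPoolExecutor and later collects the results.  Below is a minimal,
# stand-alone sketch of that submit-and-collect pattern with simplified
# error handling; the names are illustrative, not gfptar APIs.)
from concurrent.futures import ProcessPoolExecutor, as_completed

def build_index(tar_path):
    # placeholder for per-tar work such as generating an info DB
    return f'{tar_path}: ok'

if __name__ == '__main__':
    tar_files = ['0001.tar.gz', '0002.tar.gz', '0003.tar.gz']
    first_error = None
    with ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(build_index, t) for t in tar_files]
        for future in as_completed(futures):
            try:
                print(future.result())
            except Exception as e:
                if first_error is None:
                    first_error = e  # remember the first failure, keep going
    if first_error is not None:
        raise first_error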
arglist.append((lock, share_cancel, self.tmpdir, update, gen_num, id_num, tar_path, self.bufsize, self.use_fsync)) - # executor.map(gen_infodb_one, arglist) futures = [executor.submit(gen_infodb_one, arg) for arg in arglist] @@ -4441,6 +4504,14 @@ class GfptarCommand(Command): raise save_e if cancel: raise self.error_canceled() + del gen_to_tar_list + infodb_list = [] + for id_num, gen_num, tar_path in tar_list: + fname = InfoDB.infodb_filename(gen_num, id_num) + dname = os.path.dirname(tar_path) + infodb_path = os.path.join(dname, fname) + infodb_list.append((id_num, gen_num, infodb_path)) + return infodb_list signal_initialized = False @@ -4464,13 +4535,21 @@ def signal_init_for_gen_infodb(share_cancel): class InfoDB: + TYPE_FILE = 'F' + TYPE_DIR = 'D' + TYPE_SYMLINK = 'S' + TYPE_OTHER = '?' + TABLE_ENTRY = 'path_entry' + TABLE_USER = 'user_map' + TABLE_GROUP = 'group_map' + def __init__(self, dbfile_path): self.db = DB(dbfile_path) - self.fattr_dict = DBDict(self.db, FileAttr2, GfptarCommand.TABLE_ENTRY) + self.fattr_dict = DBDict(self.db, FileAttr2, self.TABLE_ENTRY) self.user_dict = {} # use memory for speed self.group_dict = {} # use memory for speed - PATT_INFODB = r'^gfptar(\d+)_(\d+)_.+.db.gz' + PATT_INFODB = r'^gfptar(\d+)_(\d+)_.+.db.gz$' @classmethod def infodb_filename(cls, gen_num, id_num): @@ -4480,13 +4559,13 @@ class InfoDB: def add(self, tarinfo): t = tarinfo if t.isfile(): - ftype = 'F' + ftype = self.TYPE_FILE elif t.isdir(): - ftype = 'D' + ftype = self.TYPE_DIR elif t.issym(): - ftype = 'L' + ftype = self.TYPE_SYMLINK else: - ftype = '?' + ftype = self.TYPE_OTHER user_id = self.user_dict.get(t.uname, None) # not uid if user_id is None: user_id = str(len(self.user_dict)) @@ -4501,8 +4580,8 @@ class InfoDB: def commit_close(self): # id -> name - rev_user_dict = DBDict(self.db, StrObj, GfptarCommand.TABLE_USER) - rev_group_dict = DBDict(self.db, StrObj, GfptarCommand.TABLE_GROUP) + rev_user_dict = DBDict(self.db, StrObj, self.TABLE_USER) + rev_group_dict = DBDict(self.db, StrObj, self.TABLE_GROUP) for k, v in self.user_dict.items(): rev_user_dict[v] = k for k, v in self.group_dict.items(): @@ -4689,6 +4768,7 @@ Options: --gzip-program=COMMAND gzip command (ex. pigz) [default: gzip] --bzip2-program=COMMAND bzip2 command (ex. 
pbzip2) [default: bzip2] --xz-program=COMMAND xz command [default: xz] + --sync-db regenerate gfptar*_info.db.gz --same-owner extract files with the same ownership (for euid=0 on local, or gfarmroot on Gfarm) --disable-gfarm-command disable the use of gfreg and gfexport @@ -4756,6 +4836,7 @@ _schema = Schema({ '--disable-gfarm-command': bool, '--disable-fsync': bool, '--gfsched-interval': Use(int), + '--sync-db': bool, '--same-owner': bool, '--workdir': Or(str, None), '--max-entries-per-tar': Use(unhumanize_number), From c4faeb2d58c0388ae583985b75e89e7859f63916 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 27 Jul 2024 05:18:22 +0900 Subject: [PATCH 039/143] gfptar: remove unnecessary temporary files --- gftool/gfptar/gfptar | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index b7077df98..79e318dd5 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4590,7 +4590,8 @@ class InfoDB: self.db.close() @staticmethod - def compress_copy(in_db_path, out_dbgz_path, bufsize, use_fsync): + def compress_copy(in_db_path, out_dbgz_path, bufsize, use_fsync, + move=True): dbgz_path_tmp = out_dbgz_path + '.tmp' dbgz_url_tmp = GfURL.init(dbgz_path_tmp) db_url = GfURL.init(in_db_path) @@ -4609,6 +4610,8 @@ class InfoDB: # atomic operation to avoid leaving junk files dbgz_url_tmp.rename(out_dbgz_path) logger.debug(f'created(.db.gz): {out_dbgz_path}') + if move: + db_url.remove() @staticmethod def decompress_copy(in_dbgz_path, out_db_path, bufsize, use_fsync): From dc17dcc6379076a56851b767aefd814ca9d0f8a9 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 27 Jul 2024 13:28:19 +0900 Subject: [PATCH 040/143] gfptar: *.lst is no longer created gfptar: rename *.db.gz files: ex. g1_0001_gfptar.db.gz --- gftool/gfptar/gfptar | 157 +++++++++++++++++++++---------------------- 1 file changed, 75 insertions(+), 82 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 79e318dd5..a965ce4db 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2362,7 +2362,7 @@ class CannotBeArchivedError(GfptarError): class GfptarCommand(Command): - LIST_SUFFIX = '.lst' + LIST_SUFFIX = '.lst' # to ignore old files SERIAL_FORMAT = '%04d' def __init__(self, name): @@ -3108,6 +3108,8 @@ class GfptarCommand(Command): for j in range(remainder - dir_num): yield rand_file(dir_ent.path, j) + TABLE_tar_entry = 'tar_entry' + def create(self, outdir, basedir, infiles): logger.debug(f'create start: outdir={outdir}, basedir={basedir}') self.options_init() @@ -3202,10 +3204,10 @@ class GfptarCommand(Command): serial = 1 gen = 1 # TODO # to reduce memory usage - tarlist_db = DB(tardb_prefix + tardb_fmt % serial) - tarlist = DBList(tarlist_db, GfURLEntry, 'tarlist') - tarlist_num = 0 - tarlist_size = 0 + filelist_db = DB(tardb_prefix + tardb_fmt % serial) + filelist = DBList(filelist_db, GfURLEntry, self.TABLE_tar_entry) + filelist_num = 0 + filelist_size = 0 for infile in infiles_checked: if self.is_canceled(): @@ -3227,28 +3229,29 @@ class GfptarCommand(Command): self.total_size += this_size self.total_num += 1 - if tarlist_num > 0 \ - and (tarlist_size + this_size > self.split_size - or tarlist_num + 1 > self.max_entries_per_tar): + if filelist_num > 0 \ + and (filelist_size + this_size > self.split_size + or filelist_num + 1 > self.max_entries_per_tar): try: - tarlist.commit() - tarlist.close() + filelist.commit() + filelist.close() # DO NOT share a DBList with children, # share the filename instead. 
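# (A stand-alone sketch of the archive-splitting rule used just above: a new
# tar is started when adding the next entry would exceed either the assumed
# size per archive or the maximum number of entries per archive.  Function
# and variable names here are illustrative, not gfptar APIs.)
def split_into_archives(entries, split_size, max_entries_per_tar):
    # entries: iterable of (path, size); yields lists of paths per archive
    group, group_size = [], 0
    for path, size in entries:
        if group and (group_size + size > split_size
                      or len(group) + 1 > max_entries_per_tar):
            yield group          # close the current archive
            group, group_size = [], 0
        group.append(path)
        group_size += size
    if group:
        yield group              # last, possibly smaller, archive

# example: 200-byte limit, at most 3 entries per archive
files = [('a', 120), ('b', 100), ('c', 50), ('d', 10), ('e', 10), ('f', 10)]
print(list(split_into_archives(files, 200, 3)))
# -> [['a'], ['b', 'c', 'd'], ['e', 'f']]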
self.create_job_execute(gen, serial, - tarlist.filename()) + filelist.filename()) except Exception as e1: self.cancel() if has_error is None: has_error = e1 serial += 1 - tarlist_db = DB(tardb_prefix + tardb_fmt % serial) - tarlist = DBList(tarlist_db, GfURLEntry, 'tarlist') - tarlist_num = 0 - tarlist_size = 0 - tarlist.append(entry) - tarlist_num += 1 - tarlist_size += this_size + filelist_db = DB(tardb_prefix + tardb_fmt % serial) + filelist = DBList(filelist_db, GfURLEntry, + self.TABLE_tar_entry) + filelist_num = 0 + filelist_size = 0 + filelist.append(entry) + filelist_num += 1 + filelist_size += this_size # progress for listing before starting threads if serial == 1 and self.progress_enabled: @@ -3262,9 +3265,9 @@ class GfptarCommand(Command): except MemoryError as e2: self.cancel() has_error = e2 - tarlist = [] - tarlist_num = 0 - tarlist_size = 0 + filelist = [] + filelist_num = 0 + filelist_size = 0 break # from loop of infiles_checked except Exception as e2: cannot_read_dir += 1 @@ -3276,11 +3279,11 @@ class GfptarCommand(Command): if has_error is not None: self.cancel() - if tarlist_num > 0: + if filelist_num > 0: try: - tarlist.commit() - tarlist.close() - self.create_job_execute(gen, serial, tarlist.filename()) + filelist.commit() + filelist.close() + self.create_job_execute(gen, serial, filelist.filename()) except Exception as e: self.cancel() if has_error is None: @@ -3652,12 +3655,12 @@ class GfptarCommand(Command): def create_a_tar_process1(self, input_queue, output_queue, gen, serial, dbfile): logger.debug(f'create_a_tar_process1: start (serial={serial})') - tardb1 = DB(dbfile) - filelist = DBList(tardb1, GfURLEntry, 'tarlist') + tardb = DB(dbfile) + filelist = DBList(tardb, GfURLEntry, self.TABLE_tar_entry) if len(filelist) == 0: - tardb1.close() - tardb1.unlink() + tardb.close() + tardb.unlink() logger.warning(f'empty filelist: {dbfile}') return 0, 0, '', '' first = None @@ -3713,10 +3716,6 @@ class GfptarCommand(Command): tmpdb_path = os.path.join(self.tmpdir.name, db_name + '.tmp') infodb = InfoDB(tmpdb_path) - # TODO remove - tardb_ok = DB(tardb1.filename + '_ok.db') - filelist_ok = DBList(tardb_ok, GfURLEntry, 'filelist_ok') - cannot_be_archived = 0 db_close = False try: @@ -3738,7 +3737,6 @@ class GfptarCommand(Command): logger.debug(f'tar.add_entry: {subpath}') tarinfo = tar.add_entry(subpath, entry) infodb.add(tarinfo) - filelist_ok.append(entry) # TODO remove size_all = entry.size_all() output_queue.put(('ADD', subpath, size_all)) except MemoryError: @@ -3752,8 +3750,8 @@ class GfptarCommand(Command): db_close = True finally: tar.close() - tardb1.close() - tardb1.unlink() + tardb.close() + tardb.unlink() if not db_close: infodb.db.close() @@ -3765,34 +3763,9 @@ class GfptarCommand(Command): # for DEBUG # raise Exception('unexpected raise') - # TODO remove - self.create_a_members_list(outurl, filelist_ok, target_host) - tardb_ok.close() - tardb_ok.unlink() - tar_size = outurl.size() return tar_size, cannot_be_archived, outurl.url_str, out_db_path - # TODO remove - def create_a_members_list(self, url, filelist, target_host): - outurl = GfURL.init(url.url_str + self.LIST_SUFFIX) - with outurl.writeopen(textmode=True, use_fsync=self.use_fsync, - hostname=target_host) as f: - for entry in filelist: - # ex. "D /path/to/dir" - if entry.is_file(): - f.write('F ') - elif entry.is_directory(): - f.write('D ') - elif entry.is_symlink(): - f.write('S ') - else: # unknown - f.write('? 
') - subpath = entry.subpath(self.basedir_url) - f.write(subpath) - f.write('\n') - return outurl - def error_canceled(self): return GfptarError('Canceled') @@ -3838,13 +3811,25 @@ class GfptarCommand(Command): self.extract_main() tmpdir.cleanup() + TABLE_target_set = 'target_set' + TABLE_directory_set = 'directory_set' + TABLE_member_set = 'member_set' + TABLE_archive_dict = 'archive_dict' + TABLE_member_check_dict = 'member_check_dict' + TABLE_created_directory_set = 'created_directory_set' + TABLE_dirstat_dict = 'dirstat_dict' + def extract_schedule_v4(self): - target_set = DBSet(self.db_target, StrObj, 'target_set') - directory_set = DBSet(self.db, StrObj, 'directory_set') - member_set = DBSet(self.db, StrObj, 'member_set') + target_set = DBSet(self.db_target, StrObj, self.TABLE_target_set) + directory_set = DBSet(self.db, StrObj, self.TABLE_directory_set) + member_set = DBSet(self.db, StrObj, self.TABLE_member_set) + archive_dict = DBDict(self.db, JsonObj, self.TABLE_archive_dict) + member_check_dict = DBDict(self.db, JsonObj, + self.TABLE_member_check_dict) - archive_dict = DBDict(self.db, JsonObj, 'archive_dict') - member_check_dict = DBDict(self.db, JsonObj, 'member_check_dict') + self.created_directory_set = DBSet(self.db, StrObj, + self.TABLE_created_directory_set) + self.dirstat_dict = DBDict(self.db, FileAttr1, self.TABLE_dirstat_dict) if self.search_target: for member in self.specified_members: @@ -3968,16 +3953,13 @@ class GfptarCommand(Command): member_set, error_num) = self.extract_schedule_v4() self.outdir_url.create_new_dir() - self.created_directory_set = DBSet(self.db, StrObj, - 'created_directory_set') - # self.extract_directories(directory_set) + # self.extract_directories(directory_set) # slow self.extract_directories_fast(directory_set) self.extracted_num = 0 self.extracted_size = 0 self.start_time = time.time() self.next_time = self.start_time + 1 - self.dirstat_dict = DBDict(self.db, FileAttr1, 'dirstat_dict') self.gfsched_lock = None self.gfsched_next = 0 @@ -4434,7 +4416,7 @@ class GfptarCommand(Command): group_dict_mem[k] = v for path, fattr in fattr_dict.iterator(sort='ASC'): if resolve_ugmap: - # id -> name + # unique id -> name fattr.user = user_dict_mem.get(fattr.user, '???') fattr.group = group_dict_mem.get(fattr.group, '???') yield path, fattr @@ -4549,12 +4531,14 @@ class InfoDB: self.user_dict = {} # use memory for speed self.group_dict = {} # use memory for speed - PATT_INFODB = r'^gfptar(\d+)_(\d+)_.+.db.gz$' + # SEE ALSO: infodb_filename + PATT_INFODB = r'^g(\d+)_(\d+)_gfptar.db.gz$' @classmethod def infodb_filename(cls, gen_num, id_num): - # ex. gfptar2_0099_info.db.gz - return f'gfptar{gen_num}_{int(id_num):04}_info.db.gz' + # SEE ALSO: PATT_INFODB + # ex. g2_0099_gfptar.db.gz + return f'g{gen_num}_{int(id_num):04}_gfptar.db.gz' def add(self, tarinfo): t = tarinfo @@ -4579,7 +4563,7 @@ class InfoDB: self.fattr_dict[t.name] = fattr def commit_close(self): - # id -> name + # unique id -> name rev_user_dict = DBDict(self.db, StrObj, self.TABLE_USER) rev_group_dict = DBDict(self.db, StrObj, self.TABLE_GROUP) for k, v in self.user_dict.items(): @@ -4707,20 +4691,29 @@ gfptar - archive files in parallel Example of --create (Gfarm to Gfarm): Command line: gfptar -c gfarm:/home/user1/out -C gfarm:/home/user1 ./dir - Input files: + Input files (any files): gfarm:/home/user1/dir/test0000.data ... 
gfarm:/home/user1/dir/test9999.data Output files: gfarm:/home/user1/out/0001_dir_test0000.data..dir_test0999.data.tar.gz - gfarm:/home/user1/out/0001_dir_test0000.data..dir_test0999.data.tar.gz.lst + gfarm:/home/user1/out/g1_0001_gfptar.db.gz ... gfarm:/home/user1/out/0010_dir_test9000.data..dir_test9999.data.tar.gz - gfarm:/home/user1/out/0010_dir_test9000.data..di1_test9999.data.tar.gz.lst - Contents of list file (*.lst): - F dir/test0000.data - ... - F dir/test0999.data + gfarm:/home/user1/out/g1_0010_gfptar.db.gz + +Contents of gMM_NN_gfptar.db.gz file (sqlite3 and gzip): + MM: the generation number for each append operation + NN: the serial number + table 'path_entry': map of path name to JSON string + json.dumps([ int(file_mode), int(mtime), + int(user_unique_id), int(group_unique_id), + int(size), symlink_path, file_type(D,F,S) ] + file_type 'D': directory + file_type 'F': file + file_type 'S': symbolic link + table 'user_map' : map of unique id (not uid) to user name + table 'group_map': map of unique id (not gid) to group name Example of --extract (Gfarm to Local): Command line: From 8f1bfdb334491f38fcccd27bd79ef698388945e0 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 27 Jul 2024 14:39:33 +0900 Subject: [PATCH 041/143] gfptar: refactoring (no functional change) --- gftool/gfptar/gfptar | 239 ++++++++++++++++++++++--------------------- 1 file changed, 120 insertions(+), 119 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index a965ce4db..23e41f36b 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -767,7 +767,6 @@ class GfURLEntry(DBObj): TYPE_SYMLINK = 'S' TYPE_OTHER = '?' - # TODO remove type_map = { TYPE_FILE: 1, TYPE_DIR: 2, @@ -798,7 +797,6 @@ class GfURLEntry(DBObj): return (f'Entry(path={self.path},mode={oct(self.mode)},' f'user={self.uname},group={self.gname})') - # TODO remove # only path must be specified for key when using DBDict @classmethod def dumps(cls, obj, for_dict): @@ -812,7 +810,6 @@ class GfURLEntry(DBObj): array.append(obj.path) # [7] return json.dumps(array, separators=(',', ':')) - # TODO remove @classmethod def loads(cls, key, txt, for_dict): o = json.loads(txt) @@ -3711,7 +3708,7 @@ class GfptarCommand(Command): dummy_input=self.dummy_input, dummy_sleep=self.dummy_sleep) # to reduce memory usage - # SEE ALSO: gen_infodb_one() + # SEE ALSO: InfoDB.generate_one() db_name = InfoDB.infodb_filename(gen, serial) tmpdb_path = os.path.join(self.tmpdir.name, db_name + '.tmp') infodb = InfoDB(tmpdb_path) @@ -3735,6 +3732,7 @@ class GfptarCommand(Command): subpath = entry.subpath(self.basedir_url) try: logger.debug(f'tar.add_entry: {subpath}') + # GfTarFile.METHOD_add_entry tarinfo = tar.add_entry(subpath, entry) infodb.add(tarinfo) size_all = entry.size_all() @@ -3755,7 +3753,6 @@ class GfptarCommand(Command): if not db_close: infodb.db.close() - # SEE ALSO: gen_infodb_name() out_db_path = self.outdir_url.url_join(db_name) InfoDB.compress_copy(infodb.db.filename, out_db_path, self.bufsize, self.use_fsync) @@ -3875,8 +3872,9 @@ class GfptarCommand(Command): logger.error(f'lost *.tar.* tied to {in_dbgz_path}') error_num += 1 continue - for path, fattr in self.list_entries_from_one( - in_dbgz_path, resolve_ugmap=False): + for path, fattr in InfoDB.list_entries_from_one( + in_dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, + resolve_ugmap=False): if self.is_canceled(): raise self.error_canceled() self.total_num += 1 @@ -4319,7 +4317,9 @@ class GfptarCommand(Command): infodb_list = self.list_infodb_files(indir_url) 
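
The help text above describes each g<MM>_<NN>_gfptar.db.gz as a gzip-compressed SQLite file with a path_entry table mapping each path to a JSON array, plus user_map and group_map tables for the unique-id-to-name mappings. A rough reader for that layout could look like the sketch below; the key/value column names, the lookup key types, and the cleanup handling are assumptions for illustration, not taken from the gfptar source:

    import gzip, json, os, shutil, sqlite3, tempfile

    def read_infodb(dbgz_path):
        # sqlite3 cannot read gzip directly, so decompress to a temporary file first
        fd, tmp_path = tempfile.mkstemp(suffix='.db')
        with os.fdopen(fd, 'wb') as tmp, gzip.open(dbgz_path, 'rb') as src:
            shutil.copyfileobj(src, tmp)
        con = sqlite3.connect(tmp_path)
        try:
            users = dict(con.execute('SELECT key, value FROM user_map'))
            groups = dict(con.execute('SELECT key, value FROM group_map'))
            for path, value in con.execute('SELECT key, value FROM path_entry'):
                # field order as described in the help text above
                mode, mtime, uid, gid, size, linkname, ftype = json.loads(value)
                yield (ftype, oct(mode), users.get(str(uid), '???'),
                       groups.get(str(gid), '???'), size, path)
        finally:
            con.close()
            os.remove(tmp_path)

    # usage: for row in read_infodb('g1_0001_gfptar.db.gz'): print(*row)
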
self.sort_tar_or_infodb_list(infodb_list) for id_num, gen_num, in_dbgz_path in infodb_list: - yield from self.list_entries_from_one(in_dbgz_path, resolve_ugmap) + yield from InfoDB.list_entries_from_one( + in_dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, + resolve_ugmap) def sort_tar_or_infodb_list(self, tar_or_infodb_list): def id_key(value): @@ -4395,34 +4395,6 @@ class GfptarCommand(Command): else: return tar_list - # TODO InfoDB - def list_entries_from_one(self, in_dbgz_path, resolve_ugmap=False): - base = os.path.basename(in_dbgz_path) - out_db_path = os.path.join(self.tmpdir.name, base + '.tmp.db') - tmpdb_url = InfoDB.decompress_copy(in_dbgz_path, out_db_path, - self.bufsize, self.use_fsync) - db = DB(tmpdb_url.path) - try: - fattr_dict = DBDict(db, FileAttr2, InfoDB.TABLE_ENTRY) - if resolve_ugmap: - user_dict = DBDict(db, StrObj, InfoDB.TABLE_USER) - group_dict = DBDict(db, StrObj, InfoDB.TABLE_GROUP) - user_dict_mem = {} - group_dict_mem = {} - # cache in memory - for k, v in user_dict.items(): - user_dict_mem[k] = v - for k, v in group_dict.items(): - group_dict_mem[k] = v - for path, fattr in fattr_dict.iterator(sort='ASC'): - if resolve_ugmap: - # unique id -> name - fattr.user = user_dict_mem.get(fattr.user, '???') - fattr.group = group_dict_mem.get(fattr.group, '???') - yield path, fattr - finally: - db.close() - def check_and_sync_infodb(self, indir_url, tar_list=None): if not self.sync_infodb: logger.debug('check_and_sync_infodb: disable') @@ -4466,7 +4438,10 @@ class GfptarCommand(Command): arglist.append((lock, share_cancel, self.tmpdir, update, gen_num, id_num, tar_path, self.bufsize, self.use_fsync)) - futures = [executor.submit(gen_infodb_one, arg) + # InfoDB.generate_one is staticmethod, + # because ProcessPoolExecutor cannot serialize + # members of "self" object. + futures = [executor.submit(InfoDB.generate_one, arg) for arg in arglist] for future in concurrent.futures.as_completed(futures): @@ -4496,26 +4471,6 @@ class GfptarCommand(Command): return infodb_list -signal_initialized = False - - -def signal_init_for_gen_infodb(share_cancel): - global signal_initialized - - if not signal_initialized: - def sig_handler(signum, frame): - if share_cancel.value != 0: - logger.warning(f'Interrupt (signal={signum}') - share_cancel.value = 1 - - signal.signal(signal.SIGHUP, sig_handler) - signal.signal(signal.SIGINT, sig_handler) - signal.signal(signal.SIGTERM, sig_handler) - signal.signal(signal.SIGPIPE, signal.SIG_IGN) - signal_initialized = True - # logger.debug('signal_init_for_gen_infodb') - - class InfoDB: TYPE_FILE = 'F' TYPE_DIR = 'D' @@ -4615,71 +4570,117 @@ class InfoDB: ' '.join(proc.args), ret)) return db_url + signal_initialized = False -# ProcessPoolExecutor cannot serialize "self" object of GfptarCommand -def gen_infodb_one(args): - (lock, share_cancel, tmpdir, update, gen_num, id_num, - tar_path, bufsize, use_fsync) = args - if share_cancel.value != 0: - # logger.debug('Canceled (2)') - return - tar_url = GfURL.init(tar_path) - indir_url = tar_url.parent - db_name = InfoDB.infodb_filename(gen_num, id_num) - db_path = indir_url.url_join(db_name) - db_url = GfURL.init(db_path) - if not update and db_url.exists(): - logger.debug(f'not update: {db_path}') - return - signal_init_for_gen_infodb(share_cancel) - - # Local file - tmpdb_path = os.path.join(tmpdir.name, str(id_num) + '_info.db') - infodb = InfoDB(tmpdb_path) - tar = GfTarFile.extract_open(tar_url, bufsize) - db_close = False - try: - interval = 1 # sec. 
| for interrupt - next_check = time.time() + interval - while True: - now = time.time() - if now >= next_check: - # access manager.Value(): very high cost + @classmethod + def signal_init(cls, share_cancel): + if not cls.signal_initialized: + def sig_handler(signum, frame): if share_cancel.value != 0: - logger.info('Canceled') + logger.warning(f'Interrupt (signal={signum}') + share_cancel.value = 1 + + signal.signal(signal.SIGHUP, sig_handler) + signal.signal(signal.SIGINT, sig_handler) + signal.signal(signal.SIGTERM, sig_handler) + signal.signal(signal.SIGPIPE, signal.SIG_IGN) + cls.signal_initialized = True + logger.debug('InfoDB.signal_init') + + @staticmethod + def generate_one(args): + (lock, share_cancel, tmpdir, update, gen_num, id_num, + tar_path, bufsize, use_fsync) = args + if share_cancel.value != 0: + # logger.debug('Canceled (2)') + return + tar_url = GfURL.init(tar_path) + indir_url = tar_url.parent + db_name = InfoDB.infodb_filename(gen_num, id_num) + db_path = indir_url.url_join(db_name) + db_url = GfURL.init(db_path) + if not update and db_url.exists(): + logger.debug(f'not update: {db_path}') + return + InfoDB.signal_init(share_cancel) + + # Local file + tmpdb_path = os.path.join(tmpdir.name, str(id_num) + '_info.db') + infodb = InfoDB(tmpdb_path) + tar = GfTarFile.extract_open(tar_url, bufsize) + db_close = False + try: + interval = 1 # sec. | for interrupt + next_check = time.time() + interval + while True: + now = time.time() + if now >= next_check: + # access manager.Value(): very high cost + if share_cancel.value != 0: + logger.info('Canceled') + break + next_check = now + interval + try: + t = tar.next() + except MemoryError: + raise + except Exception as e: + logger.warning(f'{tar_path}: SKIPPED:' + f' invalid or empty tar: {str(e)}') + t = None + if t is None: break - next_check = now + interval - try: - t = tar.next() - except MemoryError: - raise - except Exception as e: - logger.warning(f'{tar_path}: SKIPPED: invalid or empty tar:' - f' {str(e)}') - t = None - if t is None: - break - # name = t.name - # if t.isdir(): - # name = name + '/' - # elif t.issym(): - # name = name + ' -> ' + t.linkname - # info = (f'{gen_num}:{id_num} {t.mode:04o}' - # f' {t.uname:>10}/{t.gname:<10}' - # f' {t.size:9d} {t.mtime} {name}') - # logger.debug(info) - # logger.debug(f'add to DB: {t.name}') - infodb.add(t) - # success - - infodb.commit_close() - db_close = True - InfoDB.compress_copy(infodb.db.filename, db_path, bufsize, use_fsync) - # TODO progress - finally: - tar.close() - if not db_close: - infodb.db.close() + # name = t.name + # if t.isdir(): + # name = name + '/' + # elif t.issym(): + # name = name + ' -> ' + t.linkname + # info = (f'{gen_num}:{id_num} {t.mode:04o}' + # f' {t.uname:>10}/{t.gname:<10}' + # f' {t.size:9d} {t.mtime} {name}') + # logger.debug(info) + # logger.debug(f'add to DB: {t.name}') + infodb.add(t) + # success + + infodb.commit_close() + db_close = True + InfoDB.compress_copy(infodb.db.filename, db_path, bufsize, + use_fsync) + # TODO progress + finally: + tar.close() + if not db_close: + infodb.db.close() + + @staticmethod + def list_entries_from_one(in_dbgz_path, tmpdir, bufsize, use_fsync, + resolve_ugmap=False): + base = os.path.basename(in_dbgz_path) + out_db_path = os.path.join(tmpdir.name, base + '.tmp.db') + tmpdb_url = InfoDB.decompress_copy(in_dbgz_path, out_db_path, + bufsize, use_fsync) + db = DB(tmpdb_url.path) + try: + fattr_dict = DBDict(db, FileAttr2, InfoDB.TABLE_ENTRY) + if resolve_ugmap: + user_dict = DBDict(db, StrObj, 
InfoDB.TABLE_USER) + group_dict = DBDict(db, StrObj, InfoDB.TABLE_GROUP) + user_dict_mem = {} + group_dict_mem = {} + # cache in memory + for k, v in user_dict.items(): + user_dict_mem[k] = v + for k, v in group_dict.items(): + group_dict_mem[k] = v + for path, fattr in fattr_dict.iterator(sort='ASC'): + if resolve_ugmap: + # unique id -> name + fattr.user = user_dict_mem.get(fattr.user, '???') + fattr.group = group_dict_mem.get(fattr.group, '???') + yield path, fattr + finally: + db.close() progname = os.path.basename(__file__) From 5f6bd1ba22ea390fdf864b6ceda0e3d38a358fd5 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 27 Jul 2024 15:45:15 +0900 Subject: [PATCH 042/143] gfptar: rename methods (no functional change) --- gftool/gfptar/gfptar | 62 ++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 23e41f36b..2a033472f 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2445,22 +2445,22 @@ class GfptarCommand(Command): if outdir: basedir = self.opt['--basedir'] infiles = self.opt[''] - self.create(outdir, basedir, infiles) + self.cmd_create(outdir, basedir, infiles) return outdir = self.opt['--extract'] if outdir: indir = self.opt[''] members = self.opt[''] - self.extract(outdir, indir, members) + self.cmd_extract(outdir, indir, members) return indir = self.opt['--list'] if indir: if self.verbose: - self.list_verbose(indir) + self.cmd_list_verbose(indir) else: - self.list_simple(indir) + self.cmd_list_simple(indir) return if self.opt['--test']: @@ -2619,7 +2619,7 @@ class GfptarCommand(Command): ok = False try: - self.create(test1_dir, workdir, [testsrc_name]) + self.cmd_create(test1_dir, workdir, [testsrc_name]) except Exception as e: self.print_trace(e) if str(e).startswith('specifying '): @@ -2675,7 +2675,7 @@ class GfptarCommand(Command): # Gfarm -> Gfarm(tar) try: - self.create(test1_dir_gfarm, workdir_gfarm, [testsrc_name]) + self.cmd_create(test1_dir_gfarm, workdir_gfarm, [testsrc_name]) if noread: logger.error_exit(1, testname + '(create:Gfarm->Gfarm) ' + '... FAIL (unexpected success)') @@ -2684,10 +2684,10 @@ class GfptarCommand(Command): raise logger.info(f'IGNORED: {str(e)}') # Gfarm(tar) -> Gfarm - self.extract(test2_dir_gfarm, test1_dir_gfarm, []) + self.cmd_extract(test2_dir_gfarm, test1_dir_gfarm, []) # Gfarm -> Local(tar) try: - self.create(test3_dir_local, test2_dir_gfarm, [testsrc_name]) + self.cmd_create(test3_dir_local, test2_dir_gfarm, [testsrc_name]) if noread: logger.error_exit(1, testname + '(create:Gfarm->Local) ' + '... 
FAIL (unexpected success)') @@ -2696,13 +2696,13 @@ class GfptarCommand(Command): raise logger.info(f'IGNORED: {str(e)}') # Local(tar) -> Local - self.extract(test4_dir_local, test3_dir_local, []) + self.cmd_extract(test4_dir_local, test3_dir_local, []) # --list - self.list_simple(test1_dir_gfarm, quiet=True) - self.list_simple(test3_dir_local, quiet=True) - self.list_verbose(test1_dir_gfarm, quiet=True) - self.list_verbose(test3_dir_local, quiet=True) + self.cmd_list_simple(test1_dir_gfarm, quiet=True) + self.cmd_list_simple(test3_dir_local, quiet=True) + self.cmd_list_verbose(test1_dir_gfarm, quiet=True) + self.cmd_list_verbose(test3_dir_local, quiet=True) if readonly: # extract a member (SEE ALSO: test_prepare_srcdir) @@ -2716,8 +2716,8 @@ class GfptarCommand(Command): test_member_l_name = 'test-gfptar-member-l' test_member_g = workdir_gfarm_url.url_join(test_member_g_name) test_member_l = workdir_local_url.url_join(test_member_l_name) - self.extract(test_member_g, test1_dir_gfarm, [member]) - self.extract(test_member_l, test3_dir_local, [member]) + self.cmd_extract(test_member_g, test1_dir_gfarm, [member]) + self.cmd_extract(test_member_l, test3_dir_local, [member]) g_member = GfURL.init(os.path.join(test_member_g, member)) l_member = GfURL.init(os.path.join(test_member_l, member)) if not g_member.compare_data(l_member): @@ -2742,7 +2742,7 @@ class GfptarCommand(Command): # NOTE: check ignoring hardlinks for Local # Local -> Gfarm(tar) try: - self.create(test5_dir_gfarm, workdir_local, [testsrc_name]) + self.cmd_create(test5_dir_gfarm, workdir_local, [testsrc_name]) if noread: logger.error_exit(1, testname + '(create:Local->Gfarm) ' + '... FAIL (unexpected success)') @@ -2751,7 +2751,7 @@ class GfptarCommand(Command): raise logger.info(f'IGNORED: {str(e)}') # Gfarm(tar) -> Local - self.extract(test6_dir_local, test5_dir_gfarm, []) + self.cmd_extract(test6_dir_local, test5_dir_gfarm, []) test4_srcdir_local = os.path.join(test4_dir_local, testsrc_name) if not self.test_compare_local(test4_srcdir_local, srcdir_local, @@ -2800,19 +2800,19 @@ class GfptarCommand(Command): files = ['file1', 'dir1/readonly/file#2'] # Gfarm -> Gfarm(tar) - self.create(test1_dir_gfarm, srcdir_gfarm, members) + self.cmd_create(test1_dir_gfarm, srcdir_gfarm, members) # Gfarm(tar) -> Gfarm - self.extract(test2_dir_gfarm, test1_dir_gfarm, files) + self.cmd_extract(test2_dir_gfarm, test1_dir_gfarm, files) # Local -> Local(tar) - self.create(test3_dir_local, srcdir_local, files) + self.cmd_create(test3_dir_local, srcdir_local, files) # Local(tar) -> Local - self.extract(test4_dir_local, test3_dir_local, files) - self.extract(test5_dir_local, test3_dir_local, members) + self.cmd_extract(test4_dir_local, test3_dir_local, files) + self.cmd_extract(test5_dir_local, test3_dir_local, members) # error try: - self.extract(test6_dir_local, test3_dir_local, ['abcde']) + self.cmd_extract(test6_dir_local, test3_dir_local, ['abcde']) logger.error_exit(1, testname + '(not found in archive files) ' + '... 
FAIL (unexpected success)') except GfptarError as e: @@ -2954,13 +2954,13 @@ class GfptarCommand(Command): test12_dir_local = workdir_local_url.url_join(test12_name) # basedir -> Gfarm(tar) - self.create(test9_dir_gfarm, basedir, infiles) + self.cmd_create(test9_dir_gfarm, basedir, infiles) # Gfarm(tar) -> Gfarm - self.extract(test10_dir_gfarm, test9_dir_gfarm, []) + self.cmd_extract(test10_dir_gfarm, test9_dir_gfarm, []) # Gfarm -> Local(tar) - self.create(test11_dir_local, test10_dir_gfarm, infiles) + self.cmd_create(test11_dir_local, test10_dir_gfarm, infiles) # Local(tar) -> Local - self.extract(test12_dir_local, test11_dir_local, []) + self.cmd_extract(test12_dir_local, test11_dir_local, []) result = True for infile in infiles: @@ -3107,7 +3107,7 @@ class GfptarCommand(Command): TABLE_tar_entry = 'tar_entry' - def create(self, outdir, basedir, infiles): + def cmd_create(self, outdir, basedir, infiles): logger.debug(f'create start: outdir={outdir}, basedir={basedir}') self.options_init() self.outdir = outdir @@ -3769,7 +3769,7 @@ class GfptarCommand(Command): def error_not_a_gfptar_directory(self, url_str): return GfptarError('Not a gfptar-archived directory: ' + url_str) - def extract(self, outdir, indir, specified_members): + def cmd_extract(self, outdir, indir, specified_members): logger.debug(f'extract start: outdir={outdir}, indir={indir}') self.options_init() self.outdir = outdir @@ -4287,7 +4287,7 @@ class GfptarCommand(Command): f'{bytes_per_sec_str}B/s ' f'{ent_per_sec_str}Ent/s ') - def list_simple(self, indir, quiet=False): + def cmd_list_simple(self, indir, quiet=False): self.options_init() indir_url = GfURL.init(indir) infodb_list = self.check_and_sync_infodb(indir_url) @@ -4298,7 +4298,7 @@ class GfptarCommand(Command): print(f'{fattr.ftype} {path}') self.tmpdir.cleanup() - def list_verbose(self, indir, quiet=False): + def cmd_list_verbose(self, indir, quiet=False): self.options_init() indir_url = GfURL.init(indir) infodb_list = self.check_and_sync_infodb(indir_url) From 86ebbb0ce6266d1c7a927ec18bc9f0df5829f13f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 27 Jul 2024 17:16:52 +0900 Subject: [PATCH 043/143] gfptar --sync-db: show progress --- gftool/gfptar/gfptar | 100 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 8 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 2a033472f..6a23cb58b 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -1947,14 +1947,26 @@ class Compress: cls.xz_prog = None cls.compress_type = compress_type - if compress_prog is None: + if compress_prog is not None: + compress_prog0 = shutil.which(compress_prog) + if compress_prog0 is None: + raise GfptarError(f'{compress_prog}: command not found.') + compress_prog = compress_prog0 + else: if compress_type == cls.TYPE_GZIP: compress_prog = cls.gzip_prog elif compress_type == cls.TYPE_BZIP2: compress_prog = cls.bzip2_prog elif compress_type == cls.TYPE_XZ: compress_prog = cls.xz_prog + # else: + # compress_prog = None cls.compress_prog = compress_prog # may be None + logger.debug(f'compress_type={cls.compress_type}') + logger.debug(f'compress_prog={cls.compress_prog}') + logger.debug(f'gzip_prog={cls.gzip_prog}') + logger.debug(f'bzip2_prog={cls.bzip2_prog}') + logger.debug(f'xz_prog={cls.xz_prog}') @classmethod def compress(cls, compress_prog, outf): @@ -2370,6 +2382,7 @@ class GfptarCommand(Command): self.hardlink_warn = True if self.quiet: GfURL.shutup_stderr() + self.tmpdir = None def options_init(self): 
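
The shutil.which() check added to Compress.init above fails fast when a user-supplied compressor (for example pigz) is not installed, instead of failing later inside a pipeline. The same pattern in isolation; the pigz-then-gzip fallback below is only an illustration, not gfptar's behavior:

    import shutil, subprocess

    def pick_compressor(preferred='pigz', fallback='gzip'):
        # shutil.which() returns the absolute path, or None if not installed
        prog = shutil.which(preferred) or shutil.which(fallback)
        if prog is None:
            raise RuntimeError(f'neither {preferred} nor {fallback} found in PATH')
        return prog

    def compress_file(src, dst):
        prog = pick_compressor()
        with open(src, 'rb') as inf, open(dst, 'wb') as outf:
            # gzip/pigz -c: read stdin, write compressed data to stdout
            subprocess.run([prog, '-c'], stdin=inf, stdout=outf, check=True)
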
set_encoding(self.opt['--encoding']) @@ -2489,6 +2502,11 @@ class GfptarCommand(Command): finally: self.worker_terminate() self.sig_default() + if self.tmpdir: + try: + self.tmpdir.cleanup() + except Exception: + pass logger.debug('exit') def test_init(self): @@ -3840,7 +3858,7 @@ class GfptarCommand(Command): infodb_list2 = self.check_and_sync_infodb(self.indir_url, tar_list=tar_list) if infodb_list2 is not None: - infodb_list = infodb_list2 + infodb_list = infodb_list2 # use new list self.sort_tar_or_infodb_list(infodb_list) infodb_dict = {} @@ -4225,6 +4243,25 @@ class GfptarCommand(Command): f'{sec_str} ' f'{ent_per_sec_str}Ent/s ') + def progress_for_sync_infodb(self, now): + sec = now - self.start_time + sec_str = format_seconds(sec, minhour=True) + current_tar_num_str = self._humanize(self.current_tar_num) + total_tar_num_str = self._humanize(self.total_tar_num) + current_ent_num_str = self._humanize(self.current_ent_num) + current_size_str = self._humanize(self.current_size) + if sec > 0: + ent_per_sec = self.current_ent_num / sec + else: + ent_per_sec = 0 + ent_per_sec_str = self._humanize(ent_per_sec) + sys.stdout.write(f'\rsync-db: ' + f'{current_tar_num_str}/{total_tar_num_str}Tar ' + f'{current_size_str}B ' + f'{current_ent_num_str}Ent ' + f'{sec_str} ' + f'{ent_per_sec_str}Ent/s ') + # lock required def progress_for_create(self, now): sec = now - self.start_time @@ -4301,7 +4338,35 @@ class GfptarCommand(Command): def cmd_list_verbose(self, indir, quiet=False): self.options_init() indir_url = GfURL.init(indir) - infodb_list = self.check_and_sync_infodb(indir_url) + + # SEE ALSO: cmd_extract() + tar_list, infodb_list = self.list_tar_and_infodb_files(indir_url) + infodb_list2 = self.check_and_sync_infodb(indir_url, + tar_list=tar_list) + if infodb_list2 is not None: + infodb_list = infodb_list2 # use new list + + infodb_dict = {} + for id_num, gen_num, infodb_path in infodb_list: + infodb_dict[id_num] = infodb_path + + error_num = 0 + tar_dict = {} + for id_num, gen_num, tar_path in tar_list: + tar_dict[id_num] = tar_path + infodb = infodb_dict.get(id_num, None) + if infodb is None: + logger.error(f'lost *_info.db.gz tied to {tar_path}') + error_num += 1 + + for id_num, gen_num, in_dbgz_path in infodb_list: + tar_path = tar_dict.get(id_num, None) + if tar_path is None: + logger.error(f'lost *.tar.* tied to {in_dbgz_path}') + error_num += 1 + + if error_num > 0: + raise GfptarError(f'Total errors encountered: {error_num}') for path, fattr in self.list_entries_from_all( indir_url, infodb_list=infodb_list, resolve_ugmap=True): @@ -4402,18 +4467,22 @@ class GfptarCommand(Command): logger.debug('check_and_sync_infodb: enable') if tar_list is None: tar_list = self.list_tar_files(indir_url) - update = True + update = True # always True + self.total_tar_num = 0 + self.current_tar_num = 0 # generation -> tar files gen_to_tar_list = defaultdict(list) for id_num, gen_num, path in tar_list: gen_to_tar_list[gen_num].append((id_num, path)) + self.total_tar_num += 1 if self.jobs >= 1: max_workers = self.jobs else: max_workers = 1 + self.start_time = time.time() save_e = None cancel = False # Concurrent execution for each generation @@ -4430,6 +4499,8 @@ class GfptarCommand(Command): signal.signal(signal.SIGTERM, sig_handler) signal.signal(signal.SIGPIPE, signal.SIG_IGN) + self.current_ent_num = 0 + self.current_size = 0 for gen_num, tarlist in gen_to_tar_list.items(): with concurrent.futures.ProcessPoolExecutor( max_workers=max_workers) as executor: @@ -4446,7 +4517,13 @@ class 
GfptarCommand(Command): for future in concurrent.futures.as_completed(futures): try: - future.result() + num, size = future.result() + self.current_ent_num += num + self.current_size += size + self.current_tar_num += 1 + if self.progress_enabled or self.verbose: + now = time.time() + self.progress_for_sync_infodb(now) except Exception as e: # logger.error(f'{e}') self.print_trace(e) @@ -4457,6 +4534,9 @@ class GfptarCommand(Command): cancel = True self.sig_default() # no longer be able to access Manager + if self.progress_enabled or self.verbose: + self.progress_for_sync_infodb(now) + sys.stdout.write('\n') if save_e: raise save_e if cancel: @@ -4593,7 +4673,7 @@ class InfoDB: tar_path, bufsize, use_fsync) = args if share_cancel.value != 0: # logger.debug('Canceled (2)') - return + return 0, 0 tar_url = GfURL.init(tar_path) indir_url = tar_url.parent db_name = InfoDB.infodb_filename(gen_num, id_num) @@ -4601,7 +4681,7 @@ class InfoDB: db_url = GfURL.init(db_path) if not update and db_url.exists(): logger.debug(f'not update: {db_path}') - return + return 0, 0 InfoDB.signal_init(share_cancel) # Local file @@ -4609,6 +4689,8 @@ class InfoDB: infodb = InfoDB(tmpdb_path) tar = GfTarFile.extract_open(tar_url, bufsize) db_close = False + num = 0 + size = 0 try: interval = 1 # sec. | for interrupt next_check = time.time() + interval @@ -4641,17 +4723,19 @@ class InfoDB: # logger.debug(info) # logger.debug(f'add to DB: {t.name}') infodb.add(t) + num += 1 + size += t.size # success infodb.commit_close() db_close = True InfoDB.compress_copy(infodb.db.filename, db_path, bufsize, use_fsync) - # TODO progress finally: tar.close() if not db_close: infodb.db.close() + return num, size @staticmethod def list_entries_from_one(in_dbgz_path, tmpdir, bufsize, use_fsync, From 20cb2dc2a6f27a2225337edd9e89af1e80fe05e6 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 27 Jul 2024 17:24:20 +0900 Subject: [PATCH 044/143] sequel to "gfptar --sync-db: show progress" --- gftool/gfptar/gfptar | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 6a23cb58b..40a4e2457 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4501,6 +4501,10 @@ class GfptarCommand(Command): self.current_ent_num = 0 self.current_size = 0 + if self.progress_enabled or self.verbose: + now = time.time() + self.progress_for_sync_infodb(now) + for gen_num, tarlist in gen_to_tar_list.items(): with concurrent.futures.ProcessPoolExecutor( max_workers=max_workers) as executor: @@ -4535,6 +4539,7 @@ class GfptarCommand(Command): self.sig_default() # no longer be able to access Manager if self.progress_enabled or self.verbose: + now = time.time() self.progress_for_sync_infodb(now) sys.stdout.write('\n') if save_e: From ebc17f87edb9515799fc12de3110a4d0a9948138 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 27 Jul 2024 22:51:07 +0900 Subject: [PATCH 045/143] gfptar: change the format of progress to fixed length gfptar: improve subprocess management gfptar --create: use .tar.gz.tmp --- gftool/gfptar/gfptar | 205 +++++++++++++++++++++++++++++-------------- 1 file changed, 138 insertions(+), 67 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 40a4e2457..c2fae8cc1 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -80,12 +80,15 @@ def humanize_number(num, binary_prefix=False): while n >= base and scale < ulen: n /= base scale += 1 - if n < 100: + if n < 10: + d = n.quantize(Decimal('0.000'), rounding=ROUND_DOWN) + elif n < 100: + d = 
n.quantize(Decimal('0.00'), rounding=ROUND_DOWN) + elif n < 1000: d = n.quantize(Decimal('0.0'), rounding=ROUND_DOWN) - return f'{d}{units[scale]}' else: d = n.quantize(Decimal('0'), rounding=ROUND_DOWN) - return f'{d}{units[scale]}' + return f'{d}{units[scale]}' def unhumanize_number(numstr, binary_prefix=False): @@ -1009,7 +1012,7 @@ class GfURL(metaclass=abc.ABCMeta): if not fullpath.startswith(base): logger.error('subpath: %s, %s', base, fullpath) raise AssertionError(f'base={base}, fullpath={fullpath}') - logger.debug('subpath: %s, %s', base, fullpath) + # logger.debug('subpath: %s, %s', base, fullpath) return fullpath[len(base):].lstrip('/') # relative path def url_join(self, subpath): @@ -2198,18 +2201,19 @@ class TestGfptar(unittest.TestCase): self.assertEqual(humanize_number(0.1), '0') self.assertEqual(humanize_number(1.9), '1') self.assertEqual(humanize_number(1023, binary_prefix=True), '1023') - self.assertEqual(humanize_number(1024, binary_prefix=True), '1.0Ki') + self.assertEqual(humanize_number(1024, binary_prefix=True), '1.000Ki') self.assertEqual(humanize_number(999), '999') - self.assertEqual(humanize_number(1000), '1.0k') - self.assertEqual(humanize_number(99999), '99.9k') - self.assertEqual(humanize_number(100000), '100k') - self.assertEqual(humanize_number(1900000), '1.9M') - self.assertEqual(humanize_number(2000000), '2.0M') + self.assertEqual(humanize_number(1000), '1.000k') + self.assertEqual(humanize_number(99999), '99.99k') + self.assertEqual(humanize_number(100000), '100.0k') + self.assertEqual(humanize_number(1900000), '1.900M') + self.assertEqual(humanize_number(2000000), '2.000M') def test_unhumanize(self): self.assertEqual(unhumanize_number('999'), 999) self.assertEqual(unhumanize_number('1k'), 1000) self.assertEqual(unhumanize_number('1K'), 1000) + # self.assertEqual(unhumanize_number('1.5k'), 1500) # TODO self.assertEqual(unhumanize_number('1k', binary_prefix=True), 1024) self.assertEqual(unhumanize_number('1ki'), 1024) self.assertEqual(unhumanize_number('1Ki'), 1024) @@ -2366,17 +2370,29 @@ class CannotBeArchivedError(GfptarError): msg += 'directories' else: msg += 'directory' - self.message = f'Error: {msg} cannot be archived' + self.message = f'{msg} cannot be archived' super().__init__(self.message) +@contextmanager +def ignore_exception(ignore): + try: + yield + except Exception as e: + if ignore: + logger.debug(f'Ignored: {str(e)}') + return + raise + + class GfptarCommand(Command): LIST_SUFFIX = '.lst' # to ignore old files + TMP_SUFFIX = '.tmp' SERIAL_FORMAT = '%04d' def __init__(self, name): self.init(name) - self.canceled = threading.Event() + self.canceled = False self.lock_init(False) self.futures = None self.hardlink_warn = True @@ -2396,8 +2412,11 @@ class GfptarCommand(Command): euid = os.geteuid() # Temporary files are removed when the process exits. 
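
The comments here refer to a tempfile.TemporaryDirectory whose cleanup() is now also called explicitly (see the run() and options_init() hunks above), so the scratch directory disappears even when a run is interrupted. A reduced sketch of that arrangement, with a placeholder prefix:

    import os, tempfile

    def with_scratch_dir(work, parent=None):
        euid = os.geteuid()   # POSIX only, like gfptar itself
        tmpdir = tempfile.TemporaryDirectory(prefix=f'scratch-{euid}-', dir=parent)
        try:
            return work(tmpdir.name)      # tmpdir.name is the directory path
        finally:
            tmpdir.cleanup()              # runs even if work() raised

    if __name__ == '__main__':
        with_scratch_dir(lambda d: print('working in', d))
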
# dir=None: system default + if self.tmpdir: + self.tmpdir.cleanup() self.tmpdir = tempfile.TemporaryDirectory(prefix=f'gfptar-{euid}-', dir=self.workdir) + self.sig_init() # to clean tmpdir progress_unit_type = self.opt['--progress-unit'] if progress_unit_type == 'si': @@ -2428,8 +2447,9 @@ class GfptarCommand(Command): def sig_init(self): def sig_handler(signum, frame): - logger.warning(f'Interrupt (signal={signum}') - self.cancel() + # logger.warning(f'Interrupt (signal={signum})') + sys.stderr.write(f'\nInterrupt (signal={signum})\n') + self.canceled = True signal.signal(signal.SIGHUP, sig_handler) signal.signal(signal.SIGINT, sig_handler) @@ -2442,6 +2462,12 @@ class GfptarCommand(Command): signal.signal(signal.SIGTERM, signal.SIG_DFL) signal.signal(signal.SIGPIPE, signal.SIG_IGN) + def sig_ignore(self): + signal.signal(signal.SIGHUP, signal.SIG_IGN) + signal.signal(signal.SIGINT, signal.SIG_IGN) + signal.signal(signal.SIGTERM, signal.SIG_IGN) + signal.signal(signal.SIGPIPE, signal.SIG_IGN) + def getDoc(self) -> str: return __doc__ @@ -2503,11 +2529,9 @@ class GfptarCommand(Command): self.worker_terminate() self.sig_default() if self.tmpdir: - try: - self.tmpdir.cleanup() - except Exception: - pass - logger.debug('exit') + self.tmpdir.cleanup() + logger.debug('tmpdir.cleanup') + logger.debug('DONE') def test_init(self): self.am_I_gfarmroot = am_I_gfarmroot() @@ -3180,8 +3204,7 @@ class GfptarCommand(Command): self.listing = True has_error = None - self.create_job_init() - self.sig_init() + self.create_job_init() # before creating threads infiles_checked = [] for infile in infiles: @@ -3294,7 +3317,7 @@ class GfptarCommand(Command): if has_error is not None: self.cancel() - if filelist_num > 0: + if filelist_num > 0 and not self.is_canceled(): try: filelist.commit() filelist.close() @@ -3414,6 +3437,7 @@ class GfptarCommand(Command): if self.MT_enabled(): has_error = self._create_job_check_MT(timeout=timeout) self.thread_pool.shutdown(wait=False) + logger.debug('self.thread_pool.shutdown') if has_error is not None: raise has_error self.worker_terminate() @@ -3490,15 +3514,15 @@ class GfptarCommand(Command): def clear_canceled(self): with self.lock(): - self.canceled.clear() + self.canceled = False def cancel(self): with self.lock(): - self.canceled.set() + self.canceled = True def is_canceled(self): with self.lock(): - val = self.canceled.is_set() + val = self.canceled return val def _gfsched_sometimes(self, gfurl): @@ -3602,8 +3626,8 @@ class GfptarCommand(Command): out_tar_path, out_dbgz_path) = result with self.lock(): self.archived_size += tar_size - self.info(f'created(.tar): {out_tar_path}') - self.info(f'created(.db.gz): {out_dbgz_path}') + self.info('created(.tar): {}', out_tar_path) + self.info('created(.db.gz): {}', out_dbgz_path) break elif result[0] == 'ERR': (status, exc_type_name, exc_value_str, @@ -3624,7 +3648,7 @@ class GfptarCommand(Command): self.name, loglevel=self.loglevel, debug=self.debug, verbose=self.verbose) - self.sig_default() + self.sig_ignore() while True: pid = os.getpid() logger.debug(f'create_a_tar_process: start, pid={pid}') @@ -3705,7 +3729,7 @@ class GfptarCommand(Command): else: prefix_str = serial_str outname_max = self.outdir_url.MAXNAMLEN \ - - len(prefix_str) - len(self.LIST_SUFFIX) + - len(prefix_str) - len(self.TMP_SUFFIX) outname_len = len(outname.encode()) offset = 0 while outname_len > outname_max: @@ -3717,14 +3741,17 @@ class GfptarCommand(Command): offset += 1 # ex.: home/user1/dir -> home_user1_dir outname = prefix_str + 
outname.replace('/', '_') - outurl = GfURL.init(self.outdir_url.url_join(outname), - use_gfarm_command=self.use_gfarm_command) - target_host = self.select_a_target_host(outurl, serial) - tar = GfTarFile.create_open(outurl, self.compress_type, self.bufsize, - use_fsync=self.use_fsync, - target_host=target_host, - dummy_input=self.dummy_input, - dummy_sleep=self.dummy_sleep) + outname_path = self.outdir_url.url_join(outname) + outname_path_tmp = outname_path + self.TMP_SUFFIX + outurl_tmp = GfURL.init(outname_path_tmp, + use_gfarm_command=self.use_gfarm_command) + target_host = self.select_a_target_host(outurl_tmp, serial) + tar_tmp = GfTarFile.create_open(outurl_tmp, self.compress_type, + self.bufsize, + use_fsync=self.use_fsync, + target_host=target_host, + dummy_input=self.dummy_input, + dummy_sleep=self.dummy_sleep) # to reduce memory usage # SEE ALSO: InfoDB.generate_one() db_name = InfoDB.infodb_filename(gen, serial) @@ -3733,9 +3760,11 @@ class GfptarCommand(Command): cannot_be_archived = 0 db_close = False + cancel = False + has_error = False try: for entry in filelist: - logger.debug(f'subprocess(serial={serial}): {entry.path}') + # logger.debug(f'subprocess(serial={serial}): {entry.path}') while not input_queue.empty(): try: qdata = input_queue.get(timeout=1) @@ -3743,20 +3772,25 @@ class GfptarCommand(Command): qdata = None if qdata == 'CANCEL': logger.debug('receive CANCEL from parent') + cancel = True break else: logger.error('unexpected message from parent') + cancel = True break + if cancel: + break subpath = entry.subpath(self.basedir_url) try: - logger.debug(f'tar.add_entry: {subpath}') + logger.debug(f'tar_tmp.add_entry: {subpath}') # GfTarFile.METHOD_add_entry - tarinfo = tar.add_entry(subpath, entry) + tarinfo = tar_tmp.add_entry(subpath, entry) infodb.add(tarinfo) size_all = entry.size_all() output_queue.put(('ADD', subpath, size_all)) except MemoryError: - tar.close() + raise + except BrokenPipeError: raise except Exception as e: cannot_be_archived += 1 @@ -3764,12 +3798,25 @@ class GfptarCommand(Command): continue infodb.commit_close() db_close = True + if cancel: + raise self.error_canceled() + except Exception: + has_error = True + raise finally: - tar.close() - tardb.close() - tardb.unlink() - if not db_close: - infodb.db.close() + with ignore_exception(has_error): + tar_tmp.close() + with ignore_exception(has_error): + tardb.close() + with ignore_exception(has_error): + tardb.unlink() + with ignore_exception(has_error): + if not db_close: + infodb.db.close() + + # success + tar_size = outurl_tmp.size() + outurl_tmp.rename(outname_path) out_db_path = self.outdir_url.url_join(db_name) InfoDB.compress_copy(infodb.db.filename, out_db_path, @@ -3778,8 +3825,7 @@ class GfptarCommand(Command): # for DEBUG # raise Exception('unexpected raise') - tar_size = outurl.size() - return tar_size, cannot_be_archived, outurl.url_str, out_db_path + return tar_size, cannot_be_archived, outname_path, out_db_path def error_canceled(self): return GfptarError('Canceled') @@ -3822,7 +3868,6 @@ class GfptarCommand(Command): self.start_time = time.time() self.next_time = self.start_time + 1 - self.sig_init() self.extract_main() tmpdir.cleanup() @@ -3948,9 +3993,9 @@ class GfptarCommand(Command): now = time.time() if now >= self.next_time: self.next_time = now + 1 - self.progress_for_listing(now) + self.progress_for_schedule(now) if self.progress_enabled: - self.progress_for_listing(time.time()) + self.progress_for_schedule(time.time()) sys.stdout.write('\n') if self.search_target: 
self.total_num = len(member_set) # update @@ -4229,7 +4274,7 @@ class GfptarCommand(Command): def _humanize_raw(self, n): return int(n) - def progress_for_listing(self, now): + def progress_for_schedule(self, now): sec = now - self.start_time sec_str = format_seconds(sec, minhour=True) total_num_str = self._humanize(self.total_num) @@ -4238,10 +4283,10 @@ class GfptarCommand(Command): else: ent_per_sec = 0 ent_per_sec_str = self._humanize(ent_per_sec) - sys.stdout.write(f'\rlisting: ' + sys.stdout.write(f'\rschedule: ' f'{total_num_str}Ent ' f'{sec_str} ' - f'{ent_per_sec_str}Ent/s ') + f'{ent_per_sec_str}Ent/s') def progress_for_sync_infodb(self, now): sec = now - self.start_time @@ -4260,7 +4305,7 @@ class GfptarCommand(Command): f'{current_size_str}B ' f'{current_ent_num_str}Ent ' f'{sec_str} ' - f'{ent_per_sec_str}Ent/s ') + f'{ent_per_sec_str}Ent/s') # lock required def progress_for_create(self, now): @@ -4291,12 +4336,12 @@ class GfptarCommand(Command): total_size_str = self._humanize(self.total_size) bytes_per_sec_str = self._humanize(bytes_per_sec) ent_per_sec_str = self._humanize(ent_per_sec) - sys.stdout.write(f'\rcreated: {percent_str}% ' + sys.stdout.write(f'\rcreate: {percent_str}% ' f'{stored_size_str}/{total_size_str}B ' f'{stored_num_str}/{total_num_str}Ent ' f'{sec_str} ' f'{bytes_per_sec_str}B/s ' - f'{ent_per_sec_str}Ent/s ') + f'{ent_per_sec_str}Ent/s') # lock required def progress_for_extract(self, now): @@ -4317,20 +4362,21 @@ class GfptarCommand(Command): extracted_size_str = self._humanize(self.extracted_size) bytes_per_sec_str = self._humanize(bytes_per_sec) ent_per_sec_str = self._humanize(ent_per_sec) - sys.stdout.write(f'\rextracted: {percent:.0f}% ' + sys.stdout.write(f'\rextract: {percent:.0f}% ' f'{extracted_size_str}B ' f'{extracted_num_str}/{total_num_str}Ent ' f'{sec_str} ' f'{bytes_per_sec_str}B/s ' - f'{ent_per_sec_str}Ent/s ') + f'{ent_per_sec_str}Ent/s') def cmd_list_simple(self, indir, quiet=False): self.options_init() indir_url = GfURL.init(indir) infodb_list = self.check_and_sync_infodb(indir_url) - for path, fattr in self.list_entries_from_all( indir_url, infodb_list=infodb_list, resolve_ugmap=True): + if self.is_canceled(): + break if not quiet: print(f'{fattr.ftype} {path}') self.tmpdir.cleanup() @@ -4370,6 +4416,8 @@ class GfptarCommand(Command): for path, fattr in self.list_entries_from_all( indir_url, infodb_list=infodb_list, resolve_ugmap=True): + if self.is_canceled(): + break if not quiet: print(f'{fattr.ftype} {fattr.mode:04o}' f' {fattr.user:>10}/{fattr.group:<10}' @@ -4485,13 +4533,20 @@ class GfptarCommand(Command): self.start_time = time.time() save_e = None cancel = False + + sigs = [signal.SIGHUP, signal.SIGINT, signal.SIGTERM] + orig_sig_handler = {} + for sig in sigs: + orig_sig_handler[sig] = signal.getsignal(sig) + # Concurrent execution for each generation with multiprocessing.Manager() as manager: lock = manager.Lock() share_cancel = manager.Value('i', 0) def sig_handler(signum, frame): - logger.warning(f'Interrupt (signal={signum}') + # logger.warning(f'Interrupt (signal={signum}') + sys.stderr.write(f'\nInterrupt (signal={signum})\n') share_cancel.value = 1 signal.signal(signal.SIGHUP, sig_handler) @@ -4536,7 +4591,10 @@ class GfptarCommand(Command): save_e = e if share_cancel.value != 0: cancel = True - self.sig_default() # no longer be able to access Manager + # no longer be able to access Manager + for sig in sigs: + # restore + signal.signal(sig, orig_sig_handler[sig]) if self.progress_enabled or self.verbose: now = 
time.time() @@ -4662,7 +4720,8 @@ class InfoDB: if not cls.signal_initialized: def sig_handler(signum, frame): if share_cancel.value != 0: - logger.warning(f'Interrupt (signal={signum}') + # logger.warning(f'Interrupt (signal={signum})') + sys.stderr.write(f'\nInterrupt (signal={signum})\n') share_cancel.value = 1 signal.signal(signal.SIGHUP, sig_handler) @@ -4696,6 +4755,7 @@ class InfoDB: db_close = False num = 0 size = 0 + has_error = True try: interval = 1 # sec. | for interrupt next_check = time.time() + interval @@ -4705,6 +4765,7 @@ class InfoDB: # access manager.Value(): very high cost if share_cancel.value != 0: logger.info('Canceled') + has_error = True break next_check = now + interval try: @@ -4736,10 +4797,15 @@ class InfoDB: db_close = True InfoDB.compress_copy(infodb.db.filename, db_path, bufsize, use_fsync) + except Exception: + has_error = True + raise finally: - tar.close() - if not db_close: - infodb.db.close() + with ignore_exception(has_error): + tar.close() + with ignore_exception(has_error): + if not db_close: + infodb.db.close() return num, size @staticmethod @@ -4750,6 +4816,7 @@ class InfoDB: tmpdb_url = InfoDB.decompress_copy(in_dbgz_path, out_db_path, bufsize, use_fsync) db = DB(tmpdb_url.path) + has_error = False try: fattr_dict = DBDict(db, FileAttr2, InfoDB.TABLE_ENTRY) if resolve_ugmap: @@ -4768,8 +4835,12 @@ class InfoDB: fattr.user = user_dict_mem.get(fattr.user, '???') fattr.group = group_dict_mem.get(fattr.group, '???') yield path, fattr + except Exception: + has_error = True + raise finally: - db.close() + with ignore_exception(has_error): + db.close() progname = os.path.basename(__file__) From a191e24c1aab27c4737b6a13ab1aa2032ad75b05 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 27 Jul 2024 23:23:40 +0900 Subject: [PATCH 046/143] gfptar: support decimal point for SI/Binary prefix --- gftool/gfptar/gfptar | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index c2fae8cc1..75cf2a3b6 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -101,12 +101,12 @@ def unhumanize_number(numstr, binary_prefix=False): return int(numstr) lastchar = numstr[-1] if lastchar == 'i' and numstrlen > 2: - # binary prefix - n = int(numstr[:(numstrlen-2)]) + # binary prefix: ex. 1Ki + n = float(numstr[:(numstrlen-2)]) prefix = numstr[-2] base = 1024 # even if binary_prefix=False else: # SI prefix - n = int(numstr[:(numstrlen-1)]) + n = float(numstr[:(numstrlen-1)]) prefix = lastchar units = {'K': 1, @@ -121,7 +121,7 @@ def unhumanize_number(numstr, binary_prefix=False): power = units.get(prefix.upper()) if power is None: return int(numstr) - return n * (base ** power) + return int(n * (base ** power)) # Do not use the same DB object from multi-threads. 
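
The hunk above switches the numeric part from int() to float() so that values such as 1.234k or 6.1E are accepted, truncating to an integer only after the prefix is applied. A standalone version of that parsing rule; the function name is illustrative and the prefix table follows the SI/binary convention described in the built-in help:

    def parse_size(text, binary_prefix=False):
        """'1.234k' -> 1234, '1Ki' -> 1024, '6.1E' -> 6100000000000000000"""
        units = {'K': 1, 'M': 2, 'G': 3, 'T': 4, 'P': 5, 'E': 6}
        if text.endswith('i') and len(text) > 2:      # binary prefix: 1Ki, 2Mi, ...
            num, prefix, base = float(text[:-2]), text[-2], 1024
        elif text and text[-1].upper() in units:      # SI prefix: 1k, 2M, ...
            num, prefix = float(text[:-1]), text[-1]
            base = 1024 if binary_prefix else 1000
        else:
            return int(text)
        return int(num * (base ** units[prefix.upper()]))

    assert parse_size('1.234k') == 1234
    assert parse_size('1Ki') == 1024
    assert parse_size('6.1E') == 6100000000000000000
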
@@ -2213,7 +2213,7 @@ class TestGfptar(unittest.TestCase): self.assertEqual(unhumanize_number('999'), 999) self.assertEqual(unhumanize_number('1k'), 1000) self.assertEqual(unhumanize_number('1K'), 1000) - # self.assertEqual(unhumanize_number('1.5k'), 1500) # TODO + self.assertEqual(unhumanize_number('1.234k'), 1234) self.assertEqual(unhumanize_number('1k', binary_prefix=True), 1024) self.assertEqual(unhumanize_number('1ki'), 1024) self.assertEqual(unhumanize_number('1Ki'), 1024) @@ -2222,6 +2222,7 @@ class TestGfptar(unittest.TestCase): self.assertEqual(unhumanize_number('4Ti'), 4398046511104) self.assertEqual(unhumanize_number('5Pi'), 5629499534213120) self.assertEqual(unhumanize_number('6Ei'), 6917529027641081856) + self.assertEqual(unhumanize_number('6.1E'), 6100000000000000000) def test_GfURL_use_gfarm_command_for_local(self): url = GfURL.init('/tmp', use_gfarm_command=True) @@ -4884,7 +4885,7 @@ Example of --extract (Gfarm to Local): ... /home/user1/out2/dir/test9999.data -SI prefix or Binary prerix: +SI prefix or Binary prefix: SI prefix: - 1k = 10^3 = 1000^1 (kilo) (not K) - 1M = 10^6 = 1000^2 (mega) From 881ac0d750a42874f5e6326869b3fb5b0eec056e Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 28 Jul 2024 13:54:22 +0900 Subject: [PATCH 047/143] gfptar: simplify --- gftool/gfptar/gfptar | 49 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 75cf2a3b6..a276e074f 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2404,6 +2404,11 @@ class GfptarCommand(Command): def options_init(self): set_encoding(self.opt['--encoding']) self.jobs = self.opt['--jobs'] + # SEE ALSO: MT_enabled() + if self.jobs >= 1: + self.max_workers = self.jobs + else: + self.max_workers = 1 self.bufsize = self.opt['--bufsize'] self.progress_enabled = self._progress_enabled() self.use_fsync = not self.opt['--disable-fsync'] @@ -2584,6 +2589,7 @@ class GfptarCommand(Command): # create tar per one entry self.opt['--size'] = 0 if full: + # jobs == 0: not use multithread pattern_jobs = [0, 16] else: pattern_jobs = [] # skip @@ -3046,6 +3052,7 @@ class GfptarCommand(Command): exclude_list=exclude_list) def MT_enabled(self): + # jobs == 0: not use multithread return self.jobs >= 1 # lock required @@ -3357,16 +3364,12 @@ class GfptarCommand(Command): self.lock_init(True) self.create_job_execute = self._create_job_execute_MT self.thread_pool = concurrent.futures.ThreadPoolExecutor( - max_workers=self.jobs) + max_workers=self.max_workers) self.futures = {} # key: serial number else: self.lock_init(False) self.create_job_execute = self._create_job_execute - if self.jobs <= 0: - worker_num = 1 - else: - worker_num = self.jobs # kill old processes for regress self.worker_terminate() @@ -3376,15 +3379,19 @@ class GfptarCommand(Command): self.worker_list = [] # (started, process, inq, outq) self.worker_ident_dict = {} - for i in range(worker_num): + for i in range(self.max_workers): inq = multiprocessing.Queue() outq = multiprocessing.Queue() + # NOTE: use multiprocessing.Process instead of ProcessPoolExecutor. # ProcessPoolExecutor cannot be utilized here, - # because, for example, Queue cannot be not specified as arguments. + # because, for example, multiprocessing.Queue() cannot be + # specified as arguments for ProcessPoolExecutor. + # multiprocessing.Manager().Queue() can be used + # for ProcessPoolExecutor, but it is very slow. 
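
The comment block above explains why gfptar keeps its own worker processes here: a multiprocessing.Queue cannot be passed as an argument to ProcessPoolExecutor workers, and a Manager-backed queue is slow. A minimal version of the Process-plus-Queue arrangement; the worker body and the message tuples are placeholders:

    import multiprocessing

    def worker(inq, outq):
        while True:
            job = inq.get()
            if job is None:                  # sentinel: no more work
                break
            outq.put(('DONE', job, job * job))

    if __name__ == '__main__':
        inq, outq = multiprocessing.Queue(), multiprocessing.Queue()
        procs = [multiprocessing.Process(target=worker, args=(inq, outq))
                 for _ in range(4)]
        for p in procs:
            p.start()
        for n in range(10):
            inq.put(n)
        for _ in procs:
            inq.put(None)                    # one sentinel per worker
        results = [outq.get() for _ in range(10)]
        for p in procs:
            p.join()
        print(sorted(r[1] for r in results))
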
process = multiprocessing.Process(target=self.create_a_tar_process, args=(inq, outq)) started = Started() - started.status = False + started.status = False # lock required self.worker_list.append((started, process, inq, outq)) def _create_job_execute(self, gen, serial, arg): @@ -3653,10 +3660,7 @@ class GfptarCommand(Command): while True: pid = os.getpid() logger.debug(f'create_a_tar_process: start, pid={pid}') - try: - self.create_a_tar_process0(input_queue, output_queue) - except KeyboardInterrupt: - pass + self.create_a_tar_process0(input_queue, output_queue) def create_a_tar_process0(self, input_queue, output_queue): request = input_queue.get() @@ -3674,8 +3678,6 @@ class GfptarCommand(Command): output_queue.put(('DONE', tar_size, cannot_be_archived, outurl, infodb_url)) logger.debug(f'subprocess exits: serial={serial}') - except KeyboardInterrupt: - pass except Exception as e: logger.debug(f'create_a_tar_process: error (serial={serial})') self.print_trace(e) @@ -4125,7 +4127,7 @@ class GfptarCommand(Command): def extract_from_archives_MT(self, target_set, member_set): self.lock_init(True) with concurrent.futures.ThreadPoolExecutor( - max_workers=self.jobs) as executor: + max_workers=self.max_workers) as executor: self.futures = {} # tar filenames serial = 0 for target in target_set.iterator(sort='ASC'): @@ -4451,9 +4453,9 @@ class GfptarCommand(Command): base = os.path.basename(path) g_match = db_gen_pattern.match(base) if g_match: - # int("0001") -> 1 gen_num = g_match.group(1) id_num = g_match.group(2) + # int("0001") -> 1 infodb_list.append((int(id_num), int(gen_num), path)) return infodb_list @@ -4462,7 +4464,7 @@ class GfptarCommand(Command): def list_tar_files(self, indir_url, infodb=False): # *.tar or *.tar.* - PATT_TAR = r'.*\.tar(\.\w{1,5})?$' + PATT_TAR = r'^.*\.tar(\.\w{1,5})?$' PATT_TAR_GEN1 = r'^(\d+)_.+' PATT_TAR_GEN = r'^g(\d+)_(\d+)_.+' tar_pattern = re.compile(PATT_TAR) @@ -4479,9 +4481,9 @@ class GfptarCommand(Command): if infodb: db_match = db_gen_pattern.match(base) if db_match: - # int("0001") -> 1 gen_num = db_match.group(1) id_num = db_match.group(2) + # int("0001") -> 1 infodb_list.append((int(id_num), int(gen_num), path)) continue if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst, *.tar.lst @@ -4526,11 +4528,6 @@ class GfptarCommand(Command): gen_to_tar_list[gen_num].append((id_num, path)) self.total_tar_num += 1 - if self.jobs >= 1: - max_workers = self.jobs - else: - max_workers = 1 - self.start_time = time.time() save_e = None cancel = False @@ -4563,7 +4560,7 @@ class GfptarCommand(Command): for gen_num, tarlist in gen_to_tar_list.items(): with concurrent.futures.ProcessPoolExecutor( - max_workers=max_workers) as executor: + max_workers=self.max_workers) as executor: arglist = [] for id_num, tar_path in tarlist: arglist.append((lock, share_cancel, self.tmpdir, @@ -4630,12 +4627,14 @@ class InfoDB: self.user_dict = {} # use memory for speed self.group_dict = {} # use memory for speed - # SEE ALSO: infodb_filename + # SEE ALSO: infodb_filename() PATT_INFODB = r'^g(\d+)_(\d+)_gfptar.db.gz$' @classmethod def infodb_filename(cls, gen_num, id_num): # SEE ALSO: PATT_INFODB + # gen_num (>= 1): the generation number + # id_num (>= 1): the serial number # ex. 
g2_0099_gfptar.db.gz return f'g{gen_num}_{int(id_num):04}_gfptar.db.gz' From f8baa81487fbd9dd4a405e3883d98c37743914ae Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 28 Jul 2024 14:18:21 +0900 Subject: [PATCH 048/143] gfptar --verbose --list: print the serial number for each tar achive --- gftool/gfptar/gfptar | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index a276e074f..f7d461898 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4376,7 +4376,7 @@ class GfptarCommand(Command): self.options_init() indir_url = GfURL.init(indir) infodb_list = self.check_and_sync_infodb(indir_url) - for path, fattr in self.list_entries_from_all( + for id_num, gen_num, path, fattr in self.list_entries_from_all( indir_url, infodb_list=infodb_list, resolve_ugmap=True): if self.is_canceled(): break @@ -4417,12 +4417,12 @@ class GfptarCommand(Command): if error_num > 0: raise GfptarError(f'Total errors encountered: {error_num}') - for path, fattr in self.list_entries_from_all( + for id_num, gen_num, path, fattr in self.list_entries_from_all( indir_url, infodb_list=infodb_list, resolve_ugmap=True): if self.is_canceled(): break if not quiet: - print(f'{fattr.ftype} {fattr.mode:04o}' + print(f'g{gen_num}_{id_num:04} {fattr.ftype} {fattr.mode:04o}' f' {fattr.user:>10}/{fattr.group:<10}' f' {fattr.size:9d} {fattr.mtime} {path}') self.tmpdir.cleanup() @@ -4433,9 +4433,10 @@ class GfptarCommand(Command): infodb_list = self.list_infodb_files(indir_url) self.sort_tar_or_infodb_list(infodb_list) for id_num, gen_num, in_dbgz_path in infodb_list: - yield from InfoDB.list_entries_from_one( + for path, fattr in InfoDB.list_entries_from_one( in_dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, - resolve_ugmap) + resolve_ugmap): + yield id_num, gen_num, path, fattr def sort_tar_or_infodb_list(self, tar_or_infodb_list): def id_key(value): @@ -4902,6 +4903,7 @@ Limitations: Options: -t, --list=DIR list mode, list the members of + (use with --verbose to see more details) -x, --extract=DIR extract mode, extract all members or specified s from to @@ -4968,6 +4970,7 @@ Usage: {f} [options] -c [-C ] [--] ... {f} [options] -x [--] [...] {f} [options] -t + {f} [options] -t -v {f} [options] --test {f} [options] --test -C ... {f} [options] --test-long From 1f00772502c75906dd3dd3bb926a4732ebaf2872 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 28 Jul 2024 14:32:32 +0900 Subject: [PATCH 049/143] gfptar: update help --- gftool/gfptar/gfptar | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f7d461898..3e4adc844 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2589,7 +2589,7 @@ class GfptarCommand(Command): # create tar per one entry self.opt['--size'] = 0 if full: - # jobs == 0: not use multithread + # jobs == 0: not use multi-thread pattern_jobs = [0, 16] else: pattern_jobs = [] # skip @@ -3052,7 +3052,7 @@ class GfptarCommand(Command): exclude_list=exclude_list) def MT_enabled(self): - # jobs == 0: not use multithread + # jobs == 0: not use multi-thread return self.jobs >= 1 # lock required @@ -4863,23 +4863,22 @@ Example of --create (Gfarm to Gfarm): ... 
gfarm:/home/user1/out/0010_dir_test9000.data..dir_test9999.data.tar.gz gfarm:/home/user1/out/g1_0010_gfptar.db.gz - -Contents of gMM_NN_gfptar.db.gz file (sqlite3 and gzip): - MM: the generation number for each append operation - NN: the serial number - table 'path_entry': map of path name to JSON string - json.dumps([ int(file_mode), int(mtime), - int(user_unique_id), int(group_unique_id), - int(size), symlink_path, file_type(D,F,S) ] - file_type 'D': directory - file_type 'F': file - file_type 'S': symbolic link - table 'user_map' : map of unique id (not uid) to user name - table 'group_map': map of unique id (not gid) to group name + Contents of gMM_NN_gfptar.db.gz file (sqlite3 and gzip): + MM: the generation number for each append operation + NN: the serial number + table 'path_entry': map of path name to JSON string + json.dumps([ int(file_mode), int(mtime), + int(user_unique_id), int(group_unique_id), + int(size), symlink_path, file_type(D,F,S) ] + file_type 'D': directory + file_type 'F': file + file_type 'S': symbolic link + table 'user_map' : map of unique id (not uid) to user name + table 'group_map': map of unique id (not gid) to group name Example of --extract (Gfarm to Local): Command line: - gfptar -x /home/user1/out2 gfarm:/home/user1/out + gfptar -x /home/user1/out2 gfarm:/home/user1/gfptar-dir Output files: /home/user1/out2/dir/test0000.data ... From 2977131416cca339c5f3555a277e398d2b692999 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 28 Jul 2024 15:01:59 +0900 Subject: [PATCH 050/143] gfptar: new option: --progress-interval --- gftool/gfptar/gfptar | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 3e4adc844..144bc0405 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2410,7 +2410,6 @@ class GfptarCommand(Command): else: self.max_workers = 1 self.bufsize = self.opt['--bufsize'] - self.progress_enabled = self._progress_enabled() self.use_fsync = not self.opt['--disable-fsync'] self.sync_infodb = self.opt['--sync-db'] @@ -2424,6 +2423,8 @@ class GfptarCommand(Command): dir=self.workdir) self.sig_init() # to clean tmpdir + self.progress_enabled = self._progress_enabled() + self.progress_interval = self.opt['--progress-interval'] progress_unit_type = self.opt['--progress-unit'] if progress_unit_type == 'si': self._humanize = self._humanize_si @@ -3192,7 +3193,7 @@ class GfptarCommand(Command): self.suffix = '.tar.' 
+ self.compress_type self.start_time = time.time() - self.next_time = self.start_time + 1 + self.next_time = self.start_time + self.progress_interval self.archived_size = 0 self.stored_size = 0 @@ -3303,7 +3304,7 @@ class GfptarCommand(Command): if serial == 1 and self.progress_enabled: now = time.time() if now >= self.next_time: - self.next_time = now + 1 + self.next_time = now + self.progress_interval self.progress_for_create(now) if has_error is not None: @@ -3627,7 +3628,7 @@ class GfptarCommand(Command): self.stored_num += 1 now = time.time() if now >= self.next_time: - self.next_time = now + 1 + self.next_time = now + self.progress_interval self.progress_for_create(now) elif result[0] == 'DONE': (status, tar_size, cannot_be_archived, @@ -3869,7 +3870,7 @@ class GfptarCommand(Command): self.total_num = 0 self.start_time = time.time() - self.next_time = self.start_time + 1 + self.next_time = self.start_time + self.progress_interval self.extract_main() tmpdir.cleanup() @@ -3995,7 +3996,7 @@ class GfptarCommand(Command): if self.progress_enabled: now = time.time() if now >= self.next_time: - self.next_time = now + 1 + self.next_time = now + self.progress_interval self.progress_for_schedule(now) if self.progress_enabled: self.progress_for_schedule(time.time()) @@ -4023,7 +4024,7 @@ class GfptarCommand(Command): self.extracted_num = 0 self.extracted_size = 0 self.start_time = time.time() - self.next_time = self.start_time + 1 + self.next_time = self.start_time + self.progress_interval self.gfsched_lock = None self.gfsched_next = 0 @@ -4261,7 +4262,7 @@ class GfptarCommand(Command): self.extracted_size += tarinfo.size now = time.time() if now >= self.next_time: - self.next_time = now + 1 + self.next_time = now + self.progress_interval self.progress_for_extract(now) tar.close() @@ -4434,8 +4435,8 @@ class GfptarCommand(Command): self.sort_tar_or_infodb_list(infodb_list) for id_num, gen_num, in_dbgz_path in infodb_list: for path, fattr in InfoDB.list_entries_from_one( - in_dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, - resolve_ugmap): + in_dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, + resolve_ugmap): yield id_num, gen_num, path, fattr def sort_tar_or_infodb_list(self, tar_or_infodb_list): @@ -4934,6 +4935,7 @@ Options: --disable-fsync disable calling fsync() before close() --gfsched-interval=SEC interval of updating candidate hosts to write (for Gfarm URL only) [default: 120] + --progress-interval=SEC interval of updating progress [default: 1.0] --encoding=CODEC codec for filename encoding (https://docs.python.org/3/library/codecs.html#standard-encodings) [default: utf-8] @@ -4999,6 +5001,7 @@ _schema = Schema({ '--same-owner': bool, '--workdir': Or(str, None), '--max-entries-per-tar': Use(unhumanize_number), + '--progress-interval': Use(float), '--progress-unit': str, '--memory': Or(Use(unhumanize_number), None), '--test': bool, From 1f69a72dc074809bbef7e9ffc6b9c765543dec0e Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 28 Jul 2024 19:38:09 +0900 Subject: [PATCH 051/143] gfptar --verbose --list: print missing linkname of symlink gfptar --verbose --list: change the format of mtime gfptar: clarify --- gftool/gfptar/gfptar | 302 +++++++++++++++++++++---------------------- 1 file changed, 150 insertions(+), 152 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 144bc0405..e5e06e0c9 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -48,6 +48,7 @@ import multiprocessing import queue import inspect from collections import 
defaultdict +import datetime from docopt import docopt from schema import Schema, Use, Or @@ -198,7 +199,7 @@ class StrObj(DBObj): class FileAttr1(DBObj): def __init__(self, mode, mtime, user, group): - self.mode = mode + self.mode = mode & 0o7777 self.mtime = mtime self.user = user self.group = group @@ -216,7 +217,7 @@ class FileAttr1(DBObj): class FileAttr2(DBObj): def __init__(self, mode, mtime, user, group, size, linkname, ftype): - self.mode = mode + self.mode = mode & 0o7777 self.mtime = mtime self.user = user self.group = group @@ -856,7 +857,7 @@ class GfURLEntry(DBObj): else: logger.warning('unsupported type: %s, %s', path, self.file_type) return None - tarinfo.mode = self.mode + tarinfo.mode = self.mode & 0o7777 tarinfo.mtime = self.mtime tarinfo.size = self.size tarinfo.linkname = self.linkname @@ -2400,6 +2401,7 @@ class GfptarCommand(Command): if self.quiet: GfURL.shutup_stderr() self.tmpdir = None + self.test_mode = False def options_init(self): set_encoding(self.opt['--encoding']) @@ -2551,6 +2553,7 @@ class GfptarCommand(Command): self.pid = os.getpid() out = gfwhoami() self.gfarm_user = out.strip() + self.test_mode = True def test_main_short(self): self.test_init() @@ -3903,44 +3906,16 @@ class GfptarCommand(Command): path = path.lstrip('/') member_check_dict[path] = False # initialize - tar_list, infodb_list = self.list_tar_and_infodb_files(self.indir_url) - infodb_list2 = self.check_and_sync_infodb(self.indir_url, - tar_list=tar_list) - if infodb_list2 is not None: - infodb_list = infodb_list2 # use new list - self.sort_tar_or_infodb_list(infodb_list) - - infodb_dict = {} - for id_num, gen_num, infodb_path in infodb_list: - infodb_dict[id_num] = infodb_path - error_num = 0 - tar_dict = {} - for id_num, gen_num, tar_path in tar_list: - tar_dict[id_num] = tar_path - infodb = infodb_dict.get(id_num, None) - if infodb is None: - logger.error(f'lost *_info.db.gz tied to {tar_path}') - error_num += 1 - del infodb_dict - - if self.debug: - infodb_list3 = self.list_infodb_files(self.indir_url) - self.sort_tar_or_infodb_list(infodb_list3) - if infodb_list != infodb_list3: - raise AssertionError(f'infodb_list{str(infodb_list)} != ' - f'infodb_list3{str(infodb_list3)}') - - for id_num, gen_num, in_dbgz_path in infodb_list: + for id_num, gen_num, dbgz_path, tar_path in \ + self.list_infodb_files(self.indir_url): if self.is_canceled(): raise self.error_canceled() - tar_path = tar_dict.get(id_num, None) - if tar_path is None: - logger.error(f'lost *.tar.* tied to {in_dbgz_path}') + if dbgz_path is None or tar_path is None: error_num += 1 continue for path, fattr in InfoDB.list_entries_from_one( - in_dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, + dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, resolve_ugmap=False): if self.is_canceled(): raise self.error_canceled() @@ -4374,153 +4349,186 @@ class GfptarCommand(Command): f'{ent_per_sec_str}Ent/s') def cmd_list_simple(self, indir, quiet=False): - self.options_init() - indir_url = GfURL.init(indir) - infodb_list = self.check_and_sync_infodb(indir_url) - for id_num, gen_num, path, fattr in self.list_entries_from_all( - indir_url, infodb_list=infodb_list, resolve_ugmap=True): - if self.is_canceled(): - break - if not quiet: - print(f'{fattr.ftype} {path}') - self.tmpdir.cleanup() + self.cmd_list(indir, quiet=quiet, verbose=False) def cmd_list_verbose(self, indir, quiet=False): + self.cmd_list(indir, quiet=quiet, verbose=True) + + def cmd_list(self, indir, quiet=False, verbose=True): self.options_init() indir_url = 
GfURL.init(indir) - # SEE ALSO: cmd_extract() - tar_list, infodb_list = self.list_tar_and_infodb_files(indir_url) - infodb_list2 = self.check_and_sync_infodb(indir_url, - tar_list=tar_list) - if infodb_list2 is not None: - infodb_list = infodb_list2 # use new list - - infodb_dict = {} - for id_num, gen_num, infodb_path in infodb_list: - infodb_dict[id_num] = infodb_path - - error_num = 0 - tar_dict = {} - for id_num, gen_num, tar_path in tar_list: - tar_dict[id_num] = tar_path - infodb = infodb_dict.get(id_num, None) - if infodb is None: - logger.error(f'lost *_info.db.gz tied to {tar_path}') - error_num += 1 - - for id_num, gen_num, in_dbgz_path in infodb_list: - tar_path = tar_dict.get(id_num, None) - if tar_path is None: - logger.error(f'lost *.tar.* tied to {in_dbgz_path}') - error_num += 1 + def print_quiet(id_num, gen_num, path, fattr): + pass - if error_num > 0: - raise GfptarError(f'Total errors encountered: {error_num}') + def print_verbose(id_num, gen_num, path, fattr): + name = path + if fattr.ftype == InfoDB.TYPE_DIR: + name = name + '/' + elif fattr.ftype == InfoDB.TYPE_SYMLINK: + name = name + ' -> ' + fattr.linkname + dt_object = datetime.datetime.fromtimestamp(fattr.mtime) + mtime_str = dt_object.strftime('%Y-%m-%d %H:%M') + print(f'g{gen_num}_{id_num:04} {fattr.ftype} {fattr.mode:04o}' + f' {fattr.user:>8}/{fattr.group:<8}' + f' {fattr.size:9d} {mtime_str} {name}') + + def print_simple(id_num, gen_num, path, fattr): + name = path + if fattr.ftype == InfoDB.TYPE_DIR: + name = name + '/' + print(f'{fattr.ftype} {name}') + + if quiet: + print_func = print_quiet + elif verbose: + print_func = print_verbose + else: + print_func = print_simple for id_num, gen_num, path, fattr in self.list_entries_from_all( - indir_url, infodb_list=infodb_list, resolve_ugmap=True): + indir_url, resolve_ugmap=True): if self.is_canceled(): break - if not quiet: - print(f'g{gen_num}_{id_num:04} {fattr.ftype} {fattr.mode:04o}' - f' {fattr.user:>10}/{fattr.group:<10}' - f' {fattr.size:9d} {fattr.mtime} {path}') + print_func(id_num, gen_num, path, fattr) self.tmpdir.cleanup() - def list_entries_from_all(self, indir_url, infodb_list=None, - resolve_ugmap=False): - if infodb_list is None: - infodb_list = self.list_infodb_files(indir_url) - self.sort_tar_or_infodb_list(infodb_list) - for id_num, gen_num, in_dbgz_path in infodb_list: + def list_entries_from_all(self, indir_url, resolve_ugmap=False): + # SEE ALSO: extract_schedule_*() + infodb_list = self.list_infodb_files(indir_url) + error_num = 0 + for id_num, gen_num, dbgz_path, tar_path in infodb_list: + if dbgz_path is None: + error_num += 1 + continue + if tar_path is None: + error_num += 1 + # FALLTHROUGH for path, fattr in InfoDB.list_entries_from_one( - in_dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, + dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, resolve_ugmap): yield id_num, gen_num, path, fattr + if error_num > 0: + raise GfptarError(f'Total errors encountered: {error_num}') - def sort_tar_or_infodb_list(self, tar_or_infodb_list): + def sort_infodb_list(self, tar_or_infodb_list): def id_key(value): - id_num, gen_num, path = value - return id_num + return value[0] # id_num tar_or_infodb_list.sort(key=id_key) return tar_or_infodb_list - def list_infodb_files(self, indir_url): - db_gen_pattern = re.compile(InfoDB.PATT_INFODB) - infodb_list = [] - for ent in indir_url.listdir(recursive=False): - path = ent.path # fullpath when ent is Gfarm - base = os.path.basename(path) - g_match = db_gen_pattern.match(base) - if g_match: - 
gen_num = g_match.group(1) - id_num = g_match.group(2) - # int("0001") -> 1 - infodb_list.append((int(id_num), int(gen_num), path)) - return infodb_list + def list_infodb_files(self, indir_url, sort=True): + tar_list, infodb_list = self.list_tar_infodb_files(indir_url) + if self.sync_infodb: + infodb_list1 = self.create_infodb_files(indir_url, tar_list) + if self.test_mode: + self.sort_infodb_list(infodb_list) + self.sort_infodb_list(infodb_list1) + if infodb_list != infodb_list1: + raise AssertionError(f'infodb_list{str(infodb_list)} != ' + f'infodb_list1{str(infodb_list1)}') + for id_num, gen_num, infodb_path in infodb_list: + u = GfURL.init(infodb_path) + if not u.exists(): + raise AssertionError(f'{infodb_path} does not exist' + ' even after create_infodb_files') + del infodb_list + infodb_list = infodb_list1 - def list_tar_and_infodb_files(self, indir_url): - return self.list_tar_files(indir_url, infodb=True) + infodb_dict = {} + for id_num, gen_num, infodb_path in infodb_list: + infodb_dict[id_num] = (gen_num, infodb_path) - def list_tar_files(self, indir_url, infodb=False): - # *.tar or *.tar.* - PATT_TAR = r'^.*\.tar(\.\w{1,5})?$' + tar_dict = {} + for id_num, gen_num, tar_path in tar_list: + tar_dict[id_num] = (gen_num, tar_path) + infodb = infodb_dict.get(id_num, None) + if infodb is None: + fname = InfoDB.infodb_filename(gen_num, id_num) + logger.error(f'lost {fname} tied to {tar_path}.' + ' --sync-db will recreate the file.') + infodb_dict[id_num] = (gen_num, None) + + for id_num, gen_num, infodb_path in infodb_list: + tar = tar_dict.get(id_num, None) + if tar is None: + logger.error(f'lost *.tar.* tied to {infodb_path}') + + infodb_tar_list = [] + for id_num, gen_path in sorted(infodb_dict.items()): + gen_num, infodb_path = gen_path + tar = tar_dict.get(id_num, None) + if tar is not None: + gen_num2, tar_path = tar + else: + tar_path = None + infodb_tar_list.append((id_num, gen_num, infodb_path, tar_path)) + + if sort: + self.sort_infodb_list(infodb_tar_list) + return infodb_tar_list + + # def list_only_infodb_files(self, indir_url): + # db_gen_pattern = re.compile(InfoDB.PATT_INFODB) + # infodb_list = [] + # for ent in indir_url.listdir(recursive=False): + # path = ent.path # fullpath when ent is Gfarm + # base = os.path.basename(path) + # g_match = db_gen_pattern.match(base) + # if g_match: + # gen_num = g_match.group(1) + # id_num = g_match.group(2) + # # int("0001") -> 1 + # infodb_list.append((int(id_num), int(gen_num), path)) + # return infodb_list + + def list_tar_infodb_files(self, indir_url, check=True, sync=False): + PATT_TAR = r'^.*\.tar(\.\w{1,5})?$' # *.tar or *.tar.* PATT_TAR_GEN1 = r'^(\d+)_.+' - PATT_TAR_GEN = r'^g(\d+)_(\d+)_.+' + PATT_TAR_GEN_ALL = r'^g(\d+)_(\d+)_.+' tar_pattern = re.compile(PATT_TAR) tar_gen1_pattern = re.compile(PATT_TAR_GEN1) - # divide by generations : gN_* (N >= 2) - tar_gen_pattern = re.compile(PATT_TAR_GEN) + tar_gen_all_pattern = re.compile(PATT_TAR_GEN_ALL) + infodb_pattern = re.compile(InfoDB.PATT_INFODB) tar_list = [] - if infodb: - db_gen_pattern = re.compile(InfoDB.PATT_INFODB) - infodb_list = [] + infodb_list = [] for ent in indir_url.listdir(recursive=False): path = ent.path base = os.path.basename(path) - if infodb: - db_match = db_gen_pattern.match(base) - if db_match: - gen_num = db_match.group(1) - id_num = db_match.group(2) - # int("0001") -> 1 - infodb_list.append((int(id_num), int(gen_num), path)) - continue + db_match = infodb_pattern.match(base) + if db_match: + gen_num = db_match.group(1) + id_num = db_match.group(2) 
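                # A doctest-style sketch (illustration only) of how this
                # pattern -- the same regexp as InfoDB.PATT_INFODB --
                # decomposes the example name shown at infodb_filename(),
                # mirroring the (id_num, gen_num, path) tuples built here:
                #   >>> import re
                #   >>> m = re.match(r'^g(\d+)_(\d+)_gfptar.db.gz$',
                #   ...              'g2_0099_gfptar.db.gz')
                #   >>> (int(m.group(2)), int(m.group(1)))
                #   (99, 2)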
+ # int("0001") -> 1 + infodb_list.append((int(id_num), int(gen_num), path)) + continue if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst, *.tar.lst continue if not tar_pattern.match(base): # ignore not *.tar.* continue - g1_match = tar_gen1_pattern.match(base) - if g1_match: + t1_match = tar_gen1_pattern.match(base) + if t1_match: # generation number = 1 gen_num = '1' # ex. 0001 (str) - id_num = g1_match.group(1) + id_num = t1_match.group(1) else: - g_match = tar_gen_pattern.match(base) - if g_match: - gen_num = g_match.group(1) + t_match = tar_gen_all_pattern.match(base) + if t_match: + gen_num = t_match.group(1) # ex. 0001 (str) - id_num = g_match.group(2) + id_num = t_match.group(2) else: # ignore irrelevant file continue + # int("0001") -> 1 tar_list.append((int(id_num), int(gen_num), path)) - if infodb: - return tar_list, infodb_list - else: - return tar_list + return tar_list, infodb_list - def check_and_sync_infodb(self, indir_url, tar_list=None): - if not self.sync_infodb: - logger.debug('check_and_sync_infodb: disable') - return None - logger.debug('check_and_sync_infodb: enable') - if tar_list is None: - tar_list = self.list_tar_files(indir_url) - update = True # always True + def create_infodb_files(self, indir_url, tar_list): + logger.debug('create_infodb_files') + overwrite = True # always True self.total_tar_num = 0 self.current_tar_num = 0 @@ -4566,7 +4574,7 @@ class GfptarCommand(Command): arglist = [] for id_num, tar_path in tarlist: arglist.append((lock, share_cancel, self.tmpdir, - update, gen_num, id_num, tar_path, + overwrite, gen_num, id_num, tar_path, self.bufsize, self.use_fsync)) # InfoDB.generate_one is staticmethod, # because ProcessPoolExecutor cannot serialize @@ -4735,7 +4743,7 @@ class InfoDB: @staticmethod def generate_one(args): - (lock, share_cancel, tmpdir, update, gen_num, id_num, + (lock, share_cancel, tmpdir, overwrite, gen_num, id_num, tar_path, bufsize, use_fsync) = args if share_cancel.value != 0: # logger.debug('Canceled (2)') @@ -4745,8 +4753,8 @@ class InfoDB: db_name = InfoDB.infodb_filename(gen_num, id_num) db_path = indir_url.url_join(db_name) db_url = GfURL.init(db_path) - if not update and db_url.exists(): - logger.debug(f'not update: {db_path}') + if not overwrite and db_url.exists(): + logger.debug(f'not overwrite: {db_path}') return 0, 0 InfoDB.signal_init(share_cancel) @@ -4780,16 +4788,6 @@ class InfoDB: t = None if t is None: break - # name = t.name - # if t.isdir(): - # name = name + '/' - # elif t.issym(): - # name = name + ' -> ' + t.linkname - # info = (f'{gen_num}:{id_num} {t.mode:04o}' - # f' {t.uname:>10}/{t.gname:<10}' - # f' {t.size:9d} {t.mtime} {name}') - # logger.debug(info) - # logger.debug(f'add to DB: {t.name}') infodb.add(t) num += 1 size += t.size From d26917f024f87fb78e975c2e3bdaa328db9671bc Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 28 Jul 2024 22:19:19 +0900 Subject: [PATCH 052/143] gfptar --create: allow members named gfarm:* ex. 
-c outdir -C gfarm:/tmp gfarm:abc (for gfarm:/tmp/gfarm:abc) --- gftool/gfptar/gfptar | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index e5e06e0c9..6ba0020b7 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -938,10 +938,13 @@ class GfURL(metaclass=abc.ABCMeta): @classmethod def parse(cls, url): - supported_classes = [GfURLGfarm] + if cls == GfURL: + supported_classes = [GfURLGfarm] + else: + supported_classes = [cls] for c in supported_classes: - if url.startswith(f'{c.SCHEME}:'): + if c.SCHEME_LEN > 0 and url.startswith(f'{c.SCHEME}:'): # gfarm://host/path -> //host/path hostpath = url[(c.SCHEME_LEN+1):] scheme = c.SCHEME @@ -2253,6 +2256,8 @@ class TestGfptar(unittest.TestCase): (None, None, '/abc/#def:/')) self.assertEqual(GfURL.parse('abc:/def://'), (None, None, 'abc:/def://')) + self.assertEqual(GfURLLocal.parse('gfarm:abc:/def://'), + (None, None, 'gfarm:abc:/def://')) def test_DBDict(self): euid = os.geteuid() @@ -2557,13 +2562,14 @@ class GfptarCommand(Command): def test_main_short(self): self.test_init() + self.test_member() self.test_opt_pattern(full=False) self.test_specified_dir() def test_main_long(self): self.test_init() self.test_unit() - self.test_invalid('url', 'gfarm:/tmp', 'dst', True) + # self.test_invalid('url', 'gfarm:/tmp', 'dst', False) # allowed self.test_invalid('dot1', '.', 'dst', True) self.test_invalid('dot2', '', 'dst', False) self.test_invalid('dot3', './', 'dst', False) @@ -2673,7 +2679,7 @@ class GfptarCommand(Command): ok = False try: self.cmd_create(test1_dir, workdir, [testsrc_name]) - except Exception as e: + except GfException as e: self.print_trace(e) if str(e).startswith('specifying '): ok = True @@ -2849,8 +2855,9 @@ class GfptarCommand(Command): test6_dir_local = workdir_local_url.url_join(test6_name) # pick files as members (SEE ALSO: test_prepare_srcdir) - members = ['file1', 'dir1'] - files = ['file1', 'dir1/readonly/file#2'] + # 'gfarm:abc' is a filename (not Gfarm URL) + members = ['gfarm:abc', 'dir1'] + files = ['gfarm:abc', 'dir1/readonly/file#2'] # Gfarm -> Gfarm(tar) self.cmd_create(test1_dir_gfarm, srcdir_gfarm, members) @@ -2908,7 +2915,8 @@ class GfptarCommand(Command): L = 'hardlink' longname = ('0123456789' * 30)[:255] tree = [ - (F, 'file1', 0o664, 1234567890, None), + # (F, 'file1', 0o664, 1234567890, None), + (F, 'gfarm:abc', 0o644, 1234567890, None), # not Gfarm URL (D, 'dir1', 0o715, 2234567890, None), ] tree_readonly = [ @@ -2923,7 +2931,7 @@ class GfptarCommand(Command): tree_link = [ (D, 'dir1/ディレクトリ 2', 0o755, 5234567890, None), (L, 'dir1/ディレクトリ 2/hardlink1', - 0o400, 6234567890, 'file1'), + 0o400, 6234567890, 'gfarm:abc'), (S, 'dir1/ディレクトリ 2/symlink1', 0o777, 7234567890, 'hardlink1'), ] @@ -3220,10 +3228,12 @@ class GfptarCommand(Command): infiles_checked = [] for infile in infiles: - infile_url = GfURL.init(infile) - if not infile_url.is_local(): - raise GfException('specifying a relative path is required ' - 'instead of a URL: ' + infile) + # infile_url = GfURL.init(infile) + # if not infile_url.is_local(): + # raise GfException('specifying a relative path is required ' + # 'instead of a URL: ' + infile) + # NOTE: allow members named gfarm:* (ex. gfarm:/tmp/gfarm:abc) + infile_url = GfURL.init(infile, local=True) infile = infile_url.path # normalize and ignore scheme # normalized: ex. 
.///abc -> ./abc infile = infile.lstrip('/') # relative path only From ce91b06255873fc6e37dc8fe9b22d943888479ea Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 28 Jul 2024 22:33:26 +0900 Subject: [PATCH 053/143] gfptar: remove unnecessary temporary file --- gftool/gfptar/gfptar | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 6ba0020b7..eb3065701 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4851,6 +4851,8 @@ class InfoDB: finally: with ignore_exception(has_error): db.close() + with ignore_exception(has_error): + db.unlink() progname = os.path.basename(__file__) From 51a934d91a7534881c7e22e069485dca74346515 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 29 Jul 2024 21:44:08 +0900 Subject: [PATCH 054/143] gfptar --create: improve performance --- gftool/gfptar/gfptar | 178 ++++++++++++++++++++++++++----------------- 1 file changed, 106 insertions(+), 72 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index eb3065701..a82d4835c 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -129,6 +129,10 @@ def unhumanize_number(numstr, binary_prefix=False): class DB: def __init__(self, filename, check_same_thread=True): self.filename = filename + # NOTE: check_same_thread=True: ignore the following error: + # SQLite objects created in a thread can only be used + # in that same thread. + # The object was created in thread id MM and this is thread id NN. self.con = sqlite3.connect(filename, check_same_thread=check_same_thread) self.con.execute('PRAGMA synchronous = OFF') @@ -771,14 +775,6 @@ class GfURLEntry(DBObj): TYPE_SYMLINK = 'S' TYPE_OTHER = '?' - type_map = { - TYPE_FILE: 1, - TYPE_DIR: 2, - TYPE_SYMLINK: 3, - TYPE_OTHER: 4, - } - type_map_reverse = {v: k for k, v in type_map.items()} - def __init__(self, path, mode, file_type, uname, gname, size, mtime, linkname): self.path = path @@ -804,10 +800,9 @@ class GfURLEntry(DBObj): # only path must be specified for key when using DBDict @classmethod def dumps(cls, obj, for_dict): - t = obj.type_map[obj.file_type] # serialize using list() to reduce size # [0]...[6] - array = [obj.mode, t, obj.uname, obj.gname, + array = [obj.mode, obj.file_type, obj.uname, obj.gname, obj.size, obj.mtime, obj.linkname] # save path to key when using dict, so don't save path to value if not for_dict: @@ -821,8 +816,7 @@ class GfURLEntry(DBObj): path = key else: path = o[7] - t = cls.type_map_reverse[o[1]] - return cls(path, o[0], t, o[2], o[3], o[4], o[5], o[6]) + return cls(path, o[0], o[1], o[2], o[3], o[4], o[5], o[6]) def url(self): return GfURL.init(self.path) @@ -3113,8 +3107,6 @@ class GfptarCommand(Command): def rand_dir(first_dir_index): first_dir = f'dir{first_dir_index:04d}' - dir_list = generate_random_dirname() - dir_depth = len(dir_list) mode = 0o755 file_type = GfURLEntry.TYPE_DIR @@ -3128,6 +3120,8 @@ class GfptarCommand(Command): # path = os.path.join(base_dir, first_dir) # yield GfURLEntry(path, mode, file_type, uname, gname, # size, mtime, linkname) + dir_list = generate_random_dirname() + dir_depth = len(dir_list) for i in range(dir_depth): parent = dir_list[:(i+1)] dir_path = os.path.join(*parent) @@ -3144,7 +3138,10 @@ class GfptarCommand(Command): file_type = GfURLEntry.TYPE_FILE uname = 'testuser1' gname = 'testgroup1' - size = random.randint(size_min, size_max) + if size_max > 0: + size = random.randint(size_min, size_max) + else: + size = 0 mtime = now linkname = '' return GfURLEntry(path, mode, file_type, uname, gname, 
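A minimal, self-contained sketch of the positional-list round trip used by the
reworked GfURLEntry.dumps()/loads() earlier in this patch (the Entry namedtuple
and the json encoding below are stand-ins for the real GfURLEntry/DBObj
plumbing; the point is only the field order and the path-in-key convention):

    import json
    from collections import namedtuple

    # stand-in for GfURLEntry (illustrative field set only)
    Entry = namedtuple('Entry',
                       'path mode file_type uname gname size mtime linkname')

    def dumps(e, for_dict):
        # positional list [mode, file_type, uname, gname, size, mtime,
        # linkname]; when stored in a dict the path is the key, so the path
        # is appended only for list-style storage
        array = [e.mode, e.file_type, e.uname, e.gname,
                 e.size, e.mtime, e.linkname]
        if not for_dict:
            array.append(e.path)
        return json.dumps(array)

    def loads(key, text, for_dict):
        o = json.loads(text)
        path = key if for_dict else o[7]
        return Entry(path, o[0], o[1], o[2], o[3], o[4], o[5], o[6])

    e = Entry('dir1/file#2', 0o644, 'F', 'user1', 'group1', 3, 1234567890, '')
    assert loads(e.path, dumps(e, True), True) == e
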
@@ -3152,19 +3149,20 @@ class GfptarCommand(Command): dir_num = int(num / files_per_dir) remainder = num % files_per_dir - for i in range(dir_num): - dir_num = 0 - for dir_ent in rand_dir(i): - dir_num += 1 + for dir_idx in range(dir_num): + dir_count = 0 + for dir_ent in rand_dir(dir_idx): + dir_count += 1 yield dir_ent - for j in range(files_per_dir - dir_num): + for j in range(files_per_dir - dir_count): yield rand_file(dir_ent.path, j) + dir_idx = dir_num if remainder > 0: - dir_num = 0 - for dir_ent in rand_dir(dir_num): - dir_num += 1 + dir_count = 0 + for dir_ent in rand_dir(dir_idx): + dir_count += 1 yield dir_ent - for j in range(remainder - dir_num): + for j in range(remainder - dir_count): yield rand_file(dir_ent.path, j) TABLE_tar_entry = 'tar_entry' @@ -3203,27 +3201,27 @@ class GfptarCommand(Command): self.split_size = self.assumed_size * 100 / self.ratio self.suffix = '.tar.' + self.compress_type - self.start_time = time.time() - self.next_time = self.start_time + self.progress_interval - self.archived_size = 0 self.stored_size = 0 self.stored_num = 0 self.total_size = 0 self.total_num = 0 + self.start_time = time.time() + self.next_time = self.start_time + self.progress_interval + self.listing = True + if self.progress_enabled: + self.progress_for_create(self.start_time) self.gfsched_lock = None self.gfsched_next = 0 self.gfsched_list = None - self.outdir_url.create_new_dir() - self.cannot_be_archived = 0 cannot_read_dir = 0 serial = 0 - self.listing = True has_error = None + self.outdir_url.create_new_dir() self.create_job_init() # before creating threads infiles_checked = [] @@ -3349,8 +3347,9 @@ class GfptarCommand(Command): if has_error is None: has_error = e - with self.lock(): # for progress - self.listing = False + if not self.is_canceled(): + with self.lock(): # for progress + self.listing = False self.create_job_final() @@ -3535,17 +3534,13 @@ class GfptarCommand(Command): self._lock.release() def clear_canceled(self): - with self.lock(): - self.canceled = False + self.canceled = False def cancel(self): - with self.lock(): - self.canceled = True + self.canceled = True def is_canceled(self): - with self.lock(): - val = self.canceled - return val + return self.canceled def _gfsched_sometimes(self, gfurl): # if gfurl.is_gfarm() is not True: @@ -3583,7 +3578,15 @@ class GfptarCommand(Command): else: raise - def create_a_tar_thread0(self, gen, serial, dbfile): + MSG_CHILD_READY = 'READY' + MSG_CHILD_PROGRESS = 'PROGRESS' + MSG_CHILD_DONE = 'DONE' + MSG_CHILD_ERROR = 'ERROR' + MSG_PARENT_START = 'START' + MSG_PARENT_CANCEL = 'CANCEL' + MSG_PARENT_ERROR_COMPLETE = 'ERROR_COMPLETE' + + def create_a_tar_thread0(self, gen, serial, arg): logger.debug(f'create_a_tar: start (gen={gen}, serial={serial})') if self.is_canceled(): logger.debug(f'Canceled (create 1): serial={serial}') @@ -3604,46 +3607,51 @@ class GfptarCommand(Command): process.start() # high cost to start started.status = True - inq.put(('START', gen, serial, dbfile)) + inq.put((self.MSG_PARENT_START, gen, serial, arg)) response = outq.get() - if response != 'READY': - logger.error(f'Unexpected child response: {response}') + if response != self.MSG_CHILD_READY: + logger.error(f'Unexpected response from child process: {response}') return try: cancel = False while True: - if self.is_canceled() and cancel is False: + if self.is_canceled() and not cancel: logger.debug(f'Canceled (create 2): serial={serial}') - inq.put('CANCEL') + inq.put(self.MSG_PARENT_CANCEL) cancel = True # prevent input_queue from filling up try: 
result = outq.get(timeout=1) except queue.Empty: + logger.debug('MSG_CHILD_ timeout, retry...') result = None if not process.is_alive(): logger.warning('Unexpected child process termination' f' (serial={serial})') break if result is None: - logger.debug('waiting for message from child:' + logger.debug('waiting for message from child process:' f' serial={serial}') continue if len(result) == 0: - logger.warning('unknown result (None)') + logger.warning('unknown result (None) from child process') break - elif result[0] == 'ADD': - status, subpath, size_all = result + msg = result[0] + if msg == self.MSG_CHILD_PROGRESS: + logger.debug('MSG_CHILD_PROGRESS') + status, num, size, path_list = result with self.lock(): - self.info('stored: {}', subpath) - self.stored_size += size_all + for entry_path in path_list: + self.info('archived: {}', entry_path) + self.stored_size += size + self.stored_num += num if self.progress_enabled: - self.stored_num += 1 now = time.time() if now >= self.next_time: - self.next_time = now + self.progress_interval + self.next_time = now \ + + self.progress_interval self.progress_for_create(now) - elif result[0] == 'DONE': + elif msg == self.MSG_CHILD_DONE: (status, tar_size, cannot_be_archived, out_tar_path, out_dbgz_path) = result with self.lock(): @@ -3651,15 +3659,16 @@ class GfptarCommand(Command): self.info('created(.tar): {}', out_tar_path) self.info('created(.db.gz): {}', out_dbgz_path) break - elif result[0] == 'ERR': + elif msg == self.MSG_CHILD_ERROR: (status, exc_type_name, exc_value_str, exc_traceback_str) = result - inq.put('ERR_COMPLETE') + inq.put(self.MSG_PARENT_ERROR_COMPLETE) raise Exception( f'{exc_type_name}: {exc_value_str}\n' f'{"".join(exc_traceback_str)}') else: - logger.warning(f'unknown result: {str(result)}') + logger.error('unknown message from child process:' + f' {str(result)}') break finally: logger.debug(f'(parent) subprocess finished: serial={serial}') @@ -3678,10 +3687,10 @@ class GfptarCommand(Command): def create_a_tar_process0(self, input_queue, output_queue): request = input_queue.get() - if len(request) > 0 and request[0] == 'START': - output_queue.put('READY') + if len(request) > 0 and request[0] == self.MSG_PARENT_START: + output_queue.put(self.MSG_CHILD_READY) else: - logger.error(f'Unexpected request for child: {request}') + logger.error(f'Unexpected request from parent process: {request}') return # exit op, gen, serial, dbfile = request logger.debug(f'create_a_tar_process0: start (serial={serial})') @@ -3689,8 +3698,8 @@ class GfptarCommand(Command): result = self.create_a_tar_process1(input_queue, output_queue, gen, serial, dbfile) tar_size, cannot_be_archived, outurl, infodb_url = result - output_queue.put(('DONE', tar_size, cannot_be_archived, - outurl, infodb_url)) + output_queue.put((self.MSG_CHILD_DONE, tar_size, + cannot_be_archived, outurl, infodb_url)) logger.debug(f'subprocess exits: serial={serial}') except Exception as e: logger.debug(f'create_a_tar_process: error (serial={serial})') @@ -3700,11 +3709,11 @@ class GfptarCommand(Command): exc_value_str = str(exc_value) exc_traceback_str = traceback.format_exception( exc_type, exc_value, exc_traceback) - output_queue.put(('ERR', exc_type_name, exc_value_str, - exc_traceback_str)) + output_queue.put((self.MSG_CHILD_ERROR, + exc_type_name, exc_value_str, exc_traceback_str)) try: input_queue.get(timeout=10) - # ERR_COMPLETE + # expect: self.MSG_PARENT_ERROR_COMPLETE except queue.Empty: pass @@ -3741,7 +3750,7 @@ class GfptarCommand(Command): serial_str = 
f'{self.SERIAL_FORMAT}_' % serial if gen >= 2: - # SEE ALSO: list_infodb() + # SEE ALSO: InfoDB.infodb_filename() prefix_str = f'g{gen}_{serial_str}' else: prefix_str = serial_str @@ -3779,6 +3788,12 @@ class GfptarCommand(Command): db_close = False cancel = False has_error = False + next_time = time.time() + self.progress_interval + progress_path_list = [] + progress_path_list_max = 100000 # limiter to limit memory usage + progress_num = 0 + progress_size = 0 + report_path = (self.debug or self.verbose) try: for entry in filelist: # logger.debug(f'subprocess(serial={serial}): {entry.path}') @@ -3787,12 +3802,12 @@ class GfptarCommand(Command): qdata = input_queue.get(timeout=1) except queue.Empty: qdata = None - if qdata == 'CANCEL': - logger.debug('receive CANCEL from parent') + if qdata == self.MSG_PARENT_CANCEL: + logger.debug('receive CANCEL from parent process') cancel = True break else: - logger.error('unexpected message from parent') + logger.error('unexpected message from parent process') cancel = True break if cancel: @@ -3803,8 +3818,21 @@ class GfptarCommand(Command): # GfTarFile.METHOD_add_entry tarinfo = tar_tmp.add_entry(subpath, entry) infodb.add(tarinfo) - size_all = entry.size_all() - output_queue.put(('ADD', subpath, size_all)) + progress_num += 1 + progress_size += entry.size_all() + if report_path: + progress_path_list.append(subpath) + now = time.time() + if now >= next_time \ + or len(progress_path_list) > progress_path_list_max: + next_time = now + self.progress_interval + output_queue.put((self.MSG_CHILD_PROGRESS, + progress_num, progress_size, + progress_path_list)) + # reset + progress_path_list = [] + progress_num = 0 + progress_size = 0 except MemoryError: raise except BrokenPipeError: @@ -3813,6 +3841,11 @@ class GfptarCommand(Command): cannot_be_archived += 1 logger.warning(convert_message(e)) continue + # success + if progress_num > 0: + output_queue.put((self.MSG_CHILD_PROGRESS, + progress_num, progress_size, + progress_path_list)) infodb.commit_close() db_close = True if cancel: @@ -4010,6 +4043,8 @@ class GfptarCommand(Command): self.extracted_size = 0 self.start_time = time.time() self.next_time = self.start_time + self.progress_interval + if self.progress_enabled: + self.progress_for_extract(self._start_time) self.gfsched_lock = None self.gfsched_next = 0 @@ -4981,7 +5016,6 @@ Usage: {f} [options] -c [-C ] [--] ... {f} [options] -x [--] [...] {f} [options] -t - {f} [options] -t -v {f} [options] --test {f} [options] --test -C ... 
{f} [options] --test-long From 305b6c7edd967a0ee3fc76dd69e4622d83ac049a Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 29 Jul 2024 22:00:31 +0900 Subject: [PATCH 055/143] gfptar: sequel to 51a934d (improve performance) --- gftool/gfptar/gfptar | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index a82d4835c..20325c004 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3257,7 +3257,7 @@ class GfptarCommand(Command): hardlink_warn=self.hardlink_warn) tmpdir = self.tmpdir - tardb_prefix = os.path.join(tmpdir.name, 'list_for_create') + tardb_prefix = os.path.join(tmpdir.name, 'create') tardb_fmt = f'_{self.SERIAL_FORMAT}.db' serial = 1 gen = 1 # TODO @@ -4044,7 +4044,7 @@ class GfptarCommand(Command): self.start_time = time.time() self.next_time = self.start_time + self.progress_interval if self.progress_enabled: - self.progress_for_extract(self._start_time) + self.progress_for_extract(self.start_time) self.gfsched_lock = None self.gfsched_next = 0 From 2d27703e9dd9146e51fac5189652ea67dd106273 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 29 Jul 2024 22:19:52 +0900 Subject: [PATCH 056/143] gfptar --extract: print total_size in progress --- gftool/gfptar/gfptar | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 20325c004..b516b5c3a 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3914,10 +3914,6 @@ class GfptarCommand(Command): self.db = DB(db_file, check_same_thread=False) self.db_target = DB(db_file_target, check_same_thread=False) - self.total_num = 0 - self.start_time = time.time() - self.next_time = self.start_time + self.progress_interval - self.extract_main() tmpdir.cleanup() @@ -3949,6 +3945,13 @@ class GfptarCommand(Command): path = path.lstrip('/') member_check_dict[path] = False # initialize + self.total_num = 0 + self.total_size = 0 + self.start_time = time.time() + self.next_time = self.start_time + self.progress_interval + if self.progress_enabled: + self.progress_for_schedule(self.start_time) + error_num = 0 for id_num, gen_num, dbgz_path, tar_path in \ self.list_infodb_files(self.indir_url): @@ -3963,6 +3966,7 @@ class GfptarCommand(Command): if self.is_canceled(): raise self.error_canceled() self.total_num += 1 + self.total_size += fattr.size file_type = fattr.ftype if self.search_target: logger.debug(f'archive_dict[{path}]: {file_type}') @@ -4302,12 +4306,14 @@ class GfptarCommand(Command): sec = now - self.start_time sec_str = format_seconds(sec, minhour=True) total_num_str = self._humanize(self.total_num) + total_size_str = self._humanize(self.total_size) if sec > 0: ent_per_sec = self.total_num / sec else: ent_per_sec = 0 ent_per_sec_str = self._humanize(ent_per_sec) sys.stdout.write(f'\rschedule: ' + f'{total_size_str}B ' f'{total_num_str}Ent ' f'{sec_str} ' f'{ent_per_sec_str}Ent/s') @@ -4372,9 +4378,15 @@ class GfptarCommand(Command): sec = now - self.start_time sec_str = format_seconds(sec, minhour=True) if self.total_num > 0: - percent = self.extracted_num * 100 / self.total_num + percent1 = self.extracted_num * 100 / self.total_num + else: + percent1 = 0 + if self.total_size > 0: + percent2 = self.extracted_size * 100 / self.total_size + percent = (percent1 + percent2) / 2 else: - percent = 0 + percent = percent1 + percent_str = f'{percent:.0f}' if sec > 0: bytes_per_sec = self.extracted_size / sec ent_per_sec = self.extracted_num / sec @@ -4383,11 
+4395,12 @@ class GfptarCommand(Command): ent_per_sec = 0 extracted_num_str = self._humanize(self.extracted_num) total_num_str = self._humanize(self.total_num) + total_size_str = self._humanize(self.total_size) extracted_size_str = self._humanize(self.extracted_size) bytes_per_sec_str = self._humanize(bytes_per_sec) ent_per_sec_str = self._humanize(ent_per_sec) - sys.stdout.write(f'\rextract: {percent:.0f}% ' - f'{extracted_size_str}B ' + sys.stdout.write(f'\rextract: {percent_str}% ' + f'{extracted_size_str}/{total_size_str}B ' f'{extracted_num_str}/{total_num_str}Ent ' f'{sec_str} ' f'{bytes_per_sec_str}B/s ' From 2987900781fede3b2df38236dc918d6a3a4e1e40 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 29 Jul 2024 22:32:42 +0900 Subject: [PATCH 057/143] gfptar: simplify --- gftool/gfptar/gfptar | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index b516b5c3a..9bca9a3a4 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -144,8 +144,11 @@ class DB: def close(self): self.con.close() + self.con = None def unlink(self): + if self.con is not None: + self.close() os.remove(self.filename) @@ -270,18 +273,15 @@ class DBCollection: self.create_table() def commit(self): - self.con.commit() + self.db.commit() def close(self): - self.con.close() - self.con = None + self.db.close() def filename(self): return self.db.filename def unlink(self): - if self.con is not None: - self.close() self.db.unlink() def __len__(self): From 4faed77d1f7a29126d4e8a3b6c90f9b85aee1401 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 29 Jul 2024 23:06:09 +0900 Subject: [PATCH 058/143] gfptar: clarify --- gftool/gfptar/gfptar | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 9bca9a3a4..f419627f7 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4196,6 +4196,14 @@ class GfptarCommand(Command): arch_url = GfURL.init(target, use_gfarm_command=self.use_gfarm_command) tar = GfTarFile.extract_open(arch_url, self.bufsize) + try: + self.extract_from_a_tar1(serial, target, member_set, tar) + with self.lock(): + self.info('extracted(done): {}', arch_url.url_str) + finally: + tar.close() + + def extract_from_a_tar1(self, serial, target, member_set, tar): with self.lock(): members_num = len(member_set) index = serial @@ -4289,10 +4297,6 @@ class GfptarCommand(Command): self.next_time = now + self.progress_interval self.progress_for_extract(now) - tar.close() - with self.lock(): - self.info('extracted(done): {}', arch_url.url_str) - def _humanize_si(self, n): return humanize_number(n) From 22c6e00c59ea44791e54aae3b26d1ab00be20bda Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 30 Jul 2024 12:41:30 +0900 Subject: [PATCH 059/143] gfptar: fix freeze on python 3.6 since ebc17f8 --- gftool/gfptar/gfptar | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f419627f7..145960143 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3679,7 +3679,7 @@ class GfptarCommand(Command): self.name, loglevel=self.loglevel, debug=self.debug, verbose=self.verbose) - self.sig_ignore() + self.sig_default() while True: pid = os.getpid() logger.debug(f'create_a_tar_process: start, pid={pid}') From d93f5049d2b1f8b20547ee3244415d3329eadccf Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 30 Jul 2024 12:42:23 +0900 Subject: [PATCH 060/143] gfptar --test: add test of 
--verbose --- gftool/gfptar/gfptar | 45 +++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 145960143..ee87775db 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -735,6 +735,8 @@ class Command(metaclass=abc.ABCMeta): self.name = name self._docopt = docopt(self.getDoc()) self.opt = self.getSchema().validate(self._docopt) + + def log_init(self): self.debug = self.opt['--debug'] self.verbose = self.opt['--verbose'] self.quiet = self.opt['--quiet'] @@ -749,12 +751,9 @@ class Command(metaclass=abc.ABCMeta): self.loglevel = loglevel # use stderr with lock - logger_init(name, loglevel=loglevel, debug=self.debug, + logger_init(self.name, loglevel=loglevel, debug=self.debug, verbose=self.verbose) - - logger.debug('USE_GFMKDIR_PLUS: %s', USE_GFMKDIR_PLUS) - logger.debug('USE_GFCHMOD_PLUS: %s', USE_GFCHMOD_PLUS) - logger.debug('USE_GFREG_PLUS: %s', USE_GFREG_PLUS) + logger.debug(pformat(self.opt)) @abc.abstractmethod def getDoc(self) -> str: @@ -2397,12 +2396,17 @@ class GfptarCommand(Command): self.lock_init(False) self.futures = None self.hardlink_warn = True - if self.quiet: - GfURL.shutup_stderr() self.tmpdir = None self.test_mode = False def options_init(self): + self.log_init() + if self.quiet: + GfURL.shutup_stderr() + logger.debug('USE_GFMKDIR_PLUS: %s', USE_GFMKDIR_PLUS) + logger.debug('USE_GFCHMOD_PLUS: %s', USE_GFCHMOD_PLUS) + logger.debug('USE_GFREG_PLUS: %s', USE_GFREG_PLUS) + set_encoding(self.opt['--encoding']) self.jobs = self.opt['--jobs'] # SEE ALSO: MT_enabled() @@ -2486,7 +2490,6 @@ class GfptarCommand(Command): return not self.debug and not self.verbose and not self.quiet def run(self): - logger.debug(pformat(self.opt)) try: outdir = self.opt['--create'] if outdir: @@ -2542,11 +2545,11 @@ class GfptarCommand(Command): logger.debug('DONE') def test_init(self): + self.options_init() self.am_I_gfarmroot = am_I_gfarmroot() if self.am_I_gfarmroot: logger.error('warning: gfarmroot is enabled: ' + '"noread" files are not used') - self.options_init() self.hardlink_warn = False self.uid = os.getuid() self.pid = os.getpid() @@ -2589,9 +2592,19 @@ class GfptarCommand(Command): save_opt_type = self.opt['--type'] save_opt_compress_prog = self.opt['--use-compress-program'] save_opt_syncdb = self.opt['--sync-db'] + save_opt_verbose = self.opt['--verbose'] # create tar per one entry self.opt['--size'] = 0 + + # test --verbose and --sync-db + self.opt['--verbose'] = True + self.opt['--sync-db'] = True + self.test_simple('syncdb') + self.opt['--sync-db'] = save_opt_syncdb + self.opt['--verbose'] = save_opt_verbose + + # test --jobs if full: # jobs == 0: not use multi-thread pattern_jobs = [0, 16] @@ -2605,10 +2618,7 @@ class GfptarCommand(Command): # create one tar self.opt['--size'] = unhumanize_number('100M') - self.opt['--sync-db'] = True - self.test_simple('syncdb') - self.opt['--sync-db'] = save_opt_syncdb - + # test --type if full: pattern_type = [ 'gz', @@ -2622,6 +2632,7 @@ class GfptarCommand(Command): self.test_simple('type_' + t) self.opt['--type'] = save_opt_type + # test --type and --use-compress-program if full: pattern_compress_prog = { 'gzip': 'gz', @@ -3168,8 +3179,8 @@ class GfptarCommand(Command): TABLE_tar_entry = 'tar_entry' def cmd_create(self, outdir, basedir, infiles): - logger.debug(f'create start: outdir={outdir}, basedir={basedir}') self.options_init() + logger.debug(f'create start: outdir={outdir}, basedir={basedir}') self.outdir = outdir 
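        # For example, with the --create form from the earlier change that
        # allows members named "gfarm:*":
        #   gfptar -c outdir -C gfarm:/tmp gfarm:abc
        # this method receives outdir='outdir', basedir='gfarm:/tmp' and
        # infiles=['gfarm:abc'] (a plain member name, not a Gfarm URL).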
self.outdir_url = GfURL.init(outdir) self.basedir_url = GfURL.init(basedir) @@ -3656,7 +3667,7 @@ class GfptarCommand(Command): out_tar_path, out_dbgz_path) = result with self.lock(): self.archived_size += tar_size - self.info('created(.tar): {}', out_tar_path) + self.info('created({}): {}', self.suffix, out_tar_path) self.info('created(.db.gz): {}', out_dbgz_path) break elif msg == self.MSG_CHILD_ERROR: @@ -3884,8 +3895,8 @@ class GfptarCommand(Command): return GfptarError('Not a gfptar-archived directory: ' + url_str) def cmd_extract(self, outdir, indir, specified_members): - logger.debug(f'extract start: outdir={outdir}, indir={indir}') self.options_init() + logger.debug(f'extract start: outdir={outdir}, indir={indir}') self.outdir = outdir self.outdir_url = GfURL.init(outdir) self.indir = indir @@ -4199,7 +4210,7 @@ class GfptarCommand(Command): try: self.extract_from_a_tar1(serial, target, member_set, tar) with self.lock(): - self.info('extracted(done): {}', arch_url.url_str) + self.info('extract,DONE: {}', arch_url.url_str) finally: tar.close() From 3971c2c05080fd8e6dd73b92dd6953866b366491 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 30 Jul 2024 14:46:08 +0900 Subject: [PATCH 061/143] gfptar: clarify --- gftool/gfptar/gfptar | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index ee87775db..5a0c57cde 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2507,7 +2507,7 @@ class GfptarCommand(Command): indir = self.opt['--list'] if indir: - if self.verbose: + if self.opt['--verbose']: self.cmd_list_verbose(indir) else: self.cmd_list_simple(indir) @@ -4429,6 +4429,8 @@ class GfptarCommand(Command): def cmd_list(self, indir, quiet=False, verbose=True): self.options_init() + if self.quiet: + quiet = self.quiet indir_url = GfURL.init(indir) def print_quiet(id_num, gen_num, path, fattr): @@ -4620,15 +4622,18 @@ class GfptarCommand(Command): for sig in sigs: orig_sig_handler[sig] = signal.getsignal(sig) + self.sig_ignore() # for parent process + # Concurrent execution for each generation - with multiprocessing.Manager() as manager: + with multiprocessing.Manager() as manager: # subprocess lock = manager.Lock() share_cancel = manager.Value('i', 0) def sig_handler(signum, frame): - # logger.warning(f'Interrupt (signal={signum}') - sys.stderr.write(f'\nInterrupt (signal={signum})\n') - share_cancel.value = 1 + if share_cancel.value == 0: + # logger.warning(f'Interrupt (signal={signum}') + sys.stderr.write(f'\nInterrupt (signal={signum})\n') + share_cancel.value = 1 signal.signal(signal.SIGHUP, sig_handler) signal.signal(signal.SIGINT, sig_handler) @@ -4797,15 +4802,16 @@ class InfoDB: return db_url signal_initialized = False + signal_canceled = False @classmethod - def signal_init(cls, share_cancel): + def signal_init(cls): if not cls.signal_initialized: def sig_handler(signum, frame): - if share_cancel.value != 0: - # logger.warning(f'Interrupt (signal={signum})') - sys.stderr.write(f'\nInterrupt (signal={signum})\n') - share_cancel.value = 1 + if not cls.signal_canceled: + pid = os.getpid() + logger.info(f'Interrupt (signal={signum}) (PID={pid})') + cls.signal_canceled = True signal.signal(signal.SIGHUP, sig_handler) signal.signal(signal.SIGINT, sig_handler) @@ -4814,8 +4820,8 @@ class InfoDB: cls.signal_initialized = True logger.debug('InfoDB.signal_init') - @staticmethod - def generate_one(args): + @classmethod + def generate_one(cls, args): (lock, share_cancel, tmpdir, 
overwrite, gen_num, id_num, tar_path, bufsize, use_fsync) = args if share_cancel.value != 0: @@ -4829,7 +4835,7 @@ class InfoDB: if not overwrite and db_url.exists(): logger.debug(f'not overwrite: {db_path}') return 0, 0 - InfoDB.signal_init(share_cancel) + InfoDB.signal_init() # Local file tmpdb_path = os.path.join(tmpdir.name, str(id_num) + '_info.db') @@ -4843,6 +4849,9 @@ class InfoDB: interval = 1 # sec. | for interrupt next_check = time.time() + interval while True: + if cls.signal_canceled: + logger.info('Canceled') + break now = time.time() if now >= next_check: # access manager.Value(): very high cost From f82c91f1f62432bfbab6e0892e40e3bd97900bd7 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 30 Jul 2024 16:22:13 +0900 Subject: [PATCH 062/143] gfptar: not use f-string for logger.debug() --- gftool/gfptar/gfptar | 120 +++++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 57 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 5a0c57cde..b30c8033d 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -535,28 +535,33 @@ class GfLogger(logging.getLoggerClass()): self._mylock = threading.Lock() super().__init__(name) - def _mylog(self, level, msg, *args): + def _mylog(self, level, msg, *args, exc_info=None): frame = inspect.currentframe().f_back.f_back lineno = frame.f_lineno + if exc_info: + if isinstance(exc_info, BaseException): + exc_info = (type(exc_info), exc_info, exc_info.__traceback__) + elif not isinstance(exc_info, tuple): + exc_info = sys.exc_info() record = self.makeRecord(self.name, level, frame.f_code.co_filename, - lineno, msg, args, None) + lineno, msg, args, exc_info) with self._mylock: self.handle(record) - def debug(self, msg, *args): - self._mylog(logging.DEBUG, msg, *args) + def debug(self, msg, *args, **kwargs): + self._mylog(logging.DEBUG, msg, *args, **kwargs) - def info(self, msg, *args): - self._mylog(logging.INFO, msg, *args) + def info(self, msg, *args, **kwargs): + self._mylog(logging.INFO, msg, *args, **kwargs) def warning(self, msg, *args, **kwargs): - self._mylog(logging.WARNING, msg, *args) + self._mylog(logging.WARNING, msg, *args, **kwargs) def error(self, msg, *args, **kwargs): - self._mylog(logging.ERROR, 'Error: ' + msg, *args) + self._mylog(logging.ERROR, 'Error: ' + msg, *args, **kwargs) - def error_exit(self, exit_code, msg, *args): - self._mylog(logging.ERROR, 'Error: ' + msg, *args) + def error_exit(self, exit_code, msg, *args, **kwargs): + self._mylog(logging.ERROR, 'Error: ' + msg, *args, **kwargs) sys.exit(exit_code) # def fatal(self, msg, *args, **kwargs): @@ -876,7 +881,7 @@ class GfURLEntry(DBObj): if a != b: raise GfException( f'{ent1.path} vs {ent2.path}: prop={name}: {a} != {b}') - logger.debug(f'GfURLEntry.compare:prop={name}: PASS') + logger.debug('GfURLEntry.compare:prop=%s: PASS', name) def cmpprop(ent1, ent2, properties): for pname in properties: @@ -1962,11 +1967,11 @@ class Compress: # else: # compress_prog = None cls.compress_prog = compress_prog # may be None - logger.debug(f'compress_type={cls.compress_type}') - logger.debug(f'compress_prog={cls.compress_prog}') - logger.debug(f'gzip_prog={cls.gzip_prog}') - logger.debug(f'bzip2_prog={cls.bzip2_prog}') - logger.debug(f'xz_prog={cls.xz_prog}') + logger.debug('compress_type=%s', cls.compress_type) + logger.debug('compress_prog=%s', cls.compress_prog) + logger.debug('gzip_prog=%s', cls.gzip_prog) + logger.debug('bzip2_prog=%s', cls.bzip2_prog) + logger.debug('xz_prog=%s', cls.xz_prog) @classmethod def 
compress(cls, compress_prog, outf): @@ -2380,7 +2385,7 @@ def ignore_exception(ignore): yield except Exception as e: if ignore: - logger.debug(f'Ignored: {str(e)}') + logger.debug('Ignored', exc_info=e) return raise @@ -2912,8 +2917,8 @@ class GfptarCommand(Command): def test_prepare_srcdir(self, dir_url_str, readonly=False, noread=False, link=False, longname=False): - logger.debug(f'readonly={str(readonly)}, noread={str(noread)}, ' - f'link={str(link)}, longname={str(longname)}') + logger.debug('readonly=%s, noread=%s, link=%s, longname=%s', + readonly, noread, link, longname) F = 'file' D = 'directory' S = 'symlink' @@ -3180,7 +3185,7 @@ class GfptarCommand(Command): def cmd_create(self, outdir, basedir, infiles): self.options_init() - logger.debug(f'create start: outdir={outdir}, basedir={basedir}') + logger.debug('create start: outdir=%s, basedir=%s', outdir, basedir) self.outdir = outdir self.outdir_url = GfURL.init(outdir) self.basedir_url = GfURL.init(basedir) @@ -3280,7 +3285,7 @@ class GfptarCommand(Command): for infile in infiles_checked: if self.is_canceled(): - logger.debug(f'Canceled (listdir 1): serial={serial}') + logger.debug('Canceled (listdir 1): serial=%d', serial) break url_str = os.path.join(self.basedir_url.url_str, infile) @@ -3289,7 +3294,7 @@ class GfptarCommand(Command): try: for entry in listdir_switch(gfurl): if self.is_canceled(): - logger.debug(f'Canceled (listdir 2): serial={serial}') + logger.debug('Canceled (listdir 2): serial=%d', serial) break logger.debug('listdir: entry.path=%s', entry.path) # include length of path @@ -3432,15 +3437,15 @@ class GfptarCommand(Command): except Exception as e: # NOTE: cannot catch TimeoutError # ignore timeout - logger.debug(f'_create_job_execute_MT(): {type(e)} {str(e)}') + logger.debug('_create_job_execute_MT()', exc_info=e) if has_error is not None: raise has_error def process_close(self, process, index=None): if not process.is_alive(): return - logger.debug(f'process_close() start (index={index}):' - f' pid={process.pid}') + logger.debug('process_close() start (index=%s): pid=%s', + index, process.pid) # timeout = 10 # process.join(timeout) process.terminate() @@ -3458,12 +3463,12 @@ class GfptarCommand(Command): ok = True break except ValueError as e: - logger.debug(f'retry[{i}] process.close() (index={index}):' - f' {str(e)}') + logger.debug('retry[%d] process.close() (index=%d):', + i, index, exc_info=e) time.sleep(0.1) if not ok: process.close() - logger.debug(f'process_close() finished (index={index})') + logger.debug('process_close() finished (index=%d)', index) def create_job_final(self, timeout=None): if self.MT_enabled(): @@ -3598,9 +3603,9 @@ class GfptarCommand(Command): MSG_PARENT_ERROR_COMPLETE = 'ERROR_COMPLETE' def create_a_tar_thread0(self, gen, serial, arg): - logger.debug(f'create_a_tar: start (gen={gen}, serial={serial})') + logger.debug('create_a_tar: start (gen=%d, serial=%d)', gen, serial) if self.is_canceled(): - logger.debug(f'Canceled (create 1): serial={serial}') + logger.debug('Canceled (create 1): serial=%d', serial) return with self.lock(): @@ -3628,7 +3633,7 @@ class GfptarCommand(Command): cancel = False while True: if self.is_canceled() and not cancel: - logger.debug(f'Canceled (create 2): serial={serial}') + logger.debug('Canceled (create 2): serial=%d', serial) inq.put(self.MSG_PARENT_CANCEL) cancel = True # prevent input_queue from filling up try: @@ -3638,7 +3643,7 @@ class GfptarCommand(Command): result = None if not process.is_alive(): logger.warning('Unexpected child process 
termination' - f' (serial={serial})') + ' (serial=%d)', serial) break if result is None: logger.debug('waiting for message from child process:' @@ -3682,7 +3687,7 @@ class GfptarCommand(Command): f' {str(result)}') break finally: - logger.debug(f'(parent) subprocess finished: serial={serial}') + logger.debug('(parent) subprocess finished: serial=%d', serial) def create_a_tar_process(self, input_queue, output_queue): global logger @@ -3691,9 +3696,9 @@ class GfptarCommand(Command): debug=self.debug, verbose=self.verbose) self.sig_default() - while True: + while True: # not exit pid = os.getpid() - logger.debug(f'create_a_tar_process: start, pid={pid}') + logger.debug('create_a_tar_process: start, pid=%d', pid) self.create_a_tar_process0(input_queue, output_queue) def create_a_tar_process0(self, input_queue, output_queue): @@ -3704,16 +3709,16 @@ class GfptarCommand(Command): logger.error(f'Unexpected request from parent process: {request}') return # exit op, gen, serial, dbfile = request - logger.debug(f'create_a_tar_process0: start (serial={serial})') + logger.debug('create_a_tar_process0: start (serial=%d)', serial) try: result = self.create_a_tar_process1(input_queue, output_queue, gen, serial, dbfile) tar_size, cannot_be_archived, outurl, infodb_url = result output_queue.put((self.MSG_CHILD_DONE, tar_size, cannot_be_archived, outurl, infodb_url)) - logger.debug(f'subprocess exits: serial={serial}') + logger.debug('subprocess exits: serial=%d', serial) except Exception as e: - logger.debug(f'create_a_tar_process: error (serial={serial})') + logger.debug('create_a_tar_process: error (serial=%d)', serial) self.print_trace(e) exc_type, exc_value, exc_traceback = sys.exc_info() exc_type_name = exc_type.__name__ @@ -3730,7 +3735,7 @@ class GfptarCommand(Command): def create_a_tar_process1(self, input_queue, output_queue, gen, serial, dbfile): - logger.debug(f'create_a_tar_process1: start (serial={serial})') + logger.debug('create_a_tar_process1: start (serial=%d)', serial) tardb = DB(dbfile) filelist = DBList(tardb, GfURLEntry, self.TABLE_tar_entry) @@ -3773,7 +3778,7 @@ class GfptarCommand(Command): # use last half of name outname = outname[-(outname_max-offset):] outname_len = len(outname.encode()) - logger.debug(f'modified outname_len={outname_len}') + logger.debug('modified outname_len=%d', outname_len) # loop for multibyte characters offset += 1 # ex.: home/user1/dir -> home_user1_dir @@ -3807,7 +3812,7 @@ class GfptarCommand(Command): report_path = (self.debug or self.verbose) try: for entry in filelist: - # logger.debug(f'subprocess(serial={serial}): {entry.path}') + # logger.debug('subprocess(serial=%d): %s', serial, entry.path) while not input_queue.empty(): try: qdata = input_queue.get(timeout=1) @@ -3825,7 +3830,7 @@ class GfptarCommand(Command): break subpath = entry.subpath(self.basedir_url) try: - logger.debug(f'tar_tmp.add_entry: {subpath}') + logger.debug('tar_tmp.add_entry: %s', subpath) # GfTarFile.METHOD_add_entry tarinfo = tar_tmp.add_entry(subpath, entry) infodb.add(tarinfo) @@ -3896,7 +3901,7 @@ class GfptarCommand(Command): def cmd_extract(self, outdir, indir, specified_members): self.options_init() - logger.debug(f'extract start: outdir={outdir}, indir={indir}') + logger.debug('extract start: outdir=%s, indir=%s', outdir, indir) self.outdir = outdir self.outdir_url = GfURL.init(outdir) self.indir = indir @@ -3918,8 +3923,8 @@ class GfptarCommand(Command): tmpdir = self.tmpdir db_file = os.path.join(tmpdir.name, 'extract.db') db_file_target = os.path.join(tmpdir.name, 
'target.db') - logger.debug(f'db_file={db_file}') - logger.debug(f'db_file_target={db_file_target}') + logger.debug('db_file=%s', db_file) + logger.debug('db_file_target=%s', db_file_target) # to reduce memory usage self.db = DB(db_file, check_same_thread=False) @@ -3980,25 +3985,25 @@ class GfptarCommand(Command): self.total_size += fattr.size file_type = fattr.ftype if self.search_target: - logger.debug(f'archive_dict[{path}]: {file_type}') + logger.debug('archive_dict[%s]: %s', path, file_type) archive_dict[path] = file_type else: if file_type == InfoDB.TYPE_DIR: - logger.debug(f'directory_set.add: {path}') + logger.debug('directory_set.add: %s', path) directory_set.add(path) # all directories if self.search_target: is_target = False for member in member_check_dict.keys(): file_type = archive_dict.get(member, None) - logger.debug(f'from member_check_dict.keys: {member} ' - f'[{file_type}]') + logger.debug('from member_check_dict.keys: %s [%s]', + member, file_type) found = False if file_type is not None: - logger.debug(f'member_set.add: {member}') + logger.debug('member_set.add: %s', member) member_set.add(member) found = True if file_type == InfoDB.TYPE_DIR: - logger.debug(f'directory_set.add: {member}') + logger.debug('directory_set.add: %s', member) directory_set.add(member) is_dir = True else: @@ -4011,18 +4016,18 @@ class GfptarCommand(Command): # find {member}/* files for path, file_type in archive_dict.find_by_prefix( member + '/'): - logger.debug(f'member_set.add: {path}') + logger.debug('member_set.add: %s', path) member_set.add(path) found = True if file_type == InfoDB.TYPE_DIR: - logger.debug(f'directory_set.add: {path}') + logger.debug('directory_set.add: %s', path) directory_set.add(path) if found: member_check_dict[member] = True is_target = True archive_dict.clear() # re-use for next tar file if is_target: - logger.debug(f'target_set.add: {tar_path}') + logger.debug('target_set.add: %s', tar_path) target_set.add(tar_path) # select this tar file else: target_set.add(tar_path) # use all tar files @@ -4038,7 +4043,7 @@ class GfptarCommand(Command): self.total_num = len(member_set) # update for member, found in member_check_dict.items(): - logger.debug(f'check member_check_dict: {member}, {found}') + logger.debug('check member_check_dict: %s, %s', member, found) if not found: raise GfptarError('The specified file is not found' ' in archive files: ' + member) @@ -4227,6 +4232,7 @@ class GfptarCommand(Command): except MemoryError: raise except Exception as e: + logger.debug('tar.next()', exc_info=e) logger.warning(f'{target}: SKIPPED: invalid or empty tar: ' f' {str(e)}') tarinfo = None @@ -4779,7 +4785,7 @@ class InfoDB: # atomic operation to avoid leaving junk files dbgz_url_tmp.rename(out_dbgz_path) - logger.debug(f'created(.db.gz): {out_dbgz_path}') + logger.debug('created(.db.gz): %s', out_dbgz_path) if move: db_url.remove() @@ -4833,7 +4839,7 @@ class InfoDB: db_path = indir_url.url_join(db_name) db_url = GfURL.init(db_path) if not overwrite and db_url.exists(): - logger.debug(f'not overwrite: {db_path}') + logger.debug('not overwrite: %s', db_path) return 0, 0 InfoDB.signal_init() From bcf367aee529c3a47988bb9bcc5aef57cee817b2 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 31 Jul 2024 02:05:48 +0900 Subject: [PATCH 063/143] gfptar: new option: --dry-run --- gftool/gfptar/gfptar | 154 ++++++++++++++++++++++++++++++++----------- 1 file changed, 114 insertions(+), 40 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index b30c8033d..b9da2fcd9 
100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -911,14 +911,16 @@ class GfURL(metaclass=abc.ABCMeta): shutup_stderr() @staticmethod - def init(url, use_gfarm_command=False, local=False): + def init(url, use_gfarm_command=False, local=False, dry_run=False): if local: - return GfURLLocal(url) + return GfURLLocal(url, dry_run) if GfURLGfarm.is_my_URL(url): - return GfURLGfarm(url) - gfurl1 = GfURLLocal(url) + return GfURLGfarm(url, dry_run) + gfurl1 = GfURLLocal(url, dry_run) if not use_gfarm_command: return gfurl1 + if dry_run: # cannot resolve the Gfarm URL from gfarm2fs + return gfurl1 # use_gfarm_command=True: use gf* commands on gfarm2fs gfurl2 = gfurl1.get_gfarm_url_by_gfarm2fs() if gfurl2 is not None: @@ -927,12 +929,24 @@ class GfURL(metaclass=abc.ABCMeta): return gfurl2 return gfurl1 - def __init__(self, url): + def __init__(self, url, dry_run): self._url_str = url scheme, host, path = self.parse(url) self._scheme = scheme self._host = host self._path = path + self.dry_run = dry_run + if dry_run: + self.chmod = self.dryrun_chmod + self.chown = self.dryrun_chown + self.utime = self.dryrun_utime + self.mkdir = self.dryrun_mkdir + self.rename = self.dryrun_rename + self.remove = self.dryrun_remove + self.remove_tree = self.dryrun_remove_tree + self.symlink = self.dryrun_symlink + self.hardlink = self.dryrun_hardlink + self.writeopen = self.dryrun_writeopen @classmethod def parse(cls, url): @@ -979,7 +993,8 @@ class GfURL(metaclass=abc.ABCMeta): if self.root_url_str != '' \ and (parent_path == '.' or parent_path == '/'): parent_path = '' - return GfURL.init(self.root_url_str + parent_path) + return GfURL.init(self.root_url_str + parent_path, + dry_run=self.dry_run) @property def parent_iter(self): @@ -1027,6 +1042,8 @@ class GfURL(metaclass=abc.ABCMeta): return isinstance(self, GfURLLocal) def create_new_dir(self): + if self.dry_run: + return if self.exists(): raise FileExistsError(self.url_str) self.mkdir() @@ -1045,18 +1062,31 @@ class GfURL(metaclass=abc.ABCMeta): follow_symlinks=True): raise NotImplementedError + def dryrun_chmod(self, mode, mtime=None, user=None, group=None, + follow_symlinks=True): + pass + @abc.abstractmethod def chown(self, user, group, follow_symlinks=True): raise NotImplementedError + def dryrun_chown(self, user, group, follow_symlinks=True): + pass + @abc.abstractmethod def utime(self, atime, mtime, follow_symlinks=True): raise NotImplementedError + def dryrun_utime(self, atime, mtime, follow_symlinks=True): + pass + @abc.abstractmethod def mkdir(self, mode=0o700, parents=False): raise NotImplementedError + def dryrun_mkdir(self, mode=0o700, parents=False): + pass + def makedirs(self, mode=0o700): self.mkdir(mode, parents=True) @@ -1064,22 +1094,37 @@ class GfURL(metaclass=abc.ABCMeta): def rename(self, dest): raise NotImplementedError + def dryrun_rename(self, dest): + pass + @abc.abstractmethod def remove(self): raise NotImplementedError + def dryrun_remove(self): + pass + @abc.abstractmethod def remove_tree(self, remove_readonly=False): raise NotImplementedError + def dryrun_remove_tree(self, remove_readonly=False): + pass + @abc.abstractmethod def symlink(self, linkname): raise NotImplementedError + def dryrun_symlink(self, linkname): + pass + @abc.abstractmethod def hardlink(self, linkname): raise NotImplementedError + def dryrun_hardlink(self, linkname): + pass + @abc.abstractmethod def exists(self): raise NotImplementedError @@ -1134,6 +1179,15 @@ class GfURL(metaclass=abc.ABCMeta): user=None, group=None, use_fsync=True, hostname=None): 
raise NotImplementedError + @contextmanager + def dryrun_writeopen(self, textmode=False, mode=0o600, mtime=None, + user=None, group=None, use_fsync=True, hostname=None): + f = open(os.devnull, 'wb') + try: + yield f + finally: + f.close() + def copy_from(self, inf, bufsize, mode=0o600, mtime=None, user=None, group=None, use_fsync=True, hostname=None): readlen = 0 @@ -1279,8 +1333,8 @@ class GfURLGfarm(GfURL): r'(\d+)\s+(\S+\s+\d+\s+\d+:\d+:\d+\s+\d+)\s+(.+)$') PAT_EMPTY = re.compile(r'^\s*$') - def __init__(self, url): - super().__init__(url) + # def __init__(self, url, dry_run): + # super().__init__(url, dry_run) @classmethod def is_my_URL(cls, url): @@ -1667,9 +1721,6 @@ class GfURLLocal(GfURL): SCHEME = '' SCHEME_LEN = 0 - def __init__(self, url): - super().__init__(url) - @classmethod def is_my_URL(cls, url): return True @@ -1679,7 +1730,8 @@ class GfURLLocal(GfURL): XATTR_GFARM2FS_URL = 'gfarm2fs.url' try: val = os.getxattr(self.url_str, XATTR_GFARM2FS_URL) - gfurl = GfURL.init(val.decode(get_encoding())) + gfurl = GfURL.init(val.decode(get_encoding()), + dry_run=self.dry_run) if gfurl.is_gfarm(): return gfurl raise GfException('unexpected Gfarm URL format: ' + gfurl.url_str) @@ -1687,11 +1739,16 @@ class GfURLLocal(GfURL): if e.errno == errno.EOPNOTSUPP: return None elif e.errno == errno.ENOENT: - # creation mode + parent_path = self.parent.url_str + if parent_path == '': + # local filesystem + return None + # creation mode (parent must exist) try: - val = os.getxattr(self.parent.url_str, XATTR_GFARM2FS_URL) + val = os.getxattr(parent_path, XATTR_GFARM2FS_URL) parent_str = val.decode(get_encoding()) - gfurl = GfURL.init(os.path.join(parent_str, self.basename)) + gfurl = GfURL.init(os.path.join(parent_str, self.basename), + dry_run=self.dry_run) if gfurl.is_gfarm(): return gfurl raise GfException('unexpected Gfarm URL format: ' @@ -2063,7 +2120,7 @@ class GfTarFile(tarfile.TarFile): raise FileExistsError(gfurl.url_str) # list of tuple(proc, closeable obj, synchronizable obj) proc_list = [] - if gfurl.is_gfarm(): + if gfurl.is_gfarm() and not gfurl.dry_run: if compress_prog: gfreg_obj = gfurl.gfreg(mode=0o600, hostname=target_host) compress_proc = Compress.compress(compress_prog, @@ -2080,21 +2137,25 @@ class GfTarFile(tarfile.TarFile): fileobj=gfreg_obj.stdin, copybufsize=copybufsize) proc_list.append(tuple([gfreg_obj.proc, gfreg_obj, None])) - else: # Local - if compress_prog: + else: # Local or dry_run + if gfurl.dry_run: + outf = open(os.devnull, 'wb') + else: outf = open(gfurl.url_str, 'wb') + if compress_prog: compress_proc = Compress.compress(compress_prog, outf) tar = cls.open(None, mode=openmode, fileobj=compress_proc.stdin, copybufsize=copybufsize) proc_list.append(tuple([compress_proc, compress_proc.stdin, None])) - proc_list.append(tuple([None, outf, outf])) else: - outf = open(gfurl.url_str, 'wb') tar = cls.open(gfurl.url_str, mode=openmode, fileobj=outf, copybufsize=copybufsize) - proc_list.append(tuple([None, outf, outf])) + if gfurl.dry_run: + proc_list.append((None, outf, None)) + else: + proc_list.append((None, outf, outf)) gfurl.chmod(0o600) logger.debug('GfTarFile.create_open: %s', gfurl.url_str) setattr(tar, cls.ATTR_PROC_LIST, proc_list) @@ -2459,6 +2520,8 @@ class GfptarCommand(Command): if Compress.gzip_prog is None: raise GfException('gzip: command not found. 
Please install gzip.') + self.dry_run = self.opt['--dry-run'] + def set_memory_limit(self, max_memory): resource.setrlimit(resource.RLIMIT_AS, (max_memory, max_memory)) @@ -3187,7 +3250,7 @@ class GfptarCommand(Command): self.options_init() logger.debug('create start: outdir=%s, basedir=%s', outdir, basedir) self.outdir = outdir - self.outdir_url = GfURL.init(outdir) + self.outdir_url = GfURL.init(outdir, dry_run=self.dry_run) self.basedir_url = GfURL.init(basedir) self.assumed_size = self.opt['--size'] if self.assumed_size <= 0: @@ -3575,6 +3638,8 @@ class GfptarCommand(Command): self.gfsched_next = now + self.gfsched_interval def select_a_target_host(self, outurl, index): + if self.dry_run: + return if not outurl.is_gfarm(): return None if self.jobs <= 1: @@ -3786,7 +3851,8 @@ class GfptarCommand(Command): outname_path = self.outdir_url.url_join(outname) outname_path_tmp = outname_path + self.TMP_SUFFIX outurl_tmp = GfURL.init(outname_path_tmp, - use_gfarm_command=self.use_gfarm_command) + use_gfarm_command=self.use_gfarm_command, + dry_run=self.dry_run) target_host = self.select_a_target_host(outurl_tmp, serial) tar_tmp = GfTarFile.create_open(outurl_tmp, self.compress_type, self.bufsize, @@ -3881,12 +3947,15 @@ class GfptarCommand(Command): infodb.db.close() # success - tar_size = outurl_tmp.size() + if self.dry_run: + tar_size = 1024 ** 2 # no particular reason + else: + tar_size = outurl_tmp.size() outurl_tmp.rename(outname_path) out_db_path = self.outdir_url.url_join(db_name) InfoDB.compress_copy(infodb.db.filename, out_db_path, - self.bufsize, self.use_fsync) + self.bufsize, self.use_fsync, self.dry_run) # for DEBUG # raise Exception('unexpected raise') @@ -3903,7 +3972,7 @@ class GfptarCommand(Command): self.options_init() logger.debug('extract start: outdir=%s, indir=%s', outdir, indir) self.outdir = outdir - self.outdir_url = GfURL.init(outdir) + self.outdir_url = GfURL.init(outdir, dry_run=self.dry_run) self.indir = indir self.specified_members = specified_members self.same_owner = self.opt['--same-owner'] @@ -4093,7 +4162,7 @@ class GfptarCommand(Command): if self.is_canceled(): raise self.error_canceled() url_str = self.outdir_url.url_join(d) - dir_url = GfURL.init(url_str) + dir_url = GfURL.init(url_str, dry_run=self.dry_run) try: dir_url.mkdir() except MemoryError: @@ -4111,7 +4180,7 @@ class GfptarCommand(Command): if self.is_canceled(): raise self.error_canceled() url_str = self.outdir_url.url_join(d) - dir_url = GfURL.init(url_str) + dir_url = GfURL.init(url_str, dry_run=self.dry_run) logger.debug('extract_directories_fast: ' f'out_url={self.outdir_url.url_str}, d={d}') # url.path is normalized @@ -4119,7 +4188,7 @@ class GfptarCommand(Command): logger.debug('skip (already created): %s', url_str) continue parent_url = dir_url.parent - if parent_url.path in created_set: + if parent_url.path in created_set: # parent exists dir_url.mkdir() created_set.add(dir_url.path) self.info('prepare_dir: {}', dir_url.path) @@ -4149,7 +4218,7 @@ class GfptarCommand(Command): logger.warning('No information of the directory: %s', d) continue url_str = self.outdir_url.url_join(d) - dir_url = GfURL.init(url_str) + dir_url = GfURL.init(url_str, dry_run=self.dry_run) if self.same_owner: dir_url.chmod(tarinfo.mode, mtime=tarinfo.mtime, user=tarinfo.uname, group=tarinfo.gname) @@ -4249,14 +4318,16 @@ class GfptarCommand(Command): # ex. 
/a/b/c/ -> a/b/c outfile = tarinfo.name.strip('/') # relative path only outurl_str = self.outdir_url.url_join(outfile) - outurl = GfURL.init(outurl_str) + outurl = GfURL.init(outurl_str, dry_run=self.dry_run) parent = outurl.parent with self.lock(): exist_dir = parent.path in self.created_directory_set if not exist_dir: if not parent.exists(): - parent.makedirs() # default 0700 + with ignore_exception(True): + if not self.dry_run: + parent.makedirs() # default 0700 with self.lock(): self.created_directory_set.add(parent.path) @@ -4659,7 +4730,8 @@ class GfptarCommand(Command): for id_num, tar_path in tarlist: arglist.append((lock, share_cancel, self.tmpdir, overwrite, gen_num, id_num, tar_path, - self.bufsize, self.use_fsync)) + self.bufsize, self.use_fsync, + self.dry_run)) # InfoDB.generate_one is staticmethod, # because ProcessPoolExecutor cannot serialize # members of "self" object. @@ -4767,9 +4839,9 @@ class InfoDB: @staticmethod def compress_copy(in_db_path, out_dbgz_path, bufsize, use_fsync, - move=True): + dry_run, move=True): dbgz_path_tmp = out_dbgz_path + '.tmp' - dbgz_url_tmp = GfURL.init(dbgz_path_tmp) + dbgz_url_tmp = GfURL.init(dbgz_path_tmp, dry_run=dry_run) db_url = GfURL.init(in_db_path) if dbgz_url_tmp.exists(): dbgz_url_tmp.remove() @@ -4829,7 +4901,7 @@ class InfoDB: @classmethod def generate_one(cls, args): (lock, share_cancel, tmpdir, overwrite, gen_num, id_num, - tar_path, bufsize, use_fsync) = args + tar_path, bufsize, use_fsync, dry_run) = args if share_cancel.value != 0: # logger.debug('Canceled (2)') return 0, 0 @@ -4884,7 +4956,7 @@ class InfoDB: infodb.commit_close() db_close = True InfoDB.compress_copy(infodb.db.filename, db_path, bufsize, - use_fsync) + use_fsync, dry_run) except Exception: has_error = True raise @@ -5043,13 +5115,14 @@ Options: --test-long run long tests (-q option is recommended) --test-workdir-local=DIR local directory for test [default: /tmp] --test-workdir-gfarm=DIR Gfarm directory for test [default: gfarm:/tmp] - --dummy-num=NUM the number of dummy (random) files for input - (for -c option) (ignore arguments) - (create dummy 1000 entries per dir) + --dummy-num=NUM the number of dummy input entries for --create + (ignore arguments) + (create 1000 files and directories per 1 unit) (default: disabled) --dummy-size-min=BYTES minimum size of dummy files [default: 0] --dummy-size-max=BYTES maximum size of dummy files [default: 1Mi] --dummy-sleep=SEC sleep time per dummy file [default: 0.0] + --dry-run not create output files -q, --quiet quiet messages -v, --verbose verbose output -d, --debug debug mode @@ -5099,6 +5172,7 @@ _schema = Schema({ '--dummy-size-min': Use(unhumanize_number), '--dummy-size-max': Use(unhumanize_number), '--dummy-sleep': Use(float), + '--dry-run': bool, '--quiet': bool, '--verbose': bool, '--debug': bool, From 9ed4842e30439d1c7584b9947b51448ed49bc89c Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 31 Jul 2024 12:09:16 +0900 Subject: [PATCH 064/143] gfptar --dummy-sleep: time.sleep(0) is long time since Python 3.11 --- gftool/gfptar/gfptar | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index b9da2fcd9..4d1a56738 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2197,7 +2197,8 @@ class GfTarFile(tarfile.TarFile): if tarinfo is None: # warning, skip return if entry.is_file(): - time.sleep(getattr(self, self.ATTR_DUMMY_SLEEP)) + if self.dummy_sleep > 0: + time.sleep(self.dummy_sleep) with RandomStream(entry.size) as f: 
self.addfile(tarinfo, fileobj=f) else: @@ -3155,7 +3156,8 @@ class GfptarCommand(Command): def list_dummy_files(self, base_dir, num, size_min, size_max, dummy_sleep): # defaults files_per_dir = 1000 - dummy_sleep_per_entry = dummy_sleep / 512 # for each readdir() + # dummy_sleep_per_entry = dummy_sleep / 512 # for each readdir() + dummy_sleep_per_entry = 0 dir_min_depth = 5 dir_max_depth = 5 dir_min_length = 30 @@ -3209,7 +3211,9 @@ class GfptarCommand(Command): size, mtime, linkname) def rand_file(dir_path, idx): - time.sleep(dummy_sleep_per_entry) + if dummy_sleep_per_entry > 0: + # time.sleep(0): long time since Python 3.11 + time.sleep(dummy_sleep_per_entry) # f = generate_random_filename() f = f'{idx}.txt' path = os.path.join(dir_path, f) @@ -5120,7 +5124,7 @@ Options: (create 1000 files and directories per 1 unit) (default: disabled) --dummy-size-min=BYTES minimum size of dummy files [default: 0] - --dummy-size-max=BYTES maximum size of dummy files [default: 1Mi] + --dummy-size-max=BYTES maximum size of dummy files [default: 0] --dummy-sleep=SEC sleep time per dummy file [default: 0.0] --dry-run not create output files -q, --quiet quiet messages From 8175656ee61af4590435eb964db8513586c7bf73 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 31 Jul 2024 13:45:46 +0900 Subject: [PATCH 065/143] gfptar: sequel to 22c6e00 (fix freeze on python 3.6 since ebc17f8) --- gftool/gfptar/gfptar | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 4d1a56738..f67dfbc62 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3748,9 +3748,12 @@ class GfptarCommand(Command): (status, exc_type_name, exc_value_str, exc_traceback_str) = result inq.put(self.MSG_PARENT_ERROR_COMPLETE) - raise Exception( - f'{exc_type_name}: {exc_value_str}\n' - f'{"".join(exc_traceback_str)}') + if self.verbose or self.debug: + raise GfException( + f'{exc_type_name}: {exc_value_str}\n' + f'{"".join(exc_traceback_str)}') + else: + raise GfException(f'{exc_type_name}: {exc_value_str}') else: logger.error('unknown message from child process:' f' {str(result)}') @@ -3764,11 +3767,11 @@ class GfptarCommand(Command): self.name, loglevel=self.loglevel, debug=self.debug, verbose=self.verbose) - self.sig_default() - while True: # not exit - pid = os.getpid() - logger.debug('create_a_tar_process: start, pid=%d', pid) - self.create_a_tar_process0(input_queue, output_queue) + logger.debug('create_a_tar_process: start, pid=%d', os.getpid()) + + self.sig_ignore() + while self.create_a_tar_process0(input_queue, output_queue): + pass def create_a_tar_process0(self, input_queue, output_queue): request = input_queue.get() @@ -3786,6 +3789,7 @@ class GfptarCommand(Command): output_queue.put((self.MSG_CHILD_DONE, tar_size, cannot_be_archived, outurl, infodb_url)) logger.debug('subprocess exits: serial=%d', serial) + return True # next job except Exception as e: logger.debug('create_a_tar_process: error (serial=%d)', serial) self.print_trace(e) @@ -3801,6 +3805,7 @@ class GfptarCommand(Command): # expect: self.MSG_PARENT_ERROR_COMPLETE except queue.Empty: pass + return False # process exit def create_a_tar_process1(self, input_queue, output_queue, gen, serial, dbfile): From e2d0c99e79a230951406583509a24896254befd8 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 2 Aug 2024 11:51:00 +0900 Subject: [PATCH 066/143] gfptar: refactoring --- gftool/gfptar/gfptar | 536 ++++++++++++++++++++++--------------------- 1 file changed, 280 
insertions(+), 256 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f67dfbc62..c11b4f9dd 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2465,6 +2465,7 @@ class GfptarCommand(Command): self.hardlink_warn = True self.tmpdir = None self.test_mode = False + self.worker_list = [] # (started, process, inq, outq) def options_init(self): self.log_init() @@ -2526,10 +2527,11 @@ class GfptarCommand(Command): def set_memory_limit(self, max_memory): resource.setrlimit(resource.RLIMIT_AS, (max_memory, max_memory)) - def sig_init(self): + def sig_init(self, message=True): def sig_handler(signum, frame): - # logger.warning(f'Interrupt (signal={signum})') - sys.stderr.write(f'\nInterrupt (signal={signum})\n') + if message: + # logger.warning(f'Interrupt (signal={signum})') + sys.stderr.write(f'\nInterrupt[1] (signal={signum})\n') self.canceled = True signal.signal(signal.SIGHUP, sig_handler) @@ -2567,6 +2569,13 @@ class GfptarCommand(Command): self.cmd_create(outdir, basedir, infiles) return + outdir = self.opt['--update'] + if outdir: + basedir = self.opt['--basedir'] + infiles = self.opt[''] + self.cmd_update(outdir, basedir, infiles) + return + outdir = self.opt['--extract'] if outdir: indir = self.opt[''] @@ -3250,7 +3259,7 @@ class GfptarCommand(Command): TABLE_tar_entry = 'tar_entry' - def cmd_create(self, outdir, basedir, infiles): + def create_common(self, outdir, basedir, infiles): self.options_init() logger.debug('create start: outdir=%s, basedir=%s', outdir, basedir) self.outdir = outdir @@ -3300,12 +3309,16 @@ class GfptarCommand(Command): self.gfsched_list = None self.cannot_be_archived = 0 - cannot_read_dir = 0 - serial = 0 - has_error = None + self.create_job_init() # before creating threads + def cmd_update(self, outdir, basedir, infiles): + self.create_common(outdir, basedir, infiles) + # TODO + print("TODO...") + + def cmd_create(self, outdir, basedir, infiles): + self.create_common(outdir, basedir, infiles) self.outdir_url.create_new_dir() - self.create_job_init() # before creating threads infiles_checked = [] for infile in infiles: @@ -3350,6 +3363,9 @@ class GfptarCommand(Command): filelist_num = 0 filelist_size = 0 + cannot_read_dir = 0 + has_error = None + for infile in infiles_checked: if self.is_canceled(): logger.debug('Canceled (listdir 1): serial=%d', serial) @@ -3473,7 +3489,6 @@ class GfptarCommand(Command): class Started(): pass - self.worker_list = [] # (started, process, inq, outq) self.worker_ident_dict = {} for i in range(self.max_workers): inq = multiprocessing.Queue() @@ -3484,9 +3499,9 @@ class GfptarCommand(Command): # specified as arguments for ProcessPoolExecutor. # multiprocessing.Manager().Queue() can be used # for ProcessPoolExecutor, but it is very slow. + started = Started() process = multiprocessing.Process(target=self.create_a_tar_process, args=(inq, outq)) - started = Started() started.status = False # lock required self.worker_list.append((started, process, inq, outq)) @@ -3516,7 +3531,7 @@ class GfptarCommand(Command): # timeout = 10 # process.join(timeout) process.terminate() - process.kill() + process.kill() # no effect in Python 3.6 # process.close() may raise ValueError. 
# (I don't know the reason): # ValueError: Cannot close a process while it is still @@ -3552,14 +3567,14 @@ class GfptarCommand(Command): err_list = [] for index, worker in enumerate(self.worker_list): started, process, inq, outq = worker - try: + if started.status: + with ignore_exception(True): + inq.put(self.MSG_PARENT_EXIT) + with ignore_exception(True): inq.close() - except Exception: - pass - try: + started.status = False + with ignore_exception(True): outq.close() - except Exception: - pass try: if started.status: self.process_close(process, index=index) @@ -3670,6 +3685,7 @@ class GfptarCommand(Command): MSG_PARENT_START = 'START' MSG_PARENT_CANCEL = 'CANCEL' MSG_PARENT_ERROR_COMPLETE = 'ERROR_COMPLETE' + MSG_PARENT_EXIT = 'EXIT' def create_a_tar_thread0(self, gen, serial, arg): logger.debug('create_a_tar: start (gen=%d, serial=%d)', gen, serial) @@ -3755,8 +3771,8 @@ class GfptarCommand(Command): else: raise GfException(f'{exc_type_name}: {exc_value_str}') else: - logger.error('unknown message from child process:' - f' {str(result)}') + logger.error('Unexpected message from child process:' + f' {result}') break finally: logger.debug('(parent) subprocess finished: serial=%d', serial) @@ -3767,19 +3783,31 @@ class GfptarCommand(Command): self.name, loglevel=self.loglevel, debug=self.debug, verbose=self.verbose) - logger.debug('create_a_tar_process: start, pid=%d', os.getpid()) + pid = os.getpid() + logger.debug('create_a_tar_process: start, pid=%d', pid) - self.sig_ignore() + self.sig_init(message=False) while self.create_a_tar_process0(input_queue, output_queue): pass + logger.debug('create_a_tar_process: exit, pid=%d', pid) def create_a_tar_process0(self, input_queue, output_queue): - request = input_queue.get() - if len(request) > 0 and request[0] == self.MSG_PARENT_START: + while True: + try: + request = input_queue.get(timeout=1) + break + except queue.Empty: + if self.is_canceled(): # SIGTERM from parent + return False # exit + logger.debug('create_a_tar_process0: request=%s', request) + if request == self.MSG_PARENT_EXIT: + return False # exit + elif request[0] == self.MSG_PARENT_START: output_queue.put(self.MSG_CHILD_READY) + # OK else: - logger.error(f'Unexpected request from parent process: {request}') - return # exit + raise GfException('Unexpected message from parent process:' + f" {request}") op, gen, serial, dbfile = request logger.debug('create_a_tar_process0: start (serial=%d)', serial) try: @@ -3805,7 +3833,7 @@ class GfptarCommand(Command): # expect: self.MSG_PARENT_ERROR_COMPLETE except queue.Empty: pass - return False # process exit + return False # exit def create_a_tar_process1(self, input_queue, output_queue, gen, serial, dbfile): @@ -3840,7 +3868,7 @@ class GfptarCommand(Command): serial_str = f'{self.SERIAL_FORMAT}_' % serial if gen >= 2: - # SEE ALSO: InfoDB.infodb_filename() + # SEE ALSO: InfoDB.dbgz_filename() prefix_str = f'g{gen}_{serial_str}' else: prefix_str = serial_str @@ -3870,8 +3898,8 @@ class GfptarCommand(Command): dummy_input=self.dummy_input, dummy_sleep=self.dummy_sleep) # to reduce memory usage - # SEE ALSO: InfoDB.generate_one() - db_name = InfoDB.infodb_filename(gen, serial) + # SEE ALSO: InfoDB.generate_db_and_dbgz() + db_name = InfoDB.dbgz_filename(gen, serial) tmpdb_path = os.path.join(self.tmpdir.name, db_name + '.tmp') infodb = InfoDB(tmpdb_path) @@ -3901,7 +3929,7 @@ class GfptarCommand(Command): logger.error('unexpected message from parent process') cancel = True break - if cancel: + if cancel or self.is_canceled(): break subpath = 
entry.subpath(self.basedir_url) try: @@ -4019,7 +4047,7 @@ class GfptarCommand(Command): TABLE_created_directory_set = 'created_directory_set' TABLE_dirstat_dict = 'dirstat_dict' - def extract_schedule_v4(self): + def extract_schedule(self): target_set = DBSet(self.db_target, StrObj, self.TABLE_target_set) directory_set = DBSet(self.db, StrObj, self.TABLE_directory_set) member_set = DBSet(self.db, StrObj, self.TABLE_member_set) @@ -4047,16 +4075,16 @@ class GfptarCommand(Command): self.progress_for_schedule(self.start_time) error_num = 0 - for id_num, gen_num, dbgz_path, tar_path in \ - self.list_infodb_files(self.indir_url): + for serial, gen, tar_path, dbgz_path, db_path in \ + self.list_tar_dbgz_db(self.indir_url, sort=False, + progress=self.progress_enabled): if self.is_canceled(): raise self.error_canceled() if dbgz_path is None or tar_path is None: error_num += 1 continue - for path, fattr in InfoDB.list_entries_from_one( - dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, - resolve_ugmap=False): + for path, fattr in InfoDB.list_entries_from_db( + db_path, resolve_ugmap=False): if self.is_canceled(): raise self.error_canceled() self.total_num += 1 @@ -4069,6 +4097,9 @@ class GfptarCommand(Command): if file_type == InfoDB.TYPE_DIR: logger.debug('directory_set.add: %s', path) directory_set.add(path) # all directories + logger.debug('os.remove: %s', db_path) + os.remove(db_path) + if self.search_target: is_target = False for member in member_check_dict.keys(): @@ -4131,7 +4162,7 @@ class GfptarCommand(Command): def extract_main(self): (target_set, directory_set, - member_set, error_num) = self.extract_schedule_v4() + member_set, error_num) = self.extract_schedule() self.outdir_url.create_new_dir() # self.extract_directories(directory_set) # slow @@ -4178,7 +4209,7 @@ class GfptarCommand(Command): raise except Exception: dir_url.makedirs() - self.info('prepare_dir: {}', dir_url.url_str) + self.info('created(D): {}', dir_url.url_str) self.created_directory_set.add(dir_url.path) def extract_directories_fast(self, directory_set): @@ -4200,19 +4231,19 @@ class GfptarCommand(Command): if parent_url.path in created_set: # parent exists dir_url.mkdir() created_set.add(dir_url.path) - self.info('prepare_dir: {}', dir_url.path) + self.info('created(D): {}', dir_url.path) else: # no parent dir_url.makedirs() created_set.add(dir_url.path) - self.info('prepare_dir: {}', dir_url.path) + self.info('created(D): {}', dir_url.path) created_set.add(parent_url.path) - self.info('prepare_dir: {}', parent_url.path) + self.info('created(D): {}', parent_url.path) for p in parent_url.parent_iter: path = p.path if path == '.' 
or path == '/': continue created_set.add(path) - self.info('prepare_dir: {}', path) + self.info('created(D): {}', path) def update_stat_for_directories(self, directory_set): logger.debug('update_stat_for_directories') @@ -4293,7 +4324,7 @@ class GfptarCommand(Command): try: self.extract_from_a_tar1(serial, target, member_set, tar) with self.lock(): - self.info('extract,DONE: {}', arch_url.url_str) + self.info('DONE: {}', arch_url.url_str) finally: tar.close() @@ -4335,8 +4366,8 @@ class GfptarCommand(Command): if not exist_dir: if not parent.exists(): with ignore_exception(True): - if not self.dry_run: - parent.makedirs() # default 0700 + self.info('created(D): {}', parent.path) + parent.makedirs() # default 0700 with self.lock(): self.created_directory_set.add(parent.path) @@ -4519,10 +4550,10 @@ class GfptarCommand(Command): quiet = self.quiet indir_url = GfURL.init(indir) - def print_quiet(id_num, gen_num, path, fattr): + def print_quiet(serial, gen, path, fattr): pass - def print_verbose(id_num, gen_num, path, fattr): + def print_verbose(serial, gen, path, fattr): name = path if fattr.ftype == InfoDB.TYPE_DIR: name = name + '/' @@ -4530,11 +4561,11 @@ class GfptarCommand(Command): name = name + ' -> ' + fattr.linkname dt_object = datetime.datetime.fromtimestamp(fattr.mtime) mtime_str = dt_object.strftime('%Y-%m-%d %H:%M') - print(f'g{gen_num}_{id_num:04} {fattr.ftype} {fattr.mode:04o}' + print(f'g{gen}_{serial:04} {fattr.ftype} {fattr.mode:04o}' f' {fattr.user:>8}/{fattr.group:<8}' f' {fattr.size:9d} {mtime_str} {name}') - def print_simple(id_num, gen_num, path, fattr): + def print_simple(serial, gen, path, fattr): name = path if fattr.ftype == InfoDB.TYPE_DIR: name = name + '/' @@ -4547,122 +4578,58 @@ class GfptarCommand(Command): else: print_func = print_simple - for id_num, gen_num, path, fattr in self.list_entries_from_all( + for serial, gen, ent_path, fattr in self.list_entries( indir_url, resolve_ugmap=True): if self.is_canceled(): break - print_func(id_num, gen_num, path, fattr) + print_func(serial, gen, ent_path, fattr) self.tmpdir.cleanup() - def list_entries_from_all(self, indir_url, resolve_ugmap=False): - # SEE ALSO: extract_schedule_*() - infodb_list = self.list_infodb_files(indir_url) + def list_entries(self, indir_url, resolve_ugmap=False, progress=False): + # SEE ALSO: extract_schedule() error_num = 0 - for id_num, gen_num, dbgz_path, tar_path in infodb_list: - if dbgz_path is None: + for serial, gen, tar_path, dbgz_path, db_path in \ + self.list_tar_dbgz_db(indir_url, sort=True, progress=progress): + if db_path is None: error_num += 1 continue if tar_path is None: error_num += 1 # FALLTHROUGH - for path, fattr in InfoDB.list_entries_from_one( - dbgz_path, self.tmpdir, self.bufsize, self.use_fsync, - resolve_ugmap): - yield id_num, gen_num, path, fattr + for ent_path, fattr in InfoDB.list_entries_from_db( + db_path, resolve_ugmap=resolve_ugmap): + yield serial, gen, ent_path, fattr + logger.debug('os.remove: %s', db_path) + os.remove(db_path) if error_num > 0: raise GfptarError(f'Total errors encountered: {error_num}') - def sort_infodb_list(self, tar_or_infodb_list): + def sort_tar_or_db_list(self, tar_or_db_list): def id_key(value): - return value[0] # id_num - - tar_or_infodb_list.sort(key=id_key) - return tar_or_infodb_list - - def list_infodb_files(self, indir_url, sort=True): - tar_list, infodb_list = self.list_tar_infodb_files(indir_url) - if self.sync_infodb: - infodb_list1 = self.create_infodb_files(indir_url, tar_list) - if self.test_mode: - 
self.sort_infodb_list(infodb_list) - self.sort_infodb_list(infodb_list1) - if infodb_list != infodb_list1: - raise AssertionError(f'infodb_list{str(infodb_list)} != ' - f'infodb_list1{str(infodb_list1)}') - for id_num, gen_num, infodb_path in infodb_list: - u = GfURL.init(infodb_path) - if not u.exists(): - raise AssertionError(f'{infodb_path} does not exist' - ' even after create_infodb_files') - del infodb_list - infodb_list = infodb_list1 - - infodb_dict = {} - for id_num, gen_num, infodb_path in infodb_list: - infodb_dict[id_num] = (gen_num, infodb_path) + return value[0] # serial - tar_dict = {} - for id_num, gen_num, tar_path in tar_list: - tar_dict[id_num] = (gen_num, tar_path) - infodb = infodb_dict.get(id_num, None) - if infodb is None: - fname = InfoDB.infodb_filename(gen_num, id_num) - logger.error(f'lost {fname} tied to {tar_path}.' - ' --sync-db will recreate the file.') - infodb_dict[id_num] = (gen_num, None) - - for id_num, gen_num, infodb_path in infodb_list: - tar = tar_dict.get(id_num, None) - if tar is None: - logger.error(f'lost *.tar.* tied to {infodb_path}') - - infodb_tar_list = [] - for id_num, gen_path in sorted(infodb_dict.items()): - gen_num, infodb_path = gen_path - tar = tar_dict.get(id_num, None) - if tar is not None: - gen_num2, tar_path = tar - else: - tar_path = None - infodb_tar_list.append((id_num, gen_num, infodb_path, tar_path)) - - if sort: - self.sort_infodb_list(infodb_tar_list) - return infodb_tar_list - - # def list_only_infodb_files(self, indir_url): - # db_gen_pattern = re.compile(InfoDB.PATT_INFODB) - # infodb_list = [] - # for ent in indir_url.listdir(recursive=False): - # path = ent.path # fullpath when ent is Gfarm - # base = os.path.basename(path) - # g_match = db_gen_pattern.match(base) - # if g_match: - # gen_num = g_match.group(1) - # id_num = g_match.group(2) - # # int("0001") -> 1 - # infodb_list.append((int(id_num), int(gen_num), path)) - # return infodb_list - - def list_tar_infodb_files(self, indir_url, check=True, sync=False): + tar_or_db_list.sort(key=id_key) + return tar_or_db_list + + def list_tar_list_dbgz_from_dir(self, indir_url): PATT_TAR = r'^.*\.tar(\.\w{1,5})?$' # *.tar or *.tar.* PATT_TAR_GEN1 = r'^(\d+)_.+' PATT_TAR_GEN_ALL = r'^g(\d+)_(\d+)_.+' tar_pattern = re.compile(PATT_TAR) tar_gen1_pattern = re.compile(PATT_TAR_GEN1) tar_gen_all_pattern = re.compile(PATT_TAR_GEN_ALL) - infodb_pattern = re.compile(InfoDB.PATT_INFODB) + dbgz_pattern = re.compile(InfoDB.PATT_DBGZ) tar_list = [] - infodb_list = [] + dbgz_list = [] for ent in indir_url.listdir(recursive=False): path = ent.path base = os.path.basename(path) - db_match = infodb_pattern.match(base) + db_match = dbgz_pattern.match(base) if db_match: - gen_num = db_match.group(1) - id_num = db_match.group(2) + gen = db_match.group(1) + serial = db_match.group(2) # int("0001") -> 1 - infodb_list.append((int(id_num), int(gen_num), path)) + dbgz_list.append((int(serial), int(gen), path)) continue if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst, *.tar.lst continue @@ -4671,34 +4638,71 @@ class GfptarCommand(Command): t1_match = tar_gen1_pattern.match(base) if t1_match: # generation number = 1 - gen_num = '1' + gen = '1' # ex. 0001 (str) - id_num = t1_match.group(1) + serial = t1_match.group(1) else: t_match = tar_gen_all_pattern.match(base) if t_match: - gen_num = t_match.group(1) + gen = t_match.group(1) # ex. 
0001 (str) - id_num = t_match.group(2) + serial = t_match.group(2) else: # ignore irrelevant file continue # int("0001") -> 1 - tar_list.append((int(id_num), int(gen_num), path)) - return tar_list, infodb_list + tar_list.append((int(serial), int(gen), path)) + return tar_list, dbgz_list + + # yield tuple(serial, gen, tar_path, dbgz_path, db_path) + def list_tar_dbgz_db(self, indir_url, sort=False, progress=False): + logger.debug('list_tar_dbgz_db') + + tar_list, dbgz_list = self.list_tar_list_dbgz_from_dir(indir_url) - def create_infodb_files(self, indir_url, tar_list): - logger.debug('create_infodb_files') - overwrite = True # always True + tar_dict = {} + dbgz_dict = {} + for serial, gen, tar_path in tar_list: + tar_dict[serial] = (gen, tar_path) + + for serial, gen, dbgz_path in dbgz_list: + dbgz_dict[serial] = (gen, dbgz_path) + gen_tar = tar_dict.get(serial, None) + if gen_tar is None: + logger.error(f'lost *.tar.* tied to {dbgz_path}') + tar_dict[serial] = (gen, None) + + for serial, gen, tar_path in tar_list: + gen_dbgz = dbgz_dict.get(serial, None) + if gen_dbgz is None: + dbgz_fname = InfoDB.dbgz_filename(gen, serial) + logger.info(f'lost {dbgz_fname} (auto re-creation)' + f' tied to {tar_path}') + dbgz_dict[serial] = (gen, None) + + # for progress self.total_tar_num = 0 self.current_tar_num = 0 # generation -> tar files - gen_to_tar_list = defaultdict(list) - for id_num, gen_num, path in tar_list: - gen_to_tar_list[gen_num].append((id_num, path)) + gen_tarlist_dict = defaultdict(list) + # tar_dbgz_list = [] + for serial, gen_dbgz in dbgz_dict.items(): + gen, dbgz_path = gen_dbgz + gen_tar = tar_dict.get(serial, None) + if gen_tar is not None: + gen2, tar_path = gen_tar + else: + tar_path = None + # tar_dbgz_list.append((serial, gen, tar_path, dbgz_path)) + gen_tarlist_dict[gen].append((serial, tar_path, dbgz_path)) self.total_tar_num += 1 + del dbgz_dict + del tar_dict + del tar_list + del dbgz_list + self.start_time = time.time() save_e = None cancel = False @@ -4718,9 +4722,10 @@ class GfptarCommand(Command): def sig_handler(signum, frame): if share_cancel.value == 0: # logger.warning(f'Interrupt (signal={signum}') - sys.stderr.write(f'\nInterrupt (signal={signum})\n') + sys.stderr.write(f'\nInterrupt[2] (signal={signum})\n') share_cancel.value = 1 + # for Manager process signal.signal(signal.SIGHUP, sig_handler) signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) @@ -4728,34 +4733,55 @@ class GfptarCommand(Command): self.current_ent_num = 0 self.current_size = 0 - if self.progress_enabled or self.verbose: + if progress: now = time.time() self.progress_for_sync_infodb(now) - for gen_num, tarlist in gen_to_tar_list.items(): + if sort: # sort by gen + gen_tarlist_list = sorted(gen_tarlist_dict.items()) + else: + gen_tarlist_list = gen_tarlist_dict.items() + + for gen, tarlist in gen_tarlist_list: + if sort: # sort by serial + self.sort_tar_or_db_list(tarlist) + + # concurrent processes per generation (gen) + futures = {} with concurrent.futures.ProcessPoolExecutor( max_workers=self.max_workers) as executor: - arglist = [] - for id_num, tar_path in tarlist: - arglist.append((lock, share_cancel, self.tmpdir, - overwrite, gen_num, id_num, tar_path, - self.bufsize, self.use_fsync, - self.dry_run)) - # InfoDB.generate_one is staticmethod, - # because ProcessPoolExecutor cannot serialize - # members of "self" object. 
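The removed comment above points at a general constraint: ProcessPoolExecutor ships the submitted callable and its arguments to worker processes by pickling them, so a bound method of GfptarCommand (which holds locks, queues, and database handles) cannot be submitted directly; the job function has to be a classmethod, staticmethod, or module-level function. A minimal sketch of that pattern, with a hypothetical Job class used purely for illustration and not taken from gfptar itself:

    import concurrent.futures

    class Job:
        @classmethod
        def run(cls, n):
            # No instance state is captured here, so the callable and its
            # argument pickle cleanly for the worker process.
            return n * 2

    if __name__ == '__main__':
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            futures = [executor.submit(Job.run, n) for n in range(4)]
            results = [f.result() for f in futures]
            print(results)  # [0, 2, 4, 6]
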
- futures = [executor.submit(InfoDB.generate_one, arg) - for arg in arglist] - - for future in concurrent.futures.as_completed(futures): + for serial, tar_path, dbgz_path in tarlist: + if tar_path is None: + continue + if dbgz_path is None: + update = True + else: + update = self.sync_infodb + arg = (lock, share_cancel, self.tmpdir.name, + update, gen, serial, tar_path, + self.bufsize, self.use_fsync, self.dry_run) + # InfoDB.generate_db_and_dbgz is classmethod, + # because ProcessPoolExecutor cannot serialize + # members of "GfptarCommand (self)" object. + future = executor.submit(InfoDB.generate_db_and_dbgz, + arg) + futures[serial] = future + + for serial, tar_path, dbgz_path in tarlist: + if tar_path is None: + db_path = None + # missing tar file + yield serial, gen, tar_path, dbgz_path, db_path try: - num, size = future.result() - self.current_ent_num += num + future = futures[serial] + db_path, dbgz_path, ent_num, size = future.result() + self.current_ent_num += ent_num self.current_size += size self.current_tar_num += 1 - if self.progress_enabled or self.verbose: + if ent_num > 0 and progress: now = time.time() self.progress_for_sync_infodb(now) + yield serial, gen, tar_path, dbgz_path, db_path except Exception as e: # logger.error(f'{e}') self.print_trace(e) @@ -4769,7 +4795,7 @@ class GfptarCommand(Command): # restore signal.signal(sig, orig_sig_handler[sig]) - if self.progress_enabled or self.verbose: + if progress: now = time.time() self.progress_for_sync_infodb(now) sys.stdout.write('\n') @@ -4777,14 +4803,6 @@ class GfptarCommand(Command): raise save_e if cancel: raise self.error_canceled() - del gen_to_tar_list - infodb_list = [] - for id_num, gen_num, tar_path in tar_list: - fname = InfoDB.infodb_filename(gen_num, id_num) - dname = os.path.dirname(tar_path) - infodb_path = os.path.join(dname, fname) - infodb_list.append((id_num, gen_num, infodb_path)) - return infodb_list class InfoDB: @@ -4796,22 +4814,27 @@ class InfoDB: TABLE_USER = 'user_map' TABLE_GROUP = 'group_map' - def __init__(self, dbfile_path): - self.db = DB(dbfile_path) - self.fattr_dict = DBDict(self.db, FileAttr2, self.TABLE_ENTRY) + def __init__(self, db_path, clear=False): + self.db = DB(db_path) + self.fattr_dict = DBDict(self.db, FileAttr2, self.TABLE_ENTRY, + clear=clear) self.user_dict = {} # use memory for speed self.group_dict = {} # use memory for speed - # SEE ALSO: infodb_filename() - PATT_INFODB = r'^g(\d+)_(\d+)_gfptar.db.gz$' + # SEE ALSO: dbgz_filename() + PATT_DBGZ = r'^g(\d+)_(\d+)_gfptar.db.gz$' @classmethod - def infodb_filename(cls, gen_num, id_num): - # SEE ALSO: PATT_INFODB - # gen_num (>= 1): the generation number - # id_num (>= 1): the serial number + def db_filename(cls, gen, serial): + # SEE ALSO: PATT_DBGZ + # gen (>= 1): the generation number + # serial (>= 1): the serial number # ex. 
g2_0099_gfptar.db.gz - return f'g{gen_num}_{int(id_num):04}_gfptar.db.gz' + return f'g{gen}_{int(serial):04}_gfptar.db' + + @classmethod + def dbgz_filename(cls, gen, serial): + return cls.db_filename(gen, serial) + '.gz' def add(self, tarinfo): t = tarinfo @@ -4897,7 +4920,7 @@ class InfoDB: def sig_handler(signum, frame): if not cls.signal_canceled: pid = os.getpid() - logger.info(f'Interrupt (signal={signum}) (PID={pid})') + logger.info(f'Interrupt[3] (signal={signum}) (PID={pid})') cls.signal_canceled = True signal.signal(signal.SIGHUP, sig_handler) @@ -4907,84 +4930,84 @@ class InfoDB: cls.signal_initialized = True logger.debug('InfoDB.signal_init') + # return db_path, dbgz_path, ent_num, size @classmethod - def generate_one(cls, args): - (lock, share_cancel, tmpdir, overwrite, gen_num, id_num, + def generate_db_and_dbgz(cls, args): + (lock, share_cancel, db_dir, update, gen, serial, tar_path, bufsize, use_fsync, dry_run) = args if share_cancel.value != 0: # logger.debug('Canceled (2)') - return 0, 0 - tar_url = GfURL.init(tar_path) - indir_url = tar_url.parent - db_name = InfoDB.infodb_filename(gen_num, id_num) - db_path = indir_url.url_join(db_name) - db_url = GfURL.init(db_path) - if not overwrite and db_url.exists(): - logger.debug('not overwrite: %s', db_path) - return 0, 0 + return None, None, 0, 0 InfoDB.signal_init() - # Local file - tmpdb_path = os.path.join(tmpdir.name, str(id_num) + '_info.db') - infodb = InfoDB(tmpdb_path) - tar = GfTarFile.extract_open(tar_url, bufsize) - db_close = False + tar_url = GfURL.init(tar_path) + tardir_url = tar_url.parent + dbgz_fname = InfoDB.dbgz_filename(gen, serial) + dbgz_path = tardir_url.url_join(dbgz_fname) + dbgz_url = GfURL.init(dbgz_path) + + # db_dir is local filesystem + db_path = os.path.join(db_dir, cls.db_filename(gen, serial)) + num = 0 size = 0 - has_error = True - try: - interval = 1 # sec. | for interrupt - next_check = time.time() + interval - while True: - if cls.signal_canceled: - logger.info('Canceled') - break - now = time.time() - if now >= next_check: - # access manager.Value(): very high cost - if share_cancel.value != 0: + if not update and dbgz_url.exists(): + logger.debug('not update, decompress dbgz: %s', dbgz_path) + InfoDB.decompress_copy(dbgz_path, db_path, bufsize, use_fsync) + else: + # generate db and dbgz + tar = GfTarFile.extract_open(tar_url, bufsize) + infodb = InfoDB(db_path, clear=True) + db_close = False + has_error = True + try: + interval = 1 # sec. 
(for interrupt) + next_check = time.time() + interval + while True: + if cls.signal_canceled: logger.info('Canceled') - has_error = True break - next_check = now + interval - try: - t = tar.next() - except MemoryError: - raise - except Exception as e: - logger.warning(f'{tar_path}: SKIPPED:' - f' invalid or empty tar: {str(e)}') - t = None - if t is None: - break - infodb.add(t) - num += 1 - size += t.size - # success - - infodb.commit_close() - db_close = True - InfoDB.compress_copy(infodb.db.filename, db_path, bufsize, - use_fsync, dry_run) - except Exception: - has_error = True - raise - finally: - with ignore_exception(has_error): - tar.close() - with ignore_exception(has_error): - if not db_close: - infodb.db.close() - return num, size + now = time.time() + if now >= next_check: + # access manager.Value(): very high cost + if share_cancel.value != 0: + logger.info('Canceled') + has_error = True + break + next_check = now + interval + try: + t = tar.next() + except MemoryError: + raise + except Exception as e: + logger.warning(f'{tar_path}: SKIPPED:' + f' invalid or empty tar: {str(e)}') + t = None + if t is None: + break + infodb.add(t) + num += 1 + size += t.size + # success + + infodb.commit_close() + db_close = True + InfoDB.compress_copy(db_path, dbgz_path, bufsize, + use_fsync, dry_run, move=False) + except Exception: + has_error = True + raise + finally: + with ignore_exception(has_error): + tar.close() + with ignore_exception(has_error): + if not db_close: + infodb.db.close() + return db_path, dbgz_path, num, size @staticmethod - def list_entries_from_one(in_dbgz_path, tmpdir, bufsize, use_fsync, - resolve_ugmap=False): - base = os.path.basename(in_dbgz_path) - out_db_path = os.path.join(tmpdir.name, base + '.tmp.db') - tmpdb_url = InfoDB.decompress_copy(in_dbgz_path, out_db_path, - bufsize, use_fsync) - db = DB(tmpdb_url.path) + def list_entries_from_db(db_path, resolve_ugmap=False): + db = DB(db_path) has_error = False try: fattr_dict = DBDict(db, FileAttr2, InfoDB.TABLE_ENTRY) @@ -5010,8 +5033,6 @@ class InfoDB: finally: with ignore_exception(has_error): db.close() - with ignore_exception(has_error): - db.unlink() progname = os.path.basename(__file__) @@ -5078,6 +5099,7 @@ Options: from to -c, --create=DIR create mode, create tar files in from s + -u, --update=DIR append files newer than same entries in archive -C, --basedir=DIR base directory for s [default: .] -j, --jobs=NUM the number of jobs to copy per tar file in parallel [default: 4] @@ -5141,6 +5163,7 @@ Usage: {f} [options] -c [-C ] [--] ... {f} [options] -x [--] [...] {f} [options] -t + {f} [options] -u [-C ] [--] ... {f} [options] --test {f} [options] --test -C ... 
{f} [options] --test-long @@ -5152,6 +5175,7 @@ _schema = Schema({ '--list': Or(str, None), '--extract': Or(str, None), '--create': Or(str, None), + '--update': Or(str, None), '--basedir': Or(str, None), '--encoding': str, '--size': Use(unhumanize_number), From 66d14e69ccdcb143fa6cb1072329db2ad27fff77 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 2 Aug 2024 15:28:44 +0900 Subject: [PATCH 067/143] gfptar: limit the number of DB files created in advance --- gftool/gfptar/gfptar | 146 +++++++++++++++++++++++++++---------------- 1 file changed, 91 insertions(+), 55 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index c11b4f9dd..18a12f5c6 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2801,11 +2801,14 @@ class GfptarCommand(Command): noread = False link = False longname = False - self.test_prepare_srcdir(srcdir_local, - readonly, noread, link, longname) - self.test_prepare_srcdir(srcdir_gfarm, - readonly, noread, link, longname) - + count1 = self.test_prepare_srcdir(srcdir_local, + readonly, noread, link, longname) + count2 = self.test_prepare_srcdir(srcdir_gfarm, + readonly, noread, link, longname) + if count1 != count2: + logger.error(f'count1({count1}) != count2({count2})') + logger.error_exit(1, testname + '(count1 != count2) ' + + '... FAIL (test_prepare_srcdir is broken)') test1_name = 'test-1-create' test1_dir_gfarm = workdir_gfarm_url.url_join(test1_name) test2_name = 'test-2-extract' @@ -2817,6 +2820,7 @@ class GfptarCommand(Command): # Gfarm -> Gfarm(tar) try: + # from srcdir_gfarm self.cmd_create(test1_dir_gfarm, workdir_gfarm, [testsrc_name]) if noread: logger.error_exit(1, testname + '(create:Gfarm->Gfarm) ' + @@ -2840,11 +2844,19 @@ class GfptarCommand(Command): # Local(tar) -> Local self.cmd_extract(test4_dir_local, test3_dir_local, []) - # --list - self.cmd_list_simple(test1_dir_gfarm, quiet=True) - self.cmd_list_simple(test3_dir_local, quiet=True) - self.cmd_list_verbose(test1_dir_gfarm, quiet=True) - self.cmd_list_verbose(test3_dir_local, quiet=True) + quiet = not (self.debug or self.verbose) + # --list for gfarm + count = self.cmd_list_simple(test1_dir_gfarm, quiet=quiet) + if count != count1: + logger.error(f'count({count}) != count1({count1})') + logger.error_exit(1, testname + + '(list_simple(test1_dir_gfarm)) ... FAIL') + # --verbose --list for local + count = self.cmd_list_verbose(test3_dir_local, quiet=quiet) + if count != count1: + logger.error(f'count({count}) != count1({count1})') + logger.error_exit(1, testname + + '(list_verbose(test3_dir_local)) ... 
FAIL') if readonly: # extract a member (SEE ALSO: test_prepare_srcdir) @@ -2998,28 +3010,30 @@ class GfptarCommand(Command): L = 'hardlink' longname = ('0123456789' * 30)[:255] tree = [ - # (F, 'file1', 0o664, 1234567890, None), - (F, 'gfarm:abc', 0o644, 1234567890, None), # not Gfarm URL - (D, 'dir1', 0o715, 2234567890, None), + # (F, 'file1', 0o664, 1234567890, None, True), + (F, 'gfarm:abc', 0o644, 1234567890, None, True), # not Gfarm URL + (D, 'dir1', 0o715, 2234567890, None, True), ] tree_readonly = [ - (D, 'dir1/readonly', 0o700, 3234567891, None), # chmod 500 later - (F, 'dir1/readonly/file#2', 0o456, 4234567892, None), + (D, 'dir1/readonly', 0o700, # chmod 0o500 later + 3234567891, None, True), + (F, 'dir1/readonly/file#2', 0o456, 4234567892, None, True), ] tree_noread = [ - (F, 'dir1/noread-file', 0o000, 1234567891, None), - (D, 'dir1/noread-dir', 0o700, 1234567892, None), # chmod 000 later - (F, 'dir1/noread-dir/file3', 0o400, 1234567893, None), + (F, 'dir1/noread-file', 0o000, 1234567891, None, False), + (D, 'dir1/noread-dir', 0o700, # chmod 0o000 later + 1234567892, None, True), + (F, 'dir1/noread-dir/file3', 0o400, 1234567893, None, False), ] tree_link = [ - (D, 'dir1/ディレクトリ 2', 0o755, 5234567890, None), + (D, 'dir1/ディレクトリ 2', 0o755, 5234567890, None, True), (L, 'dir1/ディレクトリ 2/hardlink1', - 0o400, 6234567890, 'gfarm:abc'), + 0o400, 6234567890, 'gfarm:abc', True), (S, 'dir1/ディレクトリ 2/symlink1', - 0o777, 7234567890, 'hardlink1'), + 0o777, 7234567890, 'hardlink1', True), ] tree_longname = [ - (F, 'dir1/' + longname, 0o775, 9234567890, None), + (F, 'dir1/' + longname, 0o775, 9234567890, None, True), ] if readonly: tree += tree_readonly @@ -3029,14 +3043,17 @@ class GfptarCommand(Command): tree += tree_link if longname: tree += tree_longname + count = 0 srcdir_url = GfURL.init(dir_url_str) srcdir_url.mkdir() + count += 1 for ent in tree: ftype = ent[0] path = ent[1] mode = ent[2] mtime = ent[3] linkname = ent[4] + readable = ent[5] url = GfURL.init(srcdir_url.url_join(path)) if ftype == F: with url.writeopen(textmode=True, @@ -3052,6 +3069,10 @@ class GfptarCommand(Command): elif ftype == L: url.hardlink(srcdir_url.url_join(linkname)) url.chmod(mode, mtime=mtime) + else: + raise Exception(f'Unexpected ftype={ftype}') + if readable: + count += 1 for ent in reversed(tree): ftype = ent[0] path = ent[1] @@ -3068,6 +3089,7 @@ class GfptarCommand(Command): if noread: noread_url = GfURL.init(srcdir_url.url_join('dir1/noread-dir')) noread_url.chmod(mode=0o000) + return count # the number of readable files def test_specified_dir(self): basedir = self.opt['--basedir'] @@ -4077,7 +4099,7 @@ class GfptarCommand(Command): error_num = 0 for serial, gen, tar_path, dbgz_path, db_path in \ self.list_tar_dbgz_db(self.indir_url, sort=False, - progress=self.progress_enabled): + progress=False): if self.is_canceled(): raise self.error_canceled() if dbgz_path is None or tar_path is None: @@ -4094,9 +4116,9 @@ class GfptarCommand(Command): logger.debug('archive_dict[%s]: %s', path, file_type) archive_dict[path] = file_type else: - if file_type == InfoDB.TYPE_DIR: + if file_type == InfoDB.TYPE_DIR: # all directories logger.debug('directory_set.add: %s', path) - directory_set.add(path) # all directories + directory_set.add(path.rstrip('/')) logger.debug('os.remove: %s', db_path) os.remove(db_path) @@ -4398,7 +4420,7 @@ class GfptarCommand(Command): fattr = FileAttr1(tarinfo.mode, tarinfo.mtime, tarinfo.uname, tarinfo.gname) with self.lock(): - self.dirstat_dict[outfile] = fattr + 
self.dirstat_dict[outfile.rstrip('/')] = fattr elif tarinfo.issym(): logger.debug('extract,link: %s, %s', outfile, tarinfo.linkname) @@ -4539,10 +4561,10 @@ class GfptarCommand(Command): f'{ent_per_sec_str}Ent/s') def cmd_list_simple(self, indir, quiet=False): - self.cmd_list(indir, quiet=quiet, verbose=False) + return self.cmd_list(indir, quiet=quiet, verbose=False) def cmd_list_verbose(self, indir, quiet=False): - self.cmd_list(indir, quiet=quiet, verbose=True) + return self.cmd_list(indir, quiet=quiet, verbose=True) def cmd_list(self, indir, quiet=False, verbose=True): self.options_init() @@ -4578,12 +4600,15 @@ class GfptarCommand(Command): else: print_func = print_simple + count = 0 for serial, gen, ent_path, fattr in self.list_entries( indir_url, resolve_ugmap=True): if self.is_canceled(): break print_func(serial, gen, ent_path, fattr) + count += 1 self.tmpdir.cleanup() + return count def list_entries(self, indir_url, resolve_ugmap=False, progress=False): # SEE ALSO: extract_schedule() @@ -4704,7 +4729,7 @@ class GfptarCommand(Command): del dbgz_list self.start_time = time.time() - save_e = None + self.save_e = None cancel = False sigs = [signal.SIGHUP, signal.SIGINT, signal.SIGTERM] @@ -4742,16 +4767,48 @@ class GfptarCommand(Command): else: gen_tarlist_list = gen_tarlist_dict.items() + def result_one(one, gen): + future, serial, tar_path, dbgz_path = one + try: + db_path, dbgz_path, ent_num, size = future.result() + self.current_ent_num += ent_num + self.current_size += size + self.current_tar_num += 1 + if ent_num > 0 and progress: + now = time.time() + self.progress_for_sync_infodb(now) + yield serial, gen, tar_path, dbgz_path, db_path + except Exception as e: + # logger.error(f'{e}') + self.print_trace(e) + share_cancel.value = 1 + if self.save_e is None: + self.save_e = e + + def result_all(previous_list, gen): + for one in previous_list: + yield from result_one(one, gen) + for gen, tarlist in gen_tarlist_list: if sort: # sort by serial self.sort_tar_or_db_list(tarlist) # concurrent processes per generation (gen) - futures = {} + job_list = [] with concurrent.futures.ProcessPoolExecutor( max_workers=self.max_workers) as executor: for serial, tar_path, dbgz_path in tarlist: + if len(job_list) >= self.max_workers: + # wait for a previous job + yield from result_one(job_list[0], gen) + job_list = job_list[1:] # remove first if tar_path is None: + # missing tar file + db_path = None + # wait for all previous jobs + yield from result_all(job_list, gen) + job_list = [] + yield serial, gen, tar_path, dbgz_path, db_path continue if dbgz_path is None: update = True @@ -4765,29 +4822,8 @@ class GfptarCommand(Command): # members of "GfptarCommand (self)" object. 
future = executor.submit(InfoDB.generate_db_and_dbgz, arg) - futures[serial] = future - - for serial, tar_path, dbgz_path in tarlist: - if tar_path is None: - db_path = None - # missing tar file - yield serial, gen, tar_path, dbgz_path, db_path - try: - future = futures[serial] - db_path, dbgz_path, ent_num, size = future.result() - self.current_ent_num += ent_num - self.current_size += size - self.current_tar_num += 1 - if ent_num > 0 and progress: - now = time.time() - self.progress_for_sync_infodb(now) - yield serial, gen, tar_path, dbgz_path, db_path - except Exception as e: - # logger.error(f'{e}') - self.print_trace(e) - share_cancel.value = 1 - if save_e is None: - save_e = e + job_list.append((future, serial, tar_path, dbgz_path)) + yield from result_all(job_list, gen) if share_cancel.value != 0: cancel = True # no longer be able to access Manager @@ -4799,8 +4835,8 @@ class GfptarCommand(Command): now = time.time() self.progress_for_sync_infodb(now) sys.stdout.write('\n') - if save_e: - raise save_e + if self.save_e: + raise self.save_e if cancel: raise self.error_canceled() From 2bc35c3f6586d1251c60aa873980f49bc9bffddd Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 2 Aug 2024 16:49:38 +0900 Subject: [PATCH 068/143] gfptar --extract: create directories in threads --- gftool/gfptar/gfptar | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 18a12f5c6..9fbc15213 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4118,7 +4118,7 @@ class GfptarCommand(Command): else: if file_type == InfoDB.TYPE_DIR: # all directories logger.debug('directory_set.add: %s', path) - directory_set.add(path.rstrip('/')) + directory_set.add(path.strip('/')) logger.debug('os.remove: %s', db_path) os.remove(db_path) @@ -4187,8 +4187,9 @@ class GfptarCommand(Command): member_set, error_num) = self.extract_schedule() self.outdir_url.create_new_dir() + # NOTE: Directories are created in threads # self.extract_directories(directory_set) # slow - self.extract_directories_fast(directory_set) + # self.extract_directories_fast(directory_set) self.extracted_num = 0 self.extracted_size = 0 @@ -4377,21 +4378,27 @@ class GfptarCommand(Command): member_set.remove(tarinfo.name) # members_num == 0 -> extract all - # ex. /a/b/c/ -> a/b/c + # ex. 
//a/b/c// -> a/b/c outfile = tarinfo.name.strip('/') # relative path only outurl_str = self.outdir_url.url_join(outfile) outurl = GfURL.init(outurl_str, dry_run=self.dry_run) - parent = outurl.parent + # create directories if not exist + if tarinfo.isdir(): + dir_url = outurl + dir_path = outfile + else: + dir_url = outurl.parent + dir_path = dir_url.path with self.lock(): - exist_dir = parent.path in self.created_directory_set - if not exist_dir: - if not parent.exists(): - with ignore_exception(True): - self.info('created(D): {}', parent.path) - parent.makedirs() # default 0700 + dir_exists = dir_path in self.created_directory_set + if not dir_exists: + if not dir_url.exists(): + with ignore_exception(True): # may be race condition + dir_url.makedirs() # default 0700 + self.info('created(D): {}', dir_path) with self.lock(): - self.created_directory_set.add(parent.path) + self.created_directory_set.add(dir_path) if tarinfo.isfile(): target_host = self.select_a_target_host(outurl, index) @@ -4420,7 +4427,7 @@ class GfptarCommand(Command): fattr = FileAttr1(tarinfo.mode, tarinfo.mtime, tarinfo.uname, tarinfo.gname) with self.lock(): - self.dirstat_dict[outfile.rstrip('/')] = fattr + self.dirstat_dict[outfile] = fattr elif tarinfo.issym(): logger.debug('extract,link: %s, %s', outfile, tarinfo.linkname) From 460e436bb1a7e958a99fa7cce15382e7704820d5 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 2 Aug 2024 17:13:13 +0900 Subject: [PATCH 069/143] gfptar: avoid AssertionError --- gftool/gfptar/gfptar | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 9fbc15213..4ef43f05a 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3522,6 +3522,7 @@ class GfptarCommand(Command): # multiprocessing.Manager().Queue() can be used # for ProcessPoolExecutor, but it is very slow. 
started = Started() + inq.gfptar_my_closed = False process = multiprocessing.Process(target=self.create_a_tar_process, args=(inq, outq)) started.status = False # lock required @@ -3590,16 +3591,18 @@ class GfptarCommand(Command): for index, worker in enumerate(self.worker_list): started, process, inq, outq = worker if started.status: - with ignore_exception(True): - inq.put(self.MSG_PARENT_EXIT) + if not inq.gfptar_my_closed: # avoid AssertionError + with ignore_exception(True): + inq.put(self.MSG_PARENT_EXIT) with ignore_exception(True): + inq.gfptar_my_closed = True inq.close() - started.status = False with ignore_exception(True): outq.close() try: if started.status: self.process_close(process, index=index) + started.status = False except Exception: err_list.append(worker) self.worker_list = err_list From a782eba8e95ec03096cada339393bcf6fa7d0336 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 2 Aug 2024 19:16:03 +0900 Subject: [PATCH 070/143] gfptar --test: simplify --- gftool/gfptar/gfptar | 116 ++++++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 45 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 4ef43f05a..19300b609 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -49,6 +49,7 @@ import queue import inspect from collections import defaultdict import datetime +import atexit from docopt import docopt from schema import Schema, Use, Or @@ -2634,16 +2635,19 @@ class GfptarCommand(Command): out = gfwhoami() self.gfarm_user = out.strip() self.test_mode = True + self.test_workdir_init() def test_main_short(self): self.test_init() - self.test_member() self.test_opt_pattern(full=False) + self.test_member() self.test_specified_dir() def test_main_long(self): self.test_init() self.test_unit() + self.test_opt_pattern(full=True) + self.test_member() # self.test_invalid('url', 'gfarm:/tmp', 'dst', False) # allowed self.test_invalid('dot1', '.', 'dst', True) self.test_invalid('dot2', '', 'dst', False) @@ -2653,8 +2657,6 @@ class GfptarCommand(Command): self.test_invalid('dotdot2', '../', 'dst', False) self.test_invalid('dotdot3', '../abc', 'dst', False) self.test_invalid('dotdot4', './..', 'dst', False) - self.test_member() - self.test_opt_pattern(full=True) def test_unit(self): verbosity = 2 @@ -2675,7 +2677,8 @@ class GfptarCommand(Command): # create tar per one entry self.opt['--size'] = 0 - # test --verbose and --sync-db + # test --sync-db + # --verbose: test report_path (SEE ALSO: create_a_tar_process1) self.opt['--verbose'] = True self.opt['--sync-db'] = True self.test_simple('syncdb') @@ -2740,16 +2743,43 @@ class GfptarCommand(Command): self.opt['--type'] = save_opt_type self.opt['--use-compress-program'] = save_opt_compress_prog - def test_invalid(self, name, src, dst, for_gfarm): - testname = f'gfptar-test-invalid-{name}' - d_name = f'{testname}-{self.gfarm_user}-{self.uid}-{self.pid}' + def test_workdir_init(self): + def remove_tree(url): + logger.debug('remove_tree: %s', url.url_str) + url.remove_tree(remove_readonly=True) + + test_dname = f"gfptar-test-{self.gfarm_user}-{self.uid}-{self.pid}" + self.testdir_gfarm = os.path.join(self.opt['--test-workdir-gfarm'], + test_dname) + self.testdir_gfarm_url = GfURL.init(self.testdir_gfarm) + self.testdir_gfarm_url.mkdir() + atexit.register(remove_tree, self.testdir_gfarm_url) + + self.testdir_local = os.path.join(self.opt['--test-workdir-local'], + test_dname) + self.testdir_local_url = GfURL.init(self.testdir_local) + self.testdir_local_url.mkdir() + 
atexit.register(remove_tree, self.testdir_local_url) + + def test_workdir_set(self, dname, gfarm=True, local=True): + if gfarm: + self.workdir_gfarm = os.path.join(self.testdir_gfarm, dname) + self.workdir_gfarm_url = GfURL.init(self.workdir_gfarm) + self.workdir_gfarm_url.mkdir() + if local: + self.workdir_local = os.path.join(self.testdir_local, dname) + self.workdir_local_url = GfURL.init(self.workdir_local) + self.workdir_local_url.mkdir() + def test_invalid(self, name, src, dst, for_gfarm): + testname = f'invalid-{name}' + self.test_workdir_set(testname, gfarm=for_gfarm, local=not for_gfarm) if for_gfarm: - workdir = os.path.join(self.opt['--test-workdir-gfarm'], d_name) + workdir = self.workdir_gfarm + workdir_url = self.workdir_gfarm_url else: - workdir = os.path.join(self.opt['--test-workdir-local'], d_name) - workdir_url = GfURL.init(workdir) - workdir_url.mkdir() + workdir = self.workdir_local + workdir_url = self.workdir_local_url testsrc_name = src # srcdir = workdir_url.url_join(testsrc_name) @@ -2776,14 +2806,12 @@ class GfptarCommand(Command): print(testname + ' ... FAIL (unexpected success)') def test_simple(self, name, use_all_files=False): - testname = f'gfptar-test-simple-{name}' - d_name = f'{testname}-{self.gfarm_user}-{self.uid}-{self.pid}' - workdir_local = os.path.join(self.opt['--test-workdir-local'], d_name) - workdir_gfarm = os.path.join(self.opt['--test-workdir-gfarm'], d_name) - workdir_local_url = GfURL.init(workdir_local) - workdir_gfarm_url = GfURL.init(workdir_gfarm) - workdir_local_url.mkdir() - workdir_gfarm_url.mkdir() + testname = f'simple-{name}' + self.test_workdir_set(testname) + workdir_local = self.workdir_local + workdir_gfarm = self.workdir_gfarm + workdir_local_url = self.workdir_local_url + workdir_gfarm_url = self.workdir_gfarm_url testsrc_name = 'test-src' srcdir_local = workdir_local_url.url_join(testsrc_name) @@ -2844,15 +2872,14 @@ class GfptarCommand(Command): # Local(tar) -> Local self.cmd_extract(test4_dir_local, test3_dir_local, []) - quiet = not (self.debug or self.verbose) # --list for gfarm - count = self.cmd_list_simple(test1_dir_gfarm, quiet=quiet) + count = self.cmd_list_simple(test1_dir_gfarm) if count != count1: logger.error(f'count({count}) != count1({count1})') logger.error_exit(1, testname + '(list_simple(test1_dir_gfarm)) ... 
FAIL') # --verbose --list for local - count = self.cmd_list_verbose(test3_dir_local, quiet=quiet) + count = self.cmd_list_verbose(test3_dir_local) if count != count1: logger.error(f'count({count}) != count1({count1})') logger.error_exit(1, testname + @@ -2922,13 +2949,11 @@ class GfptarCommand(Command): def test_member(self): testname = 'gfptar-test-member' - d_name = f'{testname}-{self.gfarm_user}-{self.uid}-{self.pid}' - workdir_local = os.path.join(self.opt['--test-workdir-local'], d_name) - workdir_gfarm = os.path.join(self.opt['--test-workdir-gfarm'], d_name) - workdir_local_url = GfURL.init(workdir_local) - workdir_gfarm_url = GfURL.init(workdir_gfarm) - workdir_local_url.mkdir() - workdir_gfarm_url.mkdir() + self.test_workdir_set(testname) + # workdir_local = self.workdir_local + # workdir_gfarm = self.workdir_gfarm + workdir_local_url = self.workdir_local_url + workdir_gfarm_url = self.workdir_gfarm_url testsrc_name = 'test-src' srcdir_local = workdir_local_url.url_join(testsrc_name) @@ -3102,13 +3127,11 @@ class GfptarCommand(Command): basedir_url = GfURL.init(basedir) testname = 'gfptar-test-specified-dir' - d_name = f'{testname}-{self.gfarm_user}-{self.uid}-{self.pid}' - workdir_local = os.path.join(self.opt['--test-workdir-local'], d_name) - workdir_gfarm = os.path.join(self.opt['--test-workdir-gfarm'], d_name) - workdir_local_url = GfURL.init(workdir_local) - workdir_gfarm_url = GfURL.init(workdir_gfarm) - workdir_local_url.mkdir() - workdir_gfarm_url.mkdir() + self.test_workdir_set(testname) + # workdir_local = self.workdir_local + # workdir_gfarm = self.workdir_gfarm + workdir_local_url = self.workdir_local_url + workdir_gfarm_url = self.workdir_gfarm_url test9_name = 'test-9-create' test9_dir_gfarm = workdir_gfarm_url.url_join(test9_name) @@ -4570,17 +4593,20 @@ class GfptarCommand(Command): f'{bytes_per_sec_str}B/s ' f'{ent_per_sec_str}Ent/s') - def cmd_list_simple(self, indir, quiet=False): - return self.cmd_list(indir, quiet=quiet, verbose=False) + def cmd_list_simple(self, indir): + return self.cmd_list(indir, verbose=False) - def cmd_list_verbose(self, indir, quiet=False): - return self.cmd_list(indir, quiet=quiet, verbose=True) + def cmd_list_verbose(self, indir): + return self.cmd_list(indir, verbose=True) - def cmd_list(self, indir, quiet=False, verbose=True): + def cmd_list(self, indir, verbose=True): self.options_init() - if self.quiet: - quiet = self.quiet + quiet = self.quiet indir_url = GfURL.init(indir) + if self.test_mode: + f = os.devnull + else: + f = sys.stdout def print_quiet(serial, gen, path, fattr): pass @@ -4595,13 +4621,13 @@ class GfptarCommand(Command): mtime_str = dt_object.strftime('%Y-%m-%d %H:%M') print(f'g{gen}_{serial:04} {fattr.ftype} {fattr.mode:04o}' f' {fattr.user:>8}/{fattr.group:<8}' - f' {fattr.size:9d} {mtime_str} {name}') + f' {fattr.size:9d} {mtime_str} {name}', file=f) def print_simple(serial, gen, path, fattr): name = path if fattr.ftype == InfoDB.TYPE_DIR: name = name + '/' - print(f'{fattr.ftype} {name}') + print(f'{fattr.ftype} {name}', file=f) if quiet: print_func = print_quiet From f88b763f29474ee6e28f0b84497bcc6df0e6c57e Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 2 Aug 2024 22:22:36 +0900 Subject: [PATCH 071/143] gfptar --list: can be quickly cancelled --- gftool/gfptar/gfptar | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 19300b609..f2addb7fb 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2532,7 
+2532,7 @@ class GfptarCommand(Command): def sig_handler(signum, frame): if message: # logger.warning(f'Interrupt (signal={signum})') - sys.stderr.write(f'\nInterrupt[1] (signal={signum})\n') + sys.stderr.write(f'\nInterrupt (signal={signum})\n') self.canceled = True signal.signal(signal.SIGHUP, sig_handler) @@ -4717,8 +4717,14 @@ class GfptarCommand(Command): # yield tuple(serial, gen, tar_path, dbgz_path, db_path) def list_tar_dbgz_db(self, indir_url, sort=False, progress=False): - logger.debug('list_tar_dbgz_db') + logger.debug('list_tar_dbgz_db: start: %s', indir_url.url_str) + try: + yield from self.list_tar_dbgz_db0(indir_url, + sort=sort, progress=progress) + finally: + logger.debug('list_tar_dbgz_db: done: %s', indir_url.url_str) + def list_tar_dbgz_db0(self, indir_url, sort=False, progress=False): tar_list, dbgz_list = self.list_tar_list_dbgz_from_dir(indir_url) tar_dict = {} @@ -4773,7 +4779,7 @@ class GfptarCommand(Command): for sig in sigs: orig_sig_handler[sig] = signal.getsignal(sig) - self.sig_ignore() # for parent process + self.sig_ignore() # Concurrent execution for each generation with multiprocessing.Manager() as manager: # subprocess @@ -4783,10 +4789,10 @@ class GfptarCommand(Command): def sig_handler(signum, frame): if share_cancel.value == 0: # logger.warning(f'Interrupt (signal={signum}') - sys.stderr.write(f'\nInterrupt[2] (signal={signum})\n') + sys.stderr.write(f'\nInterrupt (signal={signum})\n') share_cancel.value = 1 + self.cancel() - # for Manager process signal.signal(signal.SIGHUP, sig_handler) signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) @@ -4860,8 +4866,9 @@ class GfptarCommand(Command): arg) job_list.append((future, serial, tar_path, dbgz_path)) yield from result_all(job_list, gen) - if share_cancel.value != 0: - cancel = True + if share_cancel.value != 0: + cancel = True + break # no longer be able to access Manager for sig in sigs: # restore @@ -4992,7 +4999,7 @@ class InfoDB: def sig_handler(signum, frame): if not cls.signal_canceled: pid = os.getpid() - logger.info(f'Interrupt[3] (signal={signum}) (PID={pid})') + logger.info(f'Interrupt (signal={signum}) (PID={pid})') cls.signal_canceled = True signal.signal(signal.SIGHUP, sig_handler) From 4ac364e5d04571133dd9972c18c60c42fc25573f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 12 Aug 2024 10:15:40 +0900 Subject: [PATCH 072/143] gfptar: new option: --update --- gftool/gfptar/gfptar | 125 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 101 insertions(+), 24 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f2addb7fb..562dd7033 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -736,7 +736,7 @@ def convert_message(error): return f'{error.__class__.__name__}: {str(error)}' -class Command(metaclass=abc.ABCMeta): +class Program(metaclass=abc.ABCMeta): def init(self, name) -> NoReturn: self.name = name self._docopt = docopt(self.getDoc()) @@ -2453,7 +2453,7 @@ def ignore_exception(ignore): raise -class GfptarCommand(Command): +class GfptarProgram(Program): LIST_SUFFIX = '.lst' # to ignore old files TMP_SUFFIX = '.tmp' SERIAL_FORMAT = '%04d' @@ -2613,6 +2613,7 @@ class GfptarCommand(Command): # already reported pass else: + sys.stderr.write('\n') logger.error(convert_message(e)) sys.exit(1) finally: @@ -3304,7 +3305,8 @@ class GfptarCommand(Command): TABLE_tar_entry = 'tar_entry' - def create_common(self, outdir, basedir, infiles): + def cmd_create_init(self, cmd_name, outdir, basedir): + self.cmd_name = 
cmd_name self.options_init() logger.debug('create start: outdir=%s, basedir=%s', outdir, basedir) self.outdir = outdir @@ -3346,8 +3348,6 @@ class GfptarCommand(Command): self.start_time = time.time() self.next_time = self.start_time + self.progress_interval self.listing = True - if self.progress_enabled: - self.progress_for_create(self.start_time) self.gfsched_lock = None self.gfsched_next = 0 @@ -3355,17 +3355,51 @@ class GfptarCommand(Command): self.cannot_be_archived = 0 self.create_job_init() # before creating threads - - def cmd_update(self, outdir, basedir, infiles): - self.create_common(outdir, basedir, infiles) - # TODO - print("TODO...") + self.fattr_dict_list = [] def cmd_create(self, outdir, basedir, infiles): - self.create_common(outdir, basedir, infiles) + self.cmd_create_init('create', outdir, basedir) self.outdir_url.create_new_dir() + serial = 1 + gen = 1 + self.cmd_create_common(serial, gen, infiles) + + def cmd_update(self, outdir, basedir, infiles): + self.cmd_create_init('update', outdir, basedir) + if not self.outdir_url.exists(): + raise FileNotFoundError(outdir) + broken_count = 0 + max_serial = 0 + max_gen = 0 + suffix = '.' + self.compress_type + for serial, gen, tar_path, dbgz_path, db_path in \ + self.list_tar_dbgz_db(self.outdir_url, sort=True, + progress=True): + if self.is_canceled(): + raise self.error_canceled() + if serial > max_serial: + max_serial = serial + if gen > max_gen: + max_gen = gen + if dbgz_path is None or tar_path is None: + broken_count += 1 + continue + if not tar_path.endswith(suffix): + raise GfptarError(f"Tar compression type mismatch: {tar_path}") + db = DB(db_path) + fattr_dict = DBDict(db, FileAttr2, InfoDB.TABLE_ENTRY) + self.fattr_dict_list.append(fattr_dict) + max_serial += 1 + max_gen += 1 + self.start_time = time.time() + self.next_time = self.start_time + self.progress_interval + self.cmd_create_common(max_serial, max_gen, infiles) + if broken_count > 0: + raise GfptarError("Some .tar.* or .db.gz are broken:" + f" error={broken_count}") - infiles_checked = [] + def cmd_create_common(self, serial, gen, infiles): + infiles_checked = set() for infile in infiles: # infile_url = GfURL.init(infile) # if not infile_url.is_local(): @@ -3384,7 +3418,7 @@ class GfptarCommand(Command): if infile == '..' or infile.startswith('../'): raise GfException('specifying parent directory (..) ' + 'as members is not allowed: ' + infile) - infiles_checked.append(infile) + infiles_checked.add(infile) def listdir_switch(gfurl): if self.dummy_num is not None: @@ -3397,11 +3431,42 @@ class GfptarCommand(Command): return gfurl.listdir(recursive=True, first=True, hardlink_warn=self.hardlink_warn) + def is_update_target1(entry): + return True + + def is_update_target2(entry): + # entry is an input file. + path = entry.subpath(self.basedir_url) + # compare mtime from newer .db + for fattr_dict in reversed(self.fattr_dict_list): + # fattr_dict has existing files. + fattr = fattr_dict.get(path) + if fattr is None: + continue + elif int(entry.mtime) > int(fattr.mtime): # sec. 
+ logger.debug(f"is_update_target2: path={path}:" + f" entry.mtime({entry.mtime}) >" + f" fattr.mtime({fattr.mtime})") + return True + elif fattr.size != entry.size: + logger.debug(f"is_update_target2: path={path}:" + f" fattr.size({fattr.size})" + f" != entry.size({entry.size})") + return True + else: + return False + # not found + logger.debug(f"is_update_target2: path={path}: not found (True)") + return True + + if len(self.fattr_dict_list) > 0: + is_update_target = is_update_target2 + else: + is_update_target = is_update_target1 + tmpdir = self.tmpdir - tardb_prefix = os.path.join(tmpdir.name, 'create') + tardb_prefix = os.path.join(tmpdir.name, self.cmd_name) tardb_fmt = f'_{self.SERIAL_FORMAT}.db' - serial = 1 - gen = 1 # TODO # to reduce memory usage filelist_db = DB(tardb_prefix + tardb_fmt % serial) filelist = DBList(filelist_db, GfURLEntry, self.TABLE_tar_entry) @@ -3411,6 +3476,9 @@ class GfptarCommand(Command): cannot_read_dir = 0 has_error = None + if self.progress_enabled: + self.progress_for_create(self.start_time) + for infile in infiles_checked: if self.is_canceled(): logger.debug('Canceled (listdir 1): serial=%d', serial) @@ -3425,6 +3493,9 @@ class GfptarCommand(Command): logger.debug('Canceled (listdir 2): serial=%d', serial) break logger.debug('listdir: entry.path=%s', entry.path) + if not is_update_target(entry): + continue + # include length of path this_size = entry.size_all() with self.lock(): # for progress @@ -3515,6 +3586,8 @@ class GfptarCommand(Command): raise has_error if self.is_canceled(): raise self.error_canceled() + if self.total_num == 0: + print('No files were updated.') def create_job_init(self): if self.MT_enabled(): @@ -4187,6 +4260,7 @@ class GfptarCommand(Command): logger.debug('target_set.add: %s', tar_path) target_set.add(tar_path) # select this tar file else: + # TODO gen -> tar_path target_set.add(tar_path) # use all tar files if self.progress_enabled: now = time.time() @@ -4242,7 +4316,8 @@ class GfptarCommand(Command): self.db_target.close() self.db_target.unlink() if error_num > 0: - raise GfptarError(f'Total errors encountered: {error_num}') + raise GfptarError("Some .tar.* or .db.gz are broken:" + f" error={error_num}") def extract_directories(self, directory_set): logger.debug('extract_directories') @@ -4553,7 +4628,7 @@ class GfptarCommand(Command): total_size_str = self._humanize(self.total_size) bytes_per_sec_str = self._humanize(bytes_per_sec) ent_per_sec_str = self._humanize(ent_per_sec) - sys.stdout.write(f'\rcreate: {percent_str}% ' + sys.stdout.write(f'\r{self.cmd_name}: {percent_str}% ' f'{stored_size_str}/{total_size_str}B ' f'{stored_num_str}/{total_num_str}Ent ' f'{sec_str} ' @@ -4663,11 +4738,12 @@ class GfptarCommand(Command): logger.debug('os.remove: %s', db_path) os.remove(db_path) if error_num > 0: - raise GfptarError(f'Total errors encountered: {error_num}') + raise GfptarError("Some .tar.* or .db.gz are broken:" + f" error={error_num}") def sort_tar_or_db_list(self, tar_or_db_list): def id_key(value): - return value[0] # serial + return value[0] # 0:serial, 1:gen, 2:path tar_or_db_list.sort(key=id_key) return tar_or_db_list @@ -4805,7 +4881,8 @@ class GfptarCommand(Command): self.progress_for_sync_infodb(now) if sort: # sort by gen - gen_tarlist_list = sorted(gen_tarlist_dict.items()) + gen_tarlist_list = sorted(gen_tarlist_dict.items(), + key=lambda x: x[0]) else: gen_tarlist_list = gen_tarlist_dict.items() @@ -4861,7 +4938,7 @@ class GfptarCommand(Command): self.bufsize, self.use_fsync, self.dry_run) # 
InfoDB.generate_db_and_dbgz is classmethod, # because ProcessPoolExecutor cannot serialize - # members of "GfptarCommand (self)" object. + # members of "GfptarProgram (self)" object. future = executor.submit(InfoDB.generate_db_and_dbgz, arg) job_list.append((future, serial, tar_path, dbgz_path)) @@ -5183,7 +5260,7 @@ Options: -j, --jobs=NUM the number of jobs to copy per tar file in parallel [default: 4] -s, --size=BYTES assumed bytes per output file [default: 200Mi] - -T, --type=TYPE compression type (and tar archive suffix) + -T, --type=TYPE compression type and tar archive suffix - gz : use gzip (*.tar.gz) - bz2: use bzip2 (*.tar.bz2) - xz : use xz (*.tar.xz) @@ -5296,5 +5373,5 @@ _schema = Schema({ if __name__ == '__main__': - gfptar = GfptarCommand(progname) + gfptar = GfptarProgram(progname) gfptar.run() From 1ce0e2f0743044e5bc7b91e93385a1df650de50f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 12 Aug 2024 17:49:12 +0900 Subject: [PATCH 073/143] gfptar --test: add test_update() --- gftool/gfptar/gfptar | 236 +++++++++++++++++++++++++++++++------------ 1 file changed, 169 insertions(+), 67 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 562dd7033..f2900ba3c 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -1970,7 +1970,7 @@ class GfURLLocal(GfURL): user=None, group=None, use_fsync=True, hostname=None): tmpmode = mode | 0o200 # necessary (Permission denied at ex.0o400) fd = os.open(path=self.url_str, - flags=(os.O_WRONLY | os.O_CREAT | os.O_EXCL), + flags=(os.O_WRONLY | os.O_CREAT | os.O_TRUNC), mode=tmpmode, ) if textmode: @@ -2642,6 +2642,7 @@ class GfptarProgram(Program): self.test_init() self.test_opt_pattern(full=False) self.test_member() + self.test_update() self.test_specified_dir() def test_main_long(self): @@ -2649,6 +2650,7 @@ class GfptarProgram(Program): self.test_unit() self.test_opt_pattern(full=True) self.test_member() + self.test_update() # self.test_invalid('url', 'gfarm:/tmp', 'dst', False) # allowed self.test_invalid('dot1', '.', 'dst', True) self.test_invalid('dot2', '', 'dst', False) @@ -3025,41 +3027,118 @@ class GfptarProgram(Program): workdir_local_url.remove_tree(remove_readonly=True) workdir_gfarm_url.remove_tree(remove_readonly=True) + def test_update(self): + testname = 'gfptar-test-update' + self.test_workdir_set(testname) + # workdir_local = self.workdir_local + # workdir_gfarm = self.workdir_gfarm + workdir_local_url = self.workdir_local_url + workdir_gfarm_url = self.workdir_gfarm_url + + testsrc_name1 = 'test-src1' + testsrc_name2 = 'test-src2' + srcdir_local1 = workdir_local_url.url_join(testsrc_name1) + srcdir_local2 = workdir_local_url.url_join(testsrc_name2) + srcdir_gfarm1 = workdir_gfarm_url.url_join(testsrc_name1) + srcdir_gfarm2 = workdir_gfarm_url.url_join(testsrc_name2) + + tree = [ + (self.D, 'dir1', 0o715, 10, None, True), + (self.F, 'dir1/file1', 0o664, 10, None, True), + (self.F, 'dir1/file2', 0o664, 10, None, True), + (self.S, 'dir1/symlink1', 0o777, 10, 'file1', True), + ] + self.test_create_tree(srcdir_local1, tree) + self.test_create_tree(srcdir_gfarm1, tree) + tree = [ + (self.D, 'dir1', 0o750, 11, None, True), + (self.F, 'dir1/file1', 0o660, 100, None, True), + (self.F, 'dir1/file2', 0o664, 10, None, True), # not updated + (self.S, 'dir1/symlink1', 0o770, 11, 'file1', True), + ] + self.test_create_tree(srcdir_local2, tree) + self.test_create_tree(srcdir_gfarm2, tree) + + test1_name = 'test-1-create' + test1_dir_local = workdir_local_url.url_join(test1_name) + test2_name = 
'test-2-extract' + test2_dir_local = workdir_local_url.url_join(test2_name) + + test3_name = 'test-3-create' + test3_dir_gfarm = workdir_gfarm_url.url_join(test3_name) + test4_name = 'test-4-extract' + test4_dir_local = workdir_local_url.url_join(test4_name) + + members = ['dir1'] + + # Local -> Local(tar) + self.cmd_create(test1_dir_local, srcdir_local1, members) + # Local -> Local(tar) (update) + self.cmd_update(test1_dir_local, srcdir_local2, members) + # Local(tar) -> Local + self.cmd_extract(test2_dir_local, test1_dir_local, []) + + # self.test_mode = False + # self.cmd_list_verbose(test1_dir_local) + # self.test_mode = True + + if not self.test_compare_local(test2_dir_local, srcdir_local2, + same_owner=True): + logger.error_exit(1, testname + ' ... FAIL (different data)(1)') + + # Gfarm -> Gfarm(tar) + self.cmd_create(test3_dir_gfarm, srcdir_gfarm1, members) + # Gfarm -> Gfarm(tar) (update) + self.cmd_update(test3_dir_gfarm, srcdir_gfarm2, members) + # Gfarm(tar) -> Local + self.cmd_extract(test4_dir_local, test3_dir_gfarm, []) + + if not self.test_compare_local(test4_dir_local, srcdir_local2, + same_owner=True): + logger.error_exit(1, testname + ' ... FAIL (different data)(2)') + + print(testname + ' ... PASS') + workdir_local_url.remove_tree(remove_readonly=True) + workdir_gfarm_url.remove_tree(remove_readonly=True) + + # for test + F = 'file' + D = 'directory' + S = 'symlink' + L = 'hardlink' + def test_prepare_srcdir(self, dir_url_str, readonly=False, noread=False, link=False, longname=False): logger.debug('readonly=%s, noread=%s, link=%s, longname=%s', readonly, noread, link, longname) - F = 'file' - D = 'directory' - S = 'symlink' - L = 'hardlink' longname = ('0123456789' * 30)[:255] tree = [ - # (F, 'file1', 0o664, 1234567890, None, True), - (F, 'gfarm:abc', 0o644, 1234567890, None, True), # not Gfarm URL - (D, 'dir1', 0o715, 2234567890, None, True), + # (self.F, 'file1', 0o664, 1234567890, None, True), + (self.F, 'gfarm:abc', 0o644, 1234567890, + None, True), # not Gfarm URL + (self.D, 'dir1', 0o715, 2234567890, None, True), ] tree_readonly = [ - (D, 'dir1/readonly', 0o700, # chmod 0o500 later + (self.D, 'dir1/readonly', 0o700, # chmod 0o500 later 3234567891, None, True), - (F, 'dir1/readonly/file#2', 0o456, 4234567892, None, True), + (self.F, 'dir1/readonly/file#2', 0o456, 4234567892, None, True), ] tree_noread = [ - (F, 'dir1/noread-file', 0o000, 1234567891, None, False), - (D, 'dir1/noread-dir', 0o700, # chmod 0o000 later + (self.F, 'dir1/noread-file', 0o000, 1234567891, None, False), + (self.D, 'dir1/noread-dir', 0o700, # chmod 0o000 later 1234567892, None, True), - (F, 'dir1/noread-dir/file3', 0o400, 1234567893, None, False), + (self.F, 'dir1/noread-dir/file3', 0o400, 1234567893, None, False), ] tree_link = [ - (D, 'dir1/ディレクトリ 2', 0o755, 5234567890, None, True), - (L, 'dir1/ディレクトリ 2/hardlink1', + (self.D, 'dir1/ディレクトリ 2', 0o755, 5234567890, None, True), + (self.L, 'dir1/ディレクトリ 2/hardlink1', 0o400, 6234567890, 'gfarm:abc', True), - (S, 'dir1/ディレクトリ 2/symlink1', + (self.S, 'dir1/ディレクトリ 2/symlink1', 0o777, 7234567890, 'hardlink1', True), ] tree_longname = [ - (F, 'dir1/' + longname, 0o775, 9234567890, None, True), + (self.F, 'dir1/' + longname, 0o775, 9234567890, None, True), ] if readonly: tree += tree_readonly @@ -3069,6 +3148,19 @@ class GfptarProgram(Program): tree += tree_link if longname: tree += tree_longname + + count = self.test_create_tree(dir_url_str, tree) + if readonly: + url_str = os.path.join(dir_url_str, 'dir1/readonly') + readonly_url = GfURL.init(url_str) 
+ readonly_url.chmod(mode=0o500) + if noread: + url_str = os.path.join(dir_url_str, 'dir1/noread-dir') + noread_url = GfURL.init(url_str) + noread_url.chmod(mode=0o000) + return count + + def test_create_tree(self, dir_url_str, tree): count = 0 srcdir_url = GfURL.init(dir_url_str) srcdir_url.mkdir() @@ -3081,18 +3173,18 @@ class GfptarProgram(Program): linkname = ent[4] readable = ent[5] url = GfURL.init(srcdir_url.url_join(path)) - if ftype == F: + if ftype == self.F: with url.writeopen(textmode=True, mode=mode, mtime=mtime, use_fsync=self.use_fsync) as f: - f.write(path) - elif ftype == D: + f.write(path + str(mtime)) + elif ftype == self.D: url.mkdir() - elif ftype == S: + elif ftype == self.S: url.symlink(linkname) url.chmod(mode, mtime=mtime, follow_symlinks=False) - elif ftype == L: + elif ftype == self.L: url.hardlink(srcdir_url.url_join(linkname)) url.chmod(mode, mtime=mtime) else: @@ -3106,15 +3198,9 @@ class GfptarProgram(Program): mtime = ent[3] # linkname = ent[4] url = GfURL.init(srcdir_url.url_join(path)) - if ftype == D: + if ftype == self.D: url.chmod(mode, mtime=mtime) srcdir_url.chmod(mode=0o700, mtime=0) - if readonly: - readonly_url = GfURL.init(srcdir_url.url_join('dir1/readonly')) - readonly_url.chmod(mode=0o500) - if noread: - noread_url = GfURL.init(srcdir_url.url_join('dir1/noread-dir')) - noread_url.chmod(mode=0o000) return count # the number of readable files def test_specified_dir(self): @@ -3374,7 +3460,7 @@ class GfptarProgram(Program): suffix = '.' + self.compress_type for serial, gen, tar_path, dbgz_path, db_path in \ self.list_tar_dbgz_db(self.outdir_url, sort=True, - progress=True): + progress=self.progress_enabled): if self.is_canceled(): raise self.error_canceled() if serial > max_serial: @@ -4160,7 +4246,6 @@ class GfptarProgram(Program): self.extract_main() tmpdir.cleanup() - TABLE_target_set = 'target_set' TABLE_directory_set = 'directory_set' TABLE_member_set = 'member_set' TABLE_archive_dict = 'archive_dict' @@ -4169,7 +4254,6 @@ class GfptarProgram(Program): TABLE_dirstat_dict = 'dirstat_dict' def extract_schedule(self): - target_set = DBSet(self.db_target, StrObj, self.TABLE_target_set) directory_set = DBSet(self.db, StrObj, self.TABLE_directory_set) member_set = DBSet(self.db, StrObj, self.TABLE_member_set) archive_dict = DBDict(self.db, JsonObj, self.TABLE_archive_dict) @@ -4188,6 +4272,8 @@ class GfptarProgram(Program): path = path.lstrip('/') member_check_dict[path] = False # initialize + gen_tarlist_dict = defaultdict(list) # gen -> tarlist + self.total_num = 0 self.total_size = 0 self.start_time = time.time() @@ -4197,7 +4283,7 @@ class GfptarProgram(Program): error_num = 0 for serial, gen, tar_path, dbgz_path, db_path in \ - self.list_tar_dbgz_db(self.indir_url, sort=False, + self.list_tar_dbgz_db(self.indir_url, sort=True, progress=False): if self.is_canceled(): raise self.error_canceled() @@ -4257,11 +4343,12 @@ class GfptarProgram(Program): is_target = True archive_dict.clear() # re-use for next tar file if is_target: - logger.debug('target_set.add: %s', tar_path) - target_set.add(tar_path) # select this tar file + logger.debug('gen_tarlist_dict[%d].append(%s)', + gen, tar_path) + gen_tarlist_dict[gen].append(tar_path) else: - # TODO gen -> tar_path - target_set.add(tar_path) # use all tar files + # use all tar files + gen_tarlist_dict[gen].append(tar_path) if self.progress_enabled: now = time.time() if now >= self.next_time: @@ -4280,10 +4367,10 @@ class GfptarProgram(Program): ' in archive files: ' + member) member_check_dict.clear() 
archive_dict.clear() - return target_set, directory_set, member_set, error_num + return gen_tarlist_dict, directory_set, member_set, error_num def extract_main(self): - (target_set, directory_set, + (gen_tarlist_dict, directory_set, member_set, error_num) = self.extract_schedule() self.outdir_url.create_new_dir() @@ -4303,9 +4390,11 @@ class GfptarProgram(Program): self.gfsched_list = None if self.MT_enabled(): - self.extract_from_archives_MT(target_set, member_set) + for gen, tarlist in gen_tarlist_dict.items(): + self.extract_from_archives_MT(tarlist, member_set) else: - self.extract_from_archives(target_set, member_set) + for gen, tarlist in gen_tarlist_dict.items(): + self.extract_from_archives(tarlist, member_set) if self.progress_enabled: self.progress_for_extract(time.time()) sys.stdout.write('\n') @@ -4390,25 +4479,25 @@ class GfptarProgram(Program): dir_url.chmod(tarinfo.mode, mtime=tarinfo.mtime) self.info('update_stat: {}, mode={}', d, oct(tarinfo.mode)) - def extract_from_archives(self, target_set, member_set): + def extract_from_archives(self, tarlist, member_set): self.lock_init(False) - serial = 0 - for target in target_set.iterator(sort='ASC'): - logger.debug('target_set: %s', target) - serial += 1 - self.extract_from_a_tar(serial, target, member_set) + index = 0 + for target in tarlist: + logger.debug('target tar: %s', target) + index += 1 + self.extract_from_a_tar(index, target, member_set) - def extract_from_archives_MT(self, target_set, member_set): + def extract_from_archives_MT(self, tarlist, member_set): self.lock_init(True) with concurrent.futures.ThreadPoolExecutor( max_workers=self.max_workers) as executor: self.futures = {} # tar filenames - serial = 0 - for target in target_set.iterator(sort='ASC'): - logger.debug('target_set: %s', target) - serial += 1 + index = 0 + for target in tarlist: + logger.debug('target tar: %s', target) + index += 1 t = executor.submit(self.extract_from_a_tar, - serial, target, member_set) + index, target, member_set) self.futures[t] = target for t in concurrent.futures.as_completed(self.futures, @@ -4428,17 +4517,17 @@ class GfptarProgram(Program): # t2.cancel() raise exc - def extract_from_a_tar(self, serial, target, member_set): + def extract_from_a_tar(self, index, target, member_set): try: - self.extract_from_a_tar0(serial, target, member_set) + self.extract_from_a_tar0(index, target, member_set) except Exception: if self.is_canceled(): raise self.error_canceled() else: raise - def extract_from_a_tar0(self, serial, target, member_set): - logger.debug('extract_from_a_tar start: serial=%d', serial) + def extract_from_a_tar0(self, index, target, member_set): + logger.debug('extract_from_a_tar start: index=%d', index) if self.is_canceled(): logger.debug('Canceled (extract 1): name=%s', target) return @@ -4446,16 +4535,15 @@ class GfptarProgram(Program): use_gfarm_command=self.use_gfarm_command) tar = GfTarFile.extract_open(arch_url, self.bufsize) try: - self.extract_from_a_tar1(serial, target, member_set, tar) + self.extract_from_a_tar1(index, target, member_set, tar) with self.lock(): self.info('DONE: {}', arch_url.url_str) finally: tar.close() - def extract_from_a_tar1(self, serial, target, member_set, tar): + def extract_from_a_tar1(self, index, target, member_set, tar): with self.lock(): members_num = len(member_set) - index = serial while True: if self.is_canceled(): logger.debug('Canceled (extract 2): name=%s', target) @@ -4532,7 +4620,14 @@ class GfptarProgram(Program): elif tarinfo.issym(): logger.debug('extract,link: %s, %s', 
outfile, tarinfo.linkname) - outurl.symlink(tarinfo.linkname) + try: + outurl.symlink(tarinfo.linkname) + except Exception: + if outurl.exists(): + outurl.remove() + outurl.symlink(tarinfo.linkname) + else: + raise if self.same_owner: outurl.chmod(tarinfo.mode, mtime=tarinfo.mtime, user=tarinfo.uname, group=tarinfo.gname, @@ -4675,13 +4770,20 @@ class GfptarProgram(Program): return self.cmd_list(indir, verbose=True) def cmd_list(self, indir, verbose=True): + if self.test_mode: + outf = open(os.devnull, 'w') + else: + outf = sys.stdout + try: + return self.cmd_list0(outf, indir, verbose) + finally: + if self.test_mode: + outf.close() + + def cmd_list0(self, outf, indir, verbose): self.options_init() quiet = self.quiet indir_url = GfURL.init(indir) - if self.test_mode: - f = os.devnull - else: - f = sys.stdout def print_quiet(serial, gen, path, fattr): pass @@ -4696,13 +4798,13 @@ class GfptarProgram(Program): mtime_str = dt_object.strftime('%Y-%m-%d %H:%M') print(f'g{gen}_{serial:04} {fattr.ftype} {fattr.mode:04o}' f' {fattr.user:>8}/{fattr.group:<8}' - f' {fattr.size:9d} {mtime_str} {name}', file=f) + f' {fattr.size:9d} {mtime_str} {name}', file=outf) def print_simple(serial, gen, path, fattr): name = path if fattr.ftype == InfoDB.TYPE_DIR: name = name + '/' - print(f'{fattr.ftype} {name}', file=f) + print(f'{fattr.ftype} {name}', file=outf) if quiet: print_func = print_quiet From 02efba1fdc925ccd7f9065e0ba991922531b32c2 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 13 Aug 2024 18:14:22 +0900 Subject: [PATCH 074/143] gfptar --test: fix AssertionError --- gftool/gfptar/gfptar | 58 +++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f2900ba3c..7caecd98f 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -1415,7 +1415,7 @@ class GfURLGfarm(GfURL): def remove_tree(self, remove_readonly=False): path = self.url_str if path == '/' or path == '': - raise AssertionError + raise AssertionError('remove_tree: "/" or ""') try: execcmd(['gfrm', '-rf', path]) except MemoryError: @@ -1588,7 +1588,8 @@ class GfURLGfarm(GfURL): # ex. gfarm:/home/user1/dir: -> gfarm:/home/user1/dir dirname = line[:-1] if not dirname.startswith(self.url_str): - raise AssertionError + raise AssertionError(f'{dirname}.startswith(' + f'{self.url_str})') first = False def gfsched(self, is_file=False, write_mode=False, number=None): @@ -1825,7 +1826,7 @@ class GfURLLocal(GfURL): def remove_tree(self, remove_readonly=False): path = self.path if path == '/' or path == '': - raise AssertionError + raise AssertionError('remove_tree: "/" or ""') if remove_readonly: self._chmod_recursive(path, 0o700, 0o600) shutil.rmtree(path) @@ -3736,25 +3737,28 @@ class GfptarProgram(Program): # timeout = 10 # process.join(timeout) process.terminate() - process.kill() # no effect in Python 3.6 - # process.close() may raise ValueError. - # (I don't know the reason): - # ValueError: Cannot close a process while it is still - # running. You should first call join() or terminate(). - # - # Retry process.close() if ValueError is caught. - ok = False - for i in range(50): # retry, max 5s - try: - process.close() - ok = True - break - except ValueError as e: - logger.debug('retry[%d] process.close() (index=%d):', - i, index, exc_info=e) - time.sleep(0.1) - if not ok: - process.close() + + if hasattr(process, 'kill'): + process.kill() # Python 3.7 or later + + # process.close() may raise ValueError. 
+ # (I don't know the reason): + # ValueError: Cannot close a process while it is still + # running. You should first call join() or terminate(). + # + # Retry process.close() if ValueError is caught. + ok = False + for i in range(50): # retry, max 5s + try: + process.close() # Python 3.7 or later + ok = True + break + except ValueError as e: + logger.debug('retry[%d] process.close() (index=%d):', + i, index, exc_info=e) + time.sleep(0.1) + if not ok: + process.close() logger.debug('process_close() finished (index=%d)', index) def create_job_final(self, timeout=None): @@ -3767,9 +3771,6 @@ class GfptarProgram(Program): self.worker_terminate() def worker_terminate(self): - if getattr(self, 'worker_list', None) is None: - return - err_list = [] for index, worker in enumerate(self.worker_list): started, process, inq, outq = worker if started.status: @@ -3785,9 +3786,10 @@ class GfptarProgram(Program): if started.status: self.process_close(process, index=index) started.status = False - except Exception: - err_list.append(worker) - self.worker_list = err_list + except Exception as e: + logger.debug("self.process_close", exc_info=e) + # ignore + self.worker_list = [] def _create_job_check_MT(self, timeout=0.1): has_error = None From fcee609a70b1dccf9dfb2c85d49c7be10569a104 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 14 Aug 2024 11:54:59 +0900 Subject: [PATCH 075/143] gfptar: simplify --- gftool/gfptar/gfptar | 96 +++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 54 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 7caecd98f..d3dfe8157 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4931,9 +4931,7 @@ class GfptarProgram(Program): self.total_tar_num = 0 self.current_tar_num = 0 - # generation -> tar files - gen_tarlist_dict = defaultdict(list) - # tar_dbgz_list = [] + tar_dbgz_list = [] for serial, gen_dbgz in dbgz_dict.items(): gen, dbgz_path = gen_dbgz gen_tar = tar_dict.get(serial, None) @@ -4941,9 +4939,10 @@ class GfptarProgram(Program): gen2, tar_path = gen_tar else: tar_path = None - # tar_dbgz_list.append((serial, gen, tar_path, dbgz_path)) - gen_tarlist_dict[gen].append((serial, tar_path, dbgz_path)) + tar_dbgz_list.append((serial, gen, tar_path, dbgz_path)) self.total_tar_num += 1 + if sort: # sort by serial + tar_dbgz_list.sort(key=lambda x: x[0]) del dbgz_dict del tar_dict @@ -4984,14 +4983,8 @@ class GfptarProgram(Program): now = time.time() self.progress_for_sync_infodb(now) - if sort: # sort by gen - gen_tarlist_list = sorted(gen_tarlist_dict.items(), - key=lambda x: x[0]) - else: - gen_tarlist_list = gen_tarlist_dict.items() - - def result_one(one, gen): - future, serial, tar_path, dbgz_path = one + def result_one(one): + future, gen, serial, tar_path, dbgz_path = one try: db_path, dbgz_path, ent_num, size = future.result() self.current_ent_num += ent_num @@ -5008,48 +5001,43 @@ class GfptarProgram(Program): if self.save_e is None: self.save_e = e - def result_all(previous_list, gen): + def result_all(previous_list): for one in previous_list: - yield from result_one(one, gen) - - for gen, tarlist in gen_tarlist_list: - if sort: # sort by serial - self.sort_tar_or_db_list(tarlist) - - # concurrent processes per generation (gen) - job_list = [] - with concurrent.futures.ProcessPoolExecutor( - max_workers=self.max_workers) as executor: - for serial, tar_path, dbgz_path in tarlist: - if len(job_list) >= self.max_workers: - # wait for a previous job - yield from result_one(job_list[0], gen) - job_list 
= job_list[1:] # remove first - if tar_path is None: - # missing tar file - db_path = None - # wait for all previous jobs - yield from result_all(job_list, gen) - job_list = [] - yield serial, gen, tar_path, dbgz_path, db_path - continue - if dbgz_path is None: - update = True - else: - update = self.sync_infodb - arg = (lock, share_cancel, self.tmpdir.name, - update, gen, serial, tar_path, - self.bufsize, self.use_fsync, self.dry_run) - # InfoDB.generate_db_and_dbgz is classmethod, - # because ProcessPoolExecutor cannot serialize - # members of "GfptarProgram (self)" object. - future = executor.submit(InfoDB.generate_db_and_dbgz, - arg) - job_list.append((future, serial, tar_path, dbgz_path)) - yield from result_all(job_list, gen) - if share_cancel.value != 0: - cancel = True - break + yield from result_one(one) + + # concurrent processes per generation (gen) + job_list = [] + with concurrent.futures.ProcessPoolExecutor( + max_workers=self.max_workers) as executor: + for serial, gen, tar_path, dbgz_path in tar_dbgz_list: + if share_cancel.value != 0: + cancel = True + break + if len(job_list) >= self.max_workers: + # wait for a previous job + yield from result_one(job_list[0]) + job_list = job_list[1:] # remove first + if tar_path is None: + # missing tar file + db_path = None + # wait for all previous jobs + yield from result_all(job_list) + job_list = [] + yield serial, gen, tar_path, dbgz_path, db_path + continue + if dbgz_path is None: + update = True + else: + update = self.sync_infodb + arg = (lock, share_cancel, self.tmpdir.name, + update, gen, serial, tar_path, + self.bufsize, self.use_fsync, self.dry_run) + # InfoDB.generate_db_and_dbgz is classmethod, + # because ProcessPoolExecutor cannot serialize + # members of "GfptarProgram (self)" object. + future = executor.submit(InfoDB.generate_db_and_dbgz, arg) + job_list.append((future, serial, gen, tar_path, dbgz_path)) + yield from result_all(job_list) # no longer be able to access Manager for sig in sigs: # restore From 9e7957a38cfe0e9829fd69f6e9bb341b912067b8 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 14 Aug 2024 12:40:06 +0900 Subject: [PATCH 076/143] gfptar: fix freeze when using --verbose --create --- gftool/gfptar/gfptar | 1 + 1 file changed, 1 insertion(+) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index d3dfe8157..d66dde656 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3995,6 +3995,7 @@ class GfptarProgram(Program): pid = os.getpid() logger.debug('create_a_tar_process: start, pid=%d', pid) + self.lock_init(False) # child process self.sig_init(message=False) while self.create_a_tar_process0(input_queue, output_queue): pass From c87977514d0119c74f0ec94638179651e5b54ebe Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 14 Aug 2024 15:07:08 +0900 Subject: [PATCH 077/143] gfptar: fix freeze (and simplify) --- gftool/gfptar/gfptar | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index d66dde656..ffa5d5c96 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3734,31 +3734,9 @@ class GfptarProgram(Program): return logger.debug('process_close() start (index=%s): pid=%s', index, process.pid) - # timeout = 10 - # process.join(timeout) - process.terminate() - - if hasattr(process, 'kill'): - process.kill() # Python 3.7 or later - - # process.close() may raise ValueError. - # (I don't know the reason): - # ValueError: Cannot close a process while it is still - # running. 
You should first call join() or terminate(). - # - # Retry process.close() if ValueError is caught. - ok = False - for i in range(50): # retry, max 5s - try: - process.close() # Python 3.7 or later - ok = True - break - except ValueError as e: - logger.debug('retry[%d] process.close() (index=%d):', - i, index, exc_info=e) - time.sleep(0.1) - if not ok: - process.close() + # process.terminate() + os.kill(process.pid, signal.SIGKILL) + process.join() logger.debug('process_close() finished (index=%d)', index) def create_job_final(self, timeout=None): From 22bf4597c75770552b723ff1b23c8d4ab36c72b0 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 15 Aug 2024 12:17:24 +0900 Subject: [PATCH 078/143] gfptar: simplify (remove unnecessary termination process) --- gftool/gfptar/gfptar | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index ffa5d5c96..41df4c927 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3705,7 +3705,6 @@ class GfptarProgram(Program): # multiprocessing.Manager().Queue() can be used # for ProcessPoolExecutor, but it is very slow. started = Started() - inq.gfptar_my_closed = False process = multiprocessing.Process(target=self.create_a_tar_process, args=(inq, outq)) started.status = False # lock required @@ -3751,12 +3750,11 @@ class GfptarProgram(Program): def worker_terminate(self): for index, worker in enumerate(self.worker_list): started, process, inq, outq = worker - if started.status: - if not inq.gfptar_my_closed: # avoid AssertionError - with ignore_exception(True): - inq.put(self.MSG_PARENT_EXIT) + # NOTE: processes will be killed. + # if started.status: + # with ignore_exception(True): + # inq.put(self.MSG_PARENT_EXIT) with ignore_exception(True): - inq.gfptar_my_closed = True inq.close() with ignore_exception(True): outq.close() @@ -3988,7 +3986,7 @@ class GfptarProgram(Program): if self.is_canceled(): # SIGTERM from parent return False # exit logger.debug('create_a_tar_process0: request=%s', request) - if request == self.MSG_PARENT_EXIT: + if request == self.MSG_PARENT_EXIT: # not used return False # exit elif request[0] == self.MSG_PARENT_START: output_queue.put(self.MSG_CHILD_READY) From 12bc12f03ad2572ee5d3ed3106e415107940e554 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 15 Aug 2024 15:31:55 +0900 Subject: [PATCH 079/143] gfptar --update: fix "OperationalError: unable to open database file" --- gftool/gfptar/gfptar | 48 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 41df4c927..4d0eb5c4f 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2514,6 +2514,14 @@ class GfptarProgram(Program): if self.memory_limit is not None: self.set_memory_limit(self.memory_limit) + soft_lim, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE) + logger.info(f"current soft RLIMIT_NOFILE: {soft_lim}") + logger.info(f"current hard RLIMIT_NOFILE: {hard_lim}") + new_soft_lim = hard_lim + resource.setrlimit(resource.RLIMIT_NOFILE, (new_soft_lim, hard_lim)) + soft_lim, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE) + logger.info(f"new soft RLIMIT_NOFILE: {soft_lim}") + self.compress_type = self.opt['--type'] self.compress_prog = self.opt['--use-compress-program'] self.gzip_prog = self.opt['--gzip-program'] @@ -3655,6 +3663,8 @@ class GfptarProgram(Program): self.create_job_final() + for fattr_dict in self.fattr_dict_list: + fattr_dict.close() 
tmpdir.cleanup()
 if self.progress_enabled:
@@ -4659,14 +4669,24 @@
 sec_str = format_seconds(sec, minhour=True)
 current_tar_num_str = self._humanize(self.current_tar_num)
 total_tar_num_str = self._humanize(self.total_tar_num)
- current_ent_num_str = self._humanize(self.current_ent_num)
- current_size_str = self._humanize(self.current_size)
+ if self.current_ent_num > 0:
+ current_ent_num_str = self._humanize(self.current_ent_num)
+ else:
+ current_ent_num_str = '?'
+ if self.current_size > 0:
+ current_size_str = self._humanize(self.current_size)
+ else:
+ current_size_str = '?'
 if sec > 0:
 ent_per_sec = self.current_ent_num / sec
 else:
 ent_per_sec = 0
 ent_per_sec_str = self._humanize(ent_per_sec)
- sys.stdout.write(f'\rsync-db: '
+ if self.sync_infodb:
+ name = 'sync-db'
+ else:
+ name = 'load-db'
+ sys.stdout.write(f'\r{name}: '
 f'{current_tar_num_str}/{total_tar_num_str}Tar '
 f'{current_size_str}B '
 f'{current_ent_num_str}Ent '
@@ -4775,9 +4795,12 @@
 name = name + ' -> ' + fattr.linkname
 dt_object = datetime.datetime.fromtimestamp(fattr.mtime)
 mtime_str = dt_object.strftime('%Y-%m-%d %H:%M')
- print(f'g{gen}_{serial:04} {fattr.ftype} {fattr.mode:04o}'
- f' {fattr.user:>8}/{fattr.group:<8}'
- f' {fattr.size:9d} {mtime_str} {name}', file=outf)
+ # print(f'g{gen}_{serial:04} {fattr.ftype} {fattr.mode:04o}'
+ # f' {fattr.user:>8}/{fattr.group:<8}'
+ # f' {fattr.size:9d} {mtime_str} {name}', file=outf)
+ outf.write('g%d_%04d %s %04o %8s/%-8s %9d %s %s\n' % (
+ gen, serial, fattr.ftype, fattr.mode, fattr.user,
+ fattr.group, fattr.size, mtime_str, name))
 def print_simple(serial, gen, path, fattr):
 name = path
@@ -4967,7 +4990,7 @@
 self.current_ent_num += ent_num
 self.current_size += size
 self.current_tar_num += 1
- if ent_num > 0 and progress:
+ if progress:
 now = time.time()
 self.progress_for_sync_infodb(now)
 yield serial, gen, tar_path, dbgz_path, db_path
@@ -5013,7 +5036,7 @@
 # because ProcessPoolExecutor cannot serialize
 # members of "GfptarProgram (self)" object.
 future = executor.submit(InfoDB.generate_db_and_dbgz, arg)
- job_list.append((future, serial, gen, tar_path, dbgz_path))
+ job_list.append((future, gen, serial, tar_path, dbgz_path))
 yield from result_all(job_list)
 # no longer be able to access Manager
 for sig in sigs:
@@ -5178,7 +5201,12 @@ class InfoDB:
 size = 0
 if not update and dbgz_url.exists():
 logger.debug('not update, decompress dbgz: %s', dbgz_path)
- InfoDB.decompress_copy(dbgz_path, db_path, bufsize, use_fsync)
+ use_fsync2 = False # workdir
+ InfoDB.decompress_copy(dbgz_path, db_path, bufsize, use_fsync2)
+ # NOTE: slow
+ # infodb = InfoDB(db_path)
+ # num = len(infodb.fattr_dict)
+ # infodb.db.close()
 else:
 # generate db and dbgz
 tar = GfTarFile.extract_open(tar_url, bufsize)
@@ -5314,6 +5342,8 @@ Limitations:
 - Hard links are not preserved.
 - File names cannot include newline characters.
 - Subsecond (less than a second) for mtime is not preserved.
+ - --update: If many *.db.gz files exist, input files may not be appended.
+ (For the upper limit, see the output of `ulimit -n -H`) Options: -t, --list=DIR list mode, From f6fbf9b1337fc21a8a35a15bc7b430504d17ad53 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 3 Sep 2024 22:59:07 +0900 Subject: [PATCH 080/143] gfptar: new option: -r,--append (Please use --ratio instead of the old -r option) --- gftool/gfptar/gfptar | 130 ++++++++++++++++++++++++++++++------------- 1 file changed, 92 insertions(+), 38 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 4d0eb5c4f..9e86b1cb8 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2579,6 +2579,12 @@ class GfptarProgram(Program): self.cmd_create(outdir, basedir, infiles) return + outdir = self.opt['--append'] + if outdir: + basedir = self.opt['--basedir'] + infiles = self.opt[''] + self.cmd_append(outdir, basedir, infiles) + outdir = self.opt['--update'] if outdir: basedir = self.opt['--basedir'] @@ -2651,7 +2657,7 @@ class GfptarProgram(Program): self.test_init() self.test_opt_pattern(full=False) self.test_member() - self.test_update() + self.test_update_append() self.test_specified_dir() def test_main_long(self): @@ -2659,7 +2665,7 @@ class GfptarProgram(Program): self.test_unit() self.test_opt_pattern(full=True) self.test_member() - self.test_update() + self.test_update_append() # self.test_invalid('url', 'gfarm:/tmp', 'dst', False) # allowed self.test_invalid('dot1', '.', 'dst', True) self.test_invalid('dot2', '', 'dst', False) @@ -3036,8 +3042,8 @@ class GfptarProgram(Program): workdir_local_url.remove_tree(remove_readonly=True) workdir_gfarm_url.remove_tree(remove_readonly=True) - def test_update(self): - testname = 'gfptar-test-update' + def test_update_append(self): + testname = 'gfptar-test-update-append' self.test_workdir_set(testname) # workdir_local = self.workdir_local # workdir_gfarm = self.workdir_gfarm @@ -3046,10 +3052,15 @@ class GfptarProgram(Program): testsrc_name1 = 'test-src1' testsrc_name2 = 'test-src2' + testsrc_name3 = 'test-src3' srcdir_local1 = workdir_local_url.url_join(testsrc_name1) srcdir_local2 = workdir_local_url.url_join(testsrc_name2) + srcdir_local3 = workdir_local_url.url_join(testsrc_name3) srcdir_gfarm1 = workdir_gfarm_url.url_join(testsrc_name1) srcdir_gfarm2 = workdir_gfarm_url.url_join(testsrc_name2) + srcdir_gfarm3 = workdir_gfarm_url.url_join(testsrc_name3) + expect_dir_name = 'test-expect' + expect_dir_local = workdir_local_url.url_join(expect_dir_name) tree = [ (self.D, 'dir1', 0o715, 10, None, True), @@ -3059,31 +3070,53 @@ class GfptarProgram(Program): ] self.test_create_tree(srcdir_local1, tree) self.test_create_tree(srcdir_gfarm1, tree) + tree = [ (self.D, 'dir1', 0o750, 11, None, True), - (self.F, 'dir1/file1', 0o660, 100, None, True), - (self.F, 'dir1/file2', 0o664, 10, None, True), # not updated + (self.F, 'dir1/file1', 0o660, 11, None, True), + (self.F, 'dir1/file2', 0o664, 9, None, True), # not updated (self.S, 'dir1/symlink1', 0o770, 11, 'file1', True), ] self.test_create_tree(srcdir_local2, tree) self.test_create_tree(srcdir_gfarm2, tree) - test1_name = 'test-1-create' + tree = [ + (self.D, 'dir2', 0o700, 10, None, True), + (self.F, 'dir2/file3', 0o660, 100, None, True), + ] + self.test_create_tree(srcdir_local3, tree) + self.test_create_tree(srcdir_gfarm3, tree) + + # expected tree + tree = [ + (self.D, 'dir1', 0o750, 11, None, True), + (self.F, 'dir1/file1', 0o660, 11, None, True), + (self.F, 'dir1/file2', 0o664, 10, None, True), + (self.D, 'dir2', 0o700, 10, None, True), + (self.F, 'dir2/file3', 0o660, 100, None, 
True), + (self.S, 'dir1/symlink1', 0o770, 11, 'file1', True), + ] + self.test_create_tree(expect_dir_local, tree) + + test1_name = 'test-1-create-l' test1_dir_local = workdir_local_url.url_join(test1_name) - test2_name = 'test-2-extract' + test2_name = 'test-2-extract-l' test2_dir_local = workdir_local_url.url_join(test2_name) - test3_name = 'test-3-create' + test3_name = 'test-3-create-g' test3_dir_gfarm = workdir_gfarm_url.url_join(test3_name) - test4_name = 'test-4-extract' + test4_name = 'test-4-extract-g' test4_dir_local = workdir_local_url.url_join(test4_name) - members = ['dir1'] + members1 = ['dir1'] + members2 = ['dir2'] # Local -> Local(tar) - self.cmd_create(test1_dir_local, srcdir_local1, members) + self.cmd_create(test1_dir_local, srcdir_local1, members1) # Local -> Local(tar) (update) - self.cmd_update(test1_dir_local, srcdir_local2, members) + self.cmd_update(test1_dir_local, srcdir_local2, members1) + # Local -> Local(tar) (append) + self.cmd_append(test1_dir_local, srcdir_local3, members2) # Local(tar) -> Local self.cmd_extract(test2_dir_local, test1_dir_local, []) @@ -3091,18 +3124,20 @@ class GfptarProgram(Program): # self.cmd_list_verbose(test1_dir_local) # self.test_mode = True - if not self.test_compare_local(test2_dir_local, srcdir_local2, + if not self.test_compare_local(test2_dir_local, expect_dir_local, same_owner=True): logger.error_exit(1, testname + ' ... FAIL (different data)(1)') # Gfarm -> Gfarm(tar) - self.cmd_create(test3_dir_gfarm, srcdir_gfarm1, members) + self.cmd_create(test3_dir_gfarm, srcdir_gfarm1, members1) # Gfarm -> Gfarm(tar) (update) - self.cmd_update(test3_dir_gfarm, srcdir_gfarm2, members) + self.cmd_update(test3_dir_gfarm, srcdir_gfarm2, members1) + # Gfarm -> Gfarm(tar) (append) + self.cmd_append(test3_dir_gfarm, srcdir_gfarm3, members2) # Gfarm(tar) -> Local self.cmd_extract(test4_dir_local, test3_dir_gfarm, []) - if not self.test_compare_local(test4_dir_local, srcdir_local2, + if not self.test_compare_local(test4_dir_local, expect_dir_local, same_owner=True): logger.error_exit(1, testname + ' ... FAIL (different data)(2)') @@ -3273,7 +3308,7 @@ class GfptarProgram(Program): diff_args += ['-r', dir1, dir2] out, err, ret = execcmd_raw(diff_args) if ret != 0: - logger.error(f'diff -r {dir1} {dir2}: {err}') + logger.error(f'diff -r {dir1} {dir2}: {out}: {err}') return False logger.debug('diff -r (data check): PASS') return self.test_compare(dir1, dir2, data=data, same_owner=same_owner, @@ -3459,6 +3494,26 @@ class GfptarProgram(Program): gen = 1 self.cmd_create_common(serial, gen, infiles) + def error_type_mismatch(self, tar_path): + return GfptarError(f"Tar compression type mismatch: {tar_path}") + + def cmd_append(self, outdir, basedir, infiles): + self.cmd_create_init('append', outdir, basedir) + if not self.outdir_url.exists(): + raise FileNotFoundError(outdir) + max_serial = 0 + max_gen = 0 + suffix = '.' 
+ self.compress_type + tar_list, dbgz_list = self.list_tar_list_dbgz_from_dir(self.outdir_url) + for serial, gen, tar_path in tar_list: + if not tar_path.endswith(suffix): + raise self.error_type_mismatch(tar_path) + if serial > max_serial: + max_serial = serial + if gen > max_gen: + max_gen = gen + self.cmd_create_common(max_serial + 1, max_gen + 1, infiles) + def cmd_update(self, outdir, basedir, infiles): self.cmd_create_init('update', outdir, basedir) if not self.outdir_url.exists(): @@ -3472,6 +3527,8 @@ class GfptarProgram(Program): progress=self.progress_enabled): if self.is_canceled(): raise self.error_canceled() + if not tar_path.endswith(suffix): + raise self.error_type_mismatch(tar_path) if serial > max_serial: max_serial = serial if gen > max_gen: @@ -3479,16 +3536,12 @@ class GfptarProgram(Program): if dbgz_path is None or tar_path is None: broken_count += 1 continue - if not tar_path.endswith(suffix): - raise GfptarError(f"Tar compression type mismatch: {tar_path}") db = DB(db_path) fattr_dict = DBDict(db, FileAttr2, InfoDB.TABLE_ENTRY) self.fattr_dict_list.append(fattr_dict) - max_serial += 1 - max_gen += 1 self.start_time = time.time() self.next_time = self.start_time + self.progress_interval - self.cmd_create_common(max_serial, max_gen, infiles) + self.cmd_create_common(max_serial + 1, max_gen + 1, infiles) if broken_count > 0: raise GfptarError("Some .tar.* or .db.gz are broken:" f" error={broken_count}") @@ -3543,11 +3596,12 @@ class GfptarProgram(Program): f" entry.mtime({entry.mtime}) >" f" fattr.mtime({fattr.mtime})") return True - elif fattr.size != entry.size: - logger.debug(f"is_update_target2: path={path}:" - f" fattr.size({fattr.size})" - f" != entry.size({entry.size})") - return True + # NOTE: compare only mtime + # elif fattr.size != entry.size: + # logger.debug(f"is_update_target2: path={path}:" + # f" fattr.size({fattr.size})" + # f" != entry.size({entry.size})") + # return True else: return False # not found @@ -5346,15 +5400,13 @@ Limitations: (For the upper limit, see the output of `ulimit -n -H`) Options: - -t, --list=DIR list mode, - list the members of - (use with --verbose to see more details) - -x, --extract=DIR extract mode, - extract all members or specified s + -c, --create=DIR create tar files in from s + -r, --append=DIR append files (create new tar files) + -u, --update=DIR append files newer than same entries in tar files + -x, --extract=DIR extract all members or specified s from to - -c, --create=DIR create mode, - create tar files in from s - -u, --update=DIR append files newer than same entries in archive + -t, --list=DIR list the members of + (use with --verbose to see more details) -C, --basedir=DIR base directory for s [default: .] -j, --jobs=NUM the number of jobs to copy per tar file in parallel [default: 4] @@ -5365,7 +5417,7 @@ Options: - xz : use xz (*.tar.xz) - no : no compression (*.tar) [default: gz] - -r, --ratio=RATIO assumed compression ratio (%) [default: 50] + --ratio=RATIO assumed compression ratio (%) [default: 50] -I, --use-compress-program=COMMAND filter data through COMMAND, the command must accept -d option for decompression @@ -5416,9 +5468,10 @@ Options: Usage: {f} [options] -c [-C ] [--] ... + {f} [options] -r [-C ] [--] ... + {f} [options] -u [-C ] [--] ... {f} [options] -x [--] [...] {f} [options] -t - {f} [options] -u [-C ] [--] ... {f} [options] --test {f} [options] --test -C ... 
{f} [options] --test-long @@ -5430,6 +5483,7 @@ _schema = Schema({ '--list': Or(str, None), '--extract': Or(str, None), '--create': Or(str, None), + '--append': Or(str, None), '--update': Or(str, None), '--basedir': Or(str, None), '--encoding': str, From 154159b3fe537c13bb66aa09f4040a500cd89080 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 7 Sep 2024 22:43:27 +0900 Subject: [PATCH 081/143] gfptar: new options: --exclude and --exclude-from --- gftool/gfptar/gfptar | 175 +++++++++++++++++++++++++++++++++---------- 1 file changed, 136 insertions(+), 39 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 9e86b1cb8..72157804b 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -50,6 +50,7 @@ import inspect from collections import defaultdict import datetime import atexit +import fnmatch from docopt import docopt from schema import Schema, Use, Or @@ -2210,7 +2211,7 @@ class GfTarFile(tarfile.TarFile): # NOTE: add() is not expected behavior. Use addfile() instead. # - add() can copy a hard link, # but a hard link cannot be extracted from gfexport (stream open) - # - When the specified for --create is a symlink, + # - When the specified MEMBER for --create is a symlink, # the entry will be archived as symlink. def _add_entry(self, subpath, entry): tarinfo = entry.toTarinfo(subpath) @@ -2534,6 +2535,48 @@ class GfptarProgram(Program): self.dry_run = self.opt['--dry-run'] + self.exclude_re_list = [] + self.exclude = self.opt['--exclude'] + if isinstance(self.exclude, str): + reobj = re.compile(fnmatch.translate(self.exclude)) + self.exclude_re_list.append(reobj) + elif isinstance(self.exclude, list): + for ex in self.exclude: + reobj = re.compile(fnmatch.translate(ex)) + self.exclude_re_list.append(reobj) + + self.exclude_from = self.opt['--exclude-from'] + if self.exclude_from is not None: + logger.debug(f"exclude_from={self.exclude_from}") + exclude_url = GfURL.init(self.exclude_from) + with exclude_url.readopen(textmode=True) as f: + while True: + line = f.readline().strip() + if not line: + break + if len(line) == 0: + continue + logger.debug(f"exclude_from: line=`{line}`") + reobj = re.compile(fnmatch.translate(line)) + self.exclude_re_list.append(reobj) + + if len(self.exclude_re_list) > 0: + self.exclude_match = self.exclude_match0 + else: + self.exclude_match = self.exclude_match_noop + + def exclude_match0(self, path): + if path.startswith('./'): # equal to removeprefix('./') + path = path[2:] + # logger.debug(f"exclude_match0(): path={path}") + if any(reobj.match(path) for reobj in self.exclude_re_list): + logger.debug(f"exclude_match0()=True: path={path}") + return True + return False + + def exclude_match_noop(self, path): + return False + def set_memory_limit(self, max_memory): resource.setrlimit(resource.RLIMIT_AS, (max_memory, max_memory)) @@ -2575,27 +2618,27 @@ class GfptarProgram(Program): outdir = self.opt['--create'] if outdir: basedir = self.opt['--basedir'] - infiles = self.opt[''] + infiles = self.opt['MEMBER'] self.cmd_create(outdir, basedir, infiles) return outdir = self.opt['--append'] if outdir: basedir = self.opt['--basedir'] - infiles = self.opt[''] + infiles = self.opt['MEMBER'] self.cmd_append(outdir, basedir, infiles) outdir = self.opt['--update'] if outdir: basedir = self.opt['--basedir'] - infiles = self.opt[''] + infiles = self.opt['MEMBER'] self.cmd_update(outdir, basedir, infiles) return outdir = self.opt['--extract'] if outdir: - indir = self.opt[''] - members = self.opt[''] + indir = self.opt['INDIR'] + members 
= self.opt['MEMBER'] self.cmd_extract(outdir, indir, members) return @@ -2657,7 +2700,7 @@ class GfptarProgram(Program): self.test_init() self.test_opt_pattern(full=False) self.test_member() - self.test_update_append() + self.test_update_append_exclude() self.test_specified_dir() def test_main_long(self): @@ -2665,7 +2708,7 @@ class GfptarProgram(Program): self.test_unit() self.test_opt_pattern(full=True) self.test_member() - self.test_update_append() + self.test_update_append_exclude() # self.test_invalid('url', 'gfarm:/tmp', 'dst', False) # allowed self.test_invalid('dot1', '.', 'dst', True) self.test_invalid('dot2', '', 'dst', False) @@ -3042,8 +3085,8 @@ class GfptarProgram(Program): workdir_local_url.remove_tree(remove_readonly=True) workdir_gfarm_url.remove_tree(remove_readonly=True) - def test_update_append(self): - testname = 'gfptar-test-update-append' + def test_update_append_exclude(self): + testname = 'gfptar-test-update-append-exclude' self.test_workdir_set(testname) # workdir_local = self.workdir_local # workdir_gfarm = self.workdir_gfarm @@ -3062,27 +3105,42 @@ class GfptarProgram(Program): expect_dir_name = 'test-expect' expect_dir_local = workdir_local_url.url_join(expect_dir_name) + exclude_name = 'exclude.txt' + exclude_file_l = workdir_local_url.url_join(exclude_name) + exclude_file_l_url = GfURL.init(exclude_file_l) + exclude_file_g = workdir_gfarm_url.url_join(exclude_name) + exclude_file_g_url = GfURL.init(exclude_file_g) + + # for cmd_create tree = [ (self.D, 'dir1', 0o715, 10, None, True), (self.F, 'dir1/file1', 0o664, 10, None, True), (self.F, 'dir1/file2', 0o664, 10, None, True), (self.S, 'dir1/symlink1', 0o777, 10, 'file1', True), + (self.F, 'dir1/exclude1.ext1', 0o660, 10, None, True), ] self.test_create_tree(srcdir_local1, tree) self.test_create_tree(srcdir_gfarm1, tree) + # for cmd_update tree = [ (self.D, 'dir1', 0o750, 11, None, True), (self.F, 'dir1/file1', 0o660, 11, None, True), (self.F, 'dir1/file2', 0o664, 9, None, True), # not updated (self.S, 'dir1/symlink1', 0o770, 11, 'file1', True), + (self.F, 'dir1/exclude2.ext2', 0o660, 10, None, True), + (self.F, 'dir1/exclude2.ext3', 0o660, 10, None, True), ] self.test_create_tree(srcdir_local2, tree) self.test_create_tree(srcdir_gfarm2, tree) + # for cmd_append tree = [ (self.D, 'dir2', 0o700, 10, None, True), (self.F, 'dir2/file3', 0o660, 100, None, True), + (self.F, 'dir2/exclude4.ext4', 0o660, 10, None, True), + (self.F, 'dir2/exclude5.ext5', 0o660, 10, None, True), + (self.F, 'dir2/exclude6.ext6', 0o660, 10, None, True), ] self.test_create_tree(srcdir_local3, tree) self.test_create_tree(srcdir_gfarm3, tree) @@ -3092,9 +3150,9 @@ class GfptarProgram(Program): (self.D, 'dir1', 0o750, 11, None, True), (self.F, 'dir1/file1', 0o660, 11, None, True), (self.F, 'dir1/file2', 0o664, 10, None, True), + (self.S, 'dir1/symlink1', 0o770, 11, 'file1', True), (self.D, 'dir2', 0o700, 10, None, True), (self.F, 'dir2/file3', 0o660, 100, None, True), - (self.S, 'dir1/symlink1', 0o770, 11, 'file1', True), ] self.test_create_tree(expect_dir_local, tree) @@ -3109,15 +3167,34 @@ class GfptarProgram(Program): test4_dir_local = workdir_local_url.url_join(test4_name) members1 = ['dir1'] - members2 = ['dir2'] + members2 = ['dir2'] # for cmd_append + + save_opt_exclude = self.opt['--exclude'] + save_opt_exclude_from = self.opt['--exclude-from'] + + # for cmd_update + with exclude_file_l_url.writeopen(textmode=True) as f: + f.write('*.ext2\n') + f.write('*.ext3\n') + with exclude_file_g_url.writeopen(textmode=True) as f: + 
f.write('*.ext2\n') + f.write('*.ext3\n') # Local -> Local(tar) + self.opt['--exclude'] = ['*.ext1'] + self.opt['--exclude-from'] = None self.cmd_create(test1_dir_local, srcdir_local1, members1) # Local -> Local(tar) (update) + self.opt['--exclude'] = None + self.opt['--exclude-from'] = exclude_file_l self.cmd_update(test1_dir_local, srcdir_local2, members1) # Local -> Local(tar) (append) + self.opt['--exclude'] = ['*.ext4', '*.ext5'] + self.opt['--exclude-from'] = None self.cmd_append(test1_dir_local, srcdir_local3, members2) # Local(tar) -> Local + self.opt['--exclude'] = ['*.ext6'] + self.opt['--exclude-from'] = None self.cmd_extract(test2_dir_local, test1_dir_local, []) # self.test_mode = False @@ -3129,12 +3206,20 @@ class GfptarProgram(Program): logger.error_exit(1, testname + ' ... FAIL (different data)(1)') # Gfarm -> Gfarm(tar) + self.opt['--exclude'] = ['*.ext1'] + self.opt['--exclude-from'] = None self.cmd_create(test3_dir_gfarm, srcdir_gfarm1, members1) # Gfarm -> Gfarm(tar) (update) + self.opt['--exclude'] = None + self.opt['--exclude-from'] = exclude_file_g self.cmd_update(test3_dir_gfarm, srcdir_gfarm2, members1) # Gfarm -> Gfarm(tar) (append) + self.opt['--exclude'] = ['*.ext4', '*.ext5'] + self.opt['--exclude-from'] = None self.cmd_append(test3_dir_gfarm, srcdir_gfarm3, members2) # Gfarm(tar) -> Local + self.opt['--exclude'] = ['*.ext6'] + self.opt['--exclude-from'] = None self.cmd_extract(test4_dir_local, test3_dir_gfarm, []) if not self.test_compare_local(test4_dir_local, expect_dir_local, @@ -3144,6 +3229,8 @@ class GfptarProgram(Program): print(testname + ' ... PASS') workdir_local_url.remove_tree(remove_readonly=True) workdir_gfarm_url.remove_tree(remove_readonly=True) + self.opt['--exclude'] = save_opt_exclude + self.opt['--exclude-from'] = save_opt_exclude_from # for test F = 'file' @@ -3251,7 +3338,7 @@ class GfptarProgram(Program): basedir = self.opt['--basedir'] if basedir is None: return - infiles = self.opt[''] + infiles = self.opt['MEMBER'] if len(infiles) == 0: return @@ -3579,10 +3666,10 @@ class GfptarProgram(Program): return gfurl.listdir(recursive=True, first=True, hardlink_warn=self.hardlink_warn) - def is_update_target1(entry): + def is_update_target_true(entry): return True - def is_update_target2(entry): + def is_update_target0(entry): # entry is an input file. 
path = entry.subpath(self.basedir_url) # compare mtime from newer .db @@ -3609,9 +3696,9 @@ class GfptarProgram(Program): return True if len(self.fattr_dict_list) > 0: - is_update_target = is_update_target2 + is_update_target = is_update_target0 else: - is_update_target = is_update_target1 + is_update_target = is_update_target_true tmpdir = self.tmpdir tardb_prefix = os.path.join(tmpdir.name, self.cmd_name) @@ -3643,7 +3730,9 @@ class GfptarProgram(Program): break logger.debug('listdir: entry.path=%s', entry.path) if not is_update_target(entry): - continue + continue # skip + if self.exclude_match(entry.path): + continue # skip # include length of path this_size = entry.size_all() @@ -4602,11 +4691,13 @@ class GfptarProgram(Program): tarinfo = None if tarinfo is None: break + if self.exclude_match(tarinfo.name): + continue # skip if members_num > 0: with self.lock(): # perfect match if tarinfo.name not in member_set: - continue + continue # skip member_set.remove(tarinfo.name) # members_num == 0 -> extract all @@ -4857,10 +4948,10 @@ class GfptarProgram(Program): fattr.group, fattr.size, mtime_str, name)) def print_simple(serial, gen, path, fattr): - name = path if fattr.ftype == InfoDB.TYPE_DIR: - name = name + '/' - print(f'{fattr.ftype} {name}', file=outf) + print(f'{fattr.ftype} {path}/', file=outf) + else: + print(f'{fattr.ftype} {path}', file=outf) if quiet: print_func = print_quiet @@ -5400,24 +5491,28 @@ Limitations: (For the upper limit, see the output of `ulimit -n -H`) Options: - -c, --create=DIR create tar files in from s - -r, --append=DIR append files (create new tar files) - -u, --update=DIR append files newer than same entries in tar files - -x, --extract=DIR extract all members or specified s - from to - -t, --list=DIR list the members of + -c, --create=OUTDIR create tar files in OUTDIR from MEMBERs + -r, --append=OUTDIR append files (create new tar files) + -u, --update=OUTDIR append files newer than same entries in tar files + -x, --extract=OUTDIR extract all members or specified MEMBERs + from INDIR to OUTDIR + -t, --list=DIR list the members of DIR (use with --verbose to see more details) - -C, --basedir=DIR base directory for s [default: .] + -C, --basedir=DIR change to directory for MEMBERs [default: .] + --exclude= Exclude files matching wildcard patterns + (https://docs.python.org/ja/3/library/fnmatch.html) + -X, --exclude-from=FILE Exclude files matching wildcard patterns + listed in FILE -j, --jobs=NUM the number of jobs to copy per tar file in parallel [default: 4] -s, --size=BYTES assumed bytes per output file [default: 200Mi] + --ratio=RATIO assumed compression ratio (%) [default: 50] -T, --type=TYPE compression type and tar archive suffix - gz : use gzip (*.tar.gz) - bz2: use bzip2 (*.tar.bz2) - xz : use xz (*.tar.xz) - no : no compression (*.tar) [default: gz] - --ratio=RATIO assumed compression ratio (%) [default: 50] -I, --use-compress-program=COMMAND filter data through COMMAND, the command must accept -d option for decompression @@ -5454,7 +5549,7 @@ Options: --test-workdir-local=DIR local directory for test [default: /tmp] --test-workdir-gfarm=DIR Gfarm directory for test [default: gfarm:/tmp] --dummy-num=NUM the number of dummy input entries for --create - (ignore arguments) + (ignore MEMBER arguments) (create 1000 files and directories per 1 unit) (default: disabled) --dummy-size-min=BYTES minimum size of dummy files [default: 0] @@ -5467,13 +5562,13 @@ Options: -?, -h, --help show this help and exit Usage: - {f} [options] -c [-C ] [--] ... 
- {f} [options] -r [-C ] [--] ... - {f} [options] -u [-C ] [--] ... - {f} [options] -x [--] [...] - {f} [options] -t + {f} [options] [--exclude=PATTERN]... -c OUTDIR [-C DIR] [--] MEMBER... + {f} [options] [--exclude=PATTERN]... -r OUTDIR [-C DIR] [--] MEMBER... + {f} [options] [--exclude=PATTERN]... -u OUTDIR [-C DIR] [--] MEMBER... + {f} [options] [--exclude=PATTERN]... -x OUTDIR [--] INDIR [MEMBER...] + {f} [options] -t DIR {f} [options] --test - {f} [options] --test -C ... + {f} [options] --test -C DIR MEMBER... {f} [options] --test-long {f} -h | --help """.format(f=progname) @@ -5486,6 +5581,8 @@ _schema = Schema({ '--append': Or(str, None), '--update': Or(str, None), '--basedir': Or(str, None), + '--exclude': Or([str], None), + '--exclude-from': Or(str, None), '--encoding': str, '--size': Use(unhumanize_number), '--bufsize': Use(unhumanize_number), @@ -5520,8 +5617,8 @@ _schema = Schema({ '--debug': bool, '--help': bool, '--': bool, - '': Or(str, None), - '': [str], + 'INDIR': Or(str, None), + 'MEMBER': [str], }) From 291a6f5e3093d437eb3eae7a1290ef87f08276a4 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 7 Sep 2024 22:55:52 +0900 Subject: [PATCH 082/143] gfptar: new option: --resume: same as --update --- gftool/gfptar/gfptar | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 72157804b..e269e64c0 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2629,6 +2629,8 @@ class GfptarProgram(Program): self.cmd_append(outdir, basedir, infiles) outdir = self.opt['--update'] + if not outdir: + outdir = self.opt['--resume'] if outdir: basedir = self.opt['--basedir'] infiles = self.opt['MEMBER'] @@ -5494,6 +5496,7 @@ Options: -c, --create=OUTDIR create tar files in OUTDIR from MEMBERs -r, --append=OUTDIR append files (create new tar files) -u, --update=OUTDIR append files newer than same entries in tar files + --resume=OUTDIR same as --update -x, --extract=OUTDIR extract all members or specified MEMBERs from INDIR to OUTDIR -t, --list=DIR list the members of DIR @@ -5564,6 +5567,7 @@ Options: Usage: {f} [options] [--exclude=PATTERN]... -c OUTDIR [-C DIR] [--] MEMBER... {f} [options] [--exclude=PATTERN]... -r OUTDIR [-C DIR] [--] MEMBER... + {f} [options] [--exclude=PATTERN]... --resume OUTDIR [-C DIR] [--] MEMBER... {f} [options] [--exclude=PATTERN]... -u OUTDIR [-C DIR] [--] MEMBER... {f} [options] [--exclude=PATTERN]... -x OUTDIR [--] INDIR [MEMBER...] 
{f} [options] -t DIR
@@ -5580,6 +5584,7 @@ _schema = Schema({
     '--create': Or(str, None),
     '--append': Or(str, None),
     '--update': Or(str, None),
+    '--resume': Or(str, None),
     '--basedir': Or(str, None),
     '--exclude': Or([str], None),
     '--exclude-from': Or(str, None),

From 47f7d71afd5f3070a844747dd09bccb7333433d7 Mon Sep 17 00:00:00 2001
From: Takuya Ishibashi
Date: Sat, 7 Sep 2024 23:02:25 +0900
Subject: [PATCH 083/143] gfptar: rename option: --sync-db -> --generate-db

---
 gftool/gfptar/gfptar | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar
index e269e64c0..b0bef7b52 100755
--- a/gftool/gfptar/gfptar
+++ b/gftool/gfptar/gfptar
@@ -2487,7 +2487,7 @@ class GfptarProgram(Program):
             self.max_workers = 1
         self.bufsize = self.opt['--bufsize']
         self.use_fsync = not self.opt['--disable-fsync']
-        self.sync_infodb = self.opt['--sync-db']
+        self.sync_infodb = self.opt['--generate-db']
         self.workdir = self.opt['--workdir']
 
         euid = os.geteuid()
@@ -2734,18 +2734,18 @@ class GfptarProgram(Program):
         save_opt_jobs = self.opt['--jobs']
         save_opt_type = self.opt['--type']
         save_opt_compress_prog = self.opt['--use-compress-program']
-        save_opt_syncdb = self.opt['--sync-db']
+        save_opt_gendb = self.opt['--generate-db']
         save_opt_verbose = self.opt['--verbose']
 
         # create tar per one entry
         self.opt['--size'] = 0
 
-        # test --sync-db
+        # test --generate-db
         # --verbose: test report_path (SEE ALSO: create_a_tar_process1)
         self.opt['--verbose'] = True
-        self.opt['--sync-db'] = True
-        self.test_simple('syncdb')
-        self.opt['--sync-db'] = save_opt_syncdb
+        self.opt['--generate-db'] = True
+        self.test_simple('generate_db')
+        self.opt['--generate-db'] = save_opt_gendb
         self.opt['--verbose'] = save_opt_verbose
 
         # test --jobs
@@ -5523,7 +5523,7 @@ Options:
     --gzip-program=COMMAND  gzip command (ex. pigz) [default: gzip]
    --bzip2-program=COMMAND bzip2 command (ex.
pbzip2) [default: bzip2] --xz-program=COMMAND xz command [default: xz] - --sync-db regenerate gfptar*_info.db.gz + --generate-db regenerate gfptar*_info.db.gz --same-owner extract files with the same ownership (for euid=0 on local, or gfarmroot on Gfarm) --disable-gfarm-command disable the use of gfreg and gfexport @@ -5601,7 +5601,7 @@ _schema = Schema({ '--disable-gfarm-command': bool, '--disable-fsync': bool, '--gfsched-interval': Use(int), - '--sync-db': bool, + '--generate-db': bool, '--same-owner': bool, '--workdir': Or(str, None), '--max-entries-per-tar': Use(unhumanize_number), From 0f452401014b716585fe9a48820bae7a43a37e3e Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 7 Sep 2024 23:25:15 +0900 Subject: [PATCH 084/143] gfptar --extract: Do not remove member from member_set for --update --- gftool/gfptar/gfptar | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index b0bef7b52..6535e4056 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3170,6 +3170,7 @@ class GfptarProgram(Program): members1 = ['dir1'] members2 = ['dir2'] # for cmd_append + members_all = members1 + members2 save_opt_exclude = self.opt['--exclude'] save_opt_exclude_from = self.opt['--exclude-from'] @@ -3197,7 +3198,7 @@ class GfptarProgram(Program): # Local(tar) -> Local self.opt['--exclude'] = ['*.ext6'] self.opt['--exclude-from'] = None - self.cmd_extract(test2_dir_local, test1_dir_local, []) + self.cmd_extract(test2_dir_local, test1_dir_local, members_all) # self.test_mode = False # self.cmd_list_verbose(test1_dir_local) @@ -3222,6 +3223,7 @@ class GfptarProgram(Program): # Gfarm(tar) -> Local self.opt['--exclude'] = ['*.ext6'] self.opt['--exclude-from'] = None + # self.cmd_extract(test4_dir_local, test3_dir_gfarm, members_all) self.cmd_extract(test4_dir_local, test3_dir_gfarm, []) if not self.test_compare_local(test4_dir_local, expect_dir_local, @@ -4700,7 +4702,8 @@ class GfptarProgram(Program): # perfect match if tarinfo.name not in member_set: continue # skip - member_set.remove(tarinfo.name) + # NOTE: DO NOT remove for --update + # member_set.remove(tarinfo.name) # members_num == 0 -> extract all # ex. 
//a/b/c// -> a/b/c From e85090f5a61abb3d066040006651861ba7083367 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 8 Sep 2024 10:24:32 +0900 Subject: [PATCH 085/143] gfptar: remove *.tmp files when error happens --- gftool/gfptar/gfptar | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 6535e4056..29b55040a 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4234,6 +4234,13 @@ class GfptarProgram(Program): use_gfarm_command=self.use_gfarm_command, dry_run=self.dry_run) target_host = self.select_a_target_host(outurl_tmp, serial) + + def remove(gfurl): + if gfurl.exists(): + gfurl.remove() + + atexit.register(remove, outurl_tmp) + remove(outurl_tmp) tar_tmp = GfTarFile.create_open(outurl_tmp, self.compress_type, self.bufsize, use_fsync=self.use_fsync, @@ -4243,7 +4250,7 @@ class GfptarProgram(Program): # to reduce memory usage # SEE ALSO: InfoDB.generate_db_and_dbgz() db_name = InfoDB.dbgz_filename(gen, serial) - tmpdb_path = os.path.join(self.tmpdir.name, db_name + '.tmp') + tmpdb_path = os.path.join(self.tmpdir.name, db_name + self.TMP_SUFFIX) infodb = InfoDB(tmpdb_path) cannot_be_archived = 0 @@ -4332,6 +4339,7 @@ class GfptarProgram(Program): else: tar_size = outurl_tmp.size() outurl_tmp.rename(outname_path) + atexit.unregister(remove) out_db_path = self.outdir_url.url_join(db_name) InfoDB.compress_copy(infodb.db.filename, out_db_path, @@ -5273,8 +5281,13 @@ class InfoDB: dbgz_path_tmp = out_dbgz_path + '.tmp' dbgz_url_tmp = GfURL.init(dbgz_path_tmp, dry_run=dry_run) db_url = GfURL.init(in_db_path) - if dbgz_url_tmp.exists(): - dbgz_url_tmp.remove() + + def remove(gfurl): + if gfurl.exists(): + gfurl.remove() + + remove(dbgz_url_tmp) + atexit.register(remove, dbgz_url_tmp) with dbgz_url_tmp.writeopen(use_fsync=use_fsync) as outf: proc = Compress.compress(Compress.gzip_prog, outf) with db_url.readopen() as inf: @@ -5287,6 +5300,7 @@ class InfoDB: # atomic operation to avoid leaving junk files dbgz_url_tmp.rename(out_dbgz_path) + atexit.unregister(remove) logger.debug('created(.db.gz): %s', out_dbgz_path) if move: db_url.remove() From a4133c8c693ebb1c470cdf526435f35335097c29 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 12 Sep 2024 15:20:34 +0900 Subject: [PATCH 086/143] gfptar: clarify --- gftool/gfptar/gfptar | 53 +++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 29b55040a..f76b0fec9 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3570,9 +3570,7 @@ class GfptarProgram(Program): self.next_time = self.start_time + self.progress_interval self.listing = True - self.gfsched_lock = None - self.gfsched_next = 0 - self.gfsched_list = None + self.gfsched_init() self.cannot_be_archived = 0 self.create_job_init() # before creating threads @@ -3982,32 +3980,49 @@ class GfptarProgram(Program): def is_canceled(self): return self.canceled + def gfsched_init(self): + self.gfsched_lock = threading.Lock() + self.gfsched_next = 0 + self.gfsched_list = [] + def _gfsched_sometimes(self, gfurl): # if gfurl.is_gfarm() is not True: # raise AssertionError - if self.gfsched_lock is None: - self.gfsched_lock = threading.Lock() + now = time.time() with self.gfsched_lock: - now = time.time() - if now >= self.gfsched_next: - # access to Gfarm - self.gfsched_list = gfurl.gfsched(write_mode=True, - number=self.jobs) - # prevent IndexError - self.gfsched_list = 
self.gfsched_list[:self.jobs] - logger.debug("update gfsched_list") - self.gfsched_next = now + self.gfsched_interval + if now < self.gfsched_next: + return # not updated + # access to Gfarm + old_schedlist = self.gfsched_list + try: + schedlist = gfurl.gfsched(write_mode=True, + number=self.jobs) + except Exception: + logger.warning('Error occurred in gfsched') + schedlist = old_schedlist + self.gfsched_list = schedlist + logger.debug("update gfsched_list") + self.gfsched_next = now + self.gfsched_interval def select_a_target_host(self, outurl, index): if self.dry_run: return if not outurl.is_gfarm(): return None - if self.jobs <= 1: + if self.jobs <= 1: # not use gfsched return None self._gfsched_sometimes(outurl) - with self.lock(): - target_host = self.gfsched_list[index % len(self.gfsched_list)] + with self.gfsched_lock: + schedlist_len = len(self.gfsched_list) + if schedlist_len == 0: + logger.warning( + 'The destination host cannot be selected by gfptar') + return None # no candidate + if schedlist_len > self.jobs: + # use only top hosts + self.gfsched_list = self.gfsched_list[:self.jobs] + schedlist_len = len(self.gfsched_list) + target_host = self.gfsched_list[index % schedlist_len] logger.debug("selected target_host: %s", target_host) return target_host @@ -4529,9 +4544,7 @@ class GfptarProgram(Program): if self.progress_enabled: self.progress_for_extract(self.start_time) - self.gfsched_lock = None - self.gfsched_next = 0 - self.gfsched_list = None + self.gfsched_init() if self.MT_enabled(): for gen, tarlist in gen_tarlist_dict.items(): From 9465974e79ab2b3ddc07ccc734adde99eade36c2 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 13 Sep 2024 09:13:27 +0900 Subject: [PATCH 087/143] gfptar: remove unnecessary codes --- gftool/gfptar/gfptar | 5 ----- 1 file changed, 5 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f76b0fec9..b9dc7e916 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4394,13 +4394,10 @@ class GfptarProgram(Program): tmpdir = self.tmpdir db_file = os.path.join(tmpdir.name, 'extract.db') - db_file_target = os.path.join(tmpdir.name, 'target.db') logger.debug('db_file=%s', db_file) - logger.debug('db_file_target=%s', db_file_target) # to reduce memory usage self.db = DB(db_file, check_same_thread=False) - self.db_target = DB(db_file_target, check_same_thread=False) self.extract_main() tmpdir.cleanup() @@ -4559,8 +4556,6 @@ class GfptarProgram(Program): self.update_stat_for_directories(directory_set) self.db.close() self.db.unlink() - self.db_target.close() - self.db_target.unlink() if error_num > 0: raise GfptarError("Some .tar.* or .db.gz are broken:" f" error={error_num}") From 7e946fd1e7aa051b025a412f7b7bcefeefbfecb8 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 14 Sep 2024 20:20:44 +0900 Subject: [PATCH 088/143] gfptar: change the format of progress_for_schedule() --- gftool/gfptar/gfptar | 49 +++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index b9dc7e916..75c83e95d 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4432,6 +4432,8 @@ class GfptarProgram(Program): self.total_num = 0 self.total_size = 0 + self.selected_num = 0 + self.selected_size = 0 self.start_time = time.time() self.next_time = self.start_time + self.progress_interval if self.progress_enabled: @@ -4450,12 +4452,13 @@ class GfptarProgram(Program): db_path, resolve_ugmap=False): if self.is_canceled(): raise 
self.error_canceled() - self.total_num += 1 - self.total_size += fattr.size + size = fattr.size file_type = fattr.ftype + self.total_num += 1 + self.total_size += size if self.search_target: logger.debug('archive_dict[%s]: %s', path, file_type) - archive_dict[path] = file_type + archive_dict[path] = (file_type, size) else: if file_type == InfoDB.TYPE_DIR: # all directories logger.debug('directory_set.add: %s', path) @@ -4466,13 +4469,16 @@ class GfptarProgram(Program): if self.search_target: is_target = False for member in member_check_dict.keys(): - file_type = archive_dict.get(member, None) + ftype_size = archive_dict.get(member, None) logger.debug('from member_check_dict.keys: %s [%s]', - member, file_type) + member, str(ftype_size)) found = False - if file_type is not None: + if ftype_size is not None: + file_type, size = ftype_size logger.debug('member_set.add: %s', member) member_set.add(member) + self.selected_num += 1 + self.selected_size += size found = True if file_type == InfoDB.TYPE_DIR: logger.debug('directory_set.add: %s', member) @@ -4486,10 +4492,14 @@ class GfptarProgram(Program): is_dir = True if is_dir: # find {member}/* files - for path, file_type in archive_dict.find_by_prefix( + for path, ftype_size in archive_dict.find_by_prefix( member + '/'): - logger.debug('member_set.add: %s', path) + logger.debug('find_by_prefix: %s [%s]', + path, str(ftype_size)) + file_type, size = ftype_size member_set.add(path) + self.selected_num += 1 + self.selected_size += size found = True if file_type == InfoDB.TYPE_DIR: logger.debug('directory_set.add: %s', path) @@ -4514,7 +4524,9 @@ class GfptarProgram(Program): self.progress_for_schedule(time.time()) sys.stdout.write('\n') if self.search_target: - self.total_num = len(member_set) # update + # update for progress + self.total_num = self.selected_num + self.total_size = self.selected_size for member, found in member_check_dict.items(): logger.debug('check member_check_dict: %s, %s', member, found) @@ -4824,11 +4836,20 @@ class GfptarProgram(Program): else: ent_per_sec = 0 ent_per_sec_str = self._humanize(ent_per_sec) - sys.stdout.write(f'\rschedule: ' - f'{total_size_str}B ' - f'{total_num_str}Ent ' - f'{sec_str} ' - f'{ent_per_sec_str}Ent/s') + if self.selected_num > 0: + selected_num_str = self._humanize(self.selected_num) + selected_size_str = self._humanize(self.selected_size) + sys.stdout.write(f'\rschedule: ' + f'{selected_size_str}/{total_size_str}B ' + f'{selected_num_str}/{total_num_str}Ent ' + f'{sec_str} ' + f'{ent_per_sec_str}Ent/s') + else: + sys.stdout.write(f'\rschedule: ' + f'{total_size_str}B ' + f'{total_num_str}Ent ' + f'{sec_str} ' + f'{ent_per_sec_str}Ent/s') def progress_for_sync_infodb(self, now): sec = now - self.start_time From c700386ec0abf7b63e37ce7fda9b40dd29122678 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 14 Sep 2024 20:27:37 +0900 Subject: [PATCH 089/143] gfptar --extract: check the outdir before scheduling --- gftool/gfptar/gfptar | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 75c83e95d..5f6cb6a18 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4376,6 +4376,7 @@ class GfptarProgram(Program): logger.debug('extract start: outdir=%s, indir=%s', outdir, indir) self.outdir = outdir self.outdir_url = GfURL.init(outdir, dry_run=self.dry_run) + self.outdir_url.create_new_dir() self.indir = indir self.specified_members = specified_members self.same_owner = self.opt['--same-owner'] @@ -4540,7 +4541,6 @@ class 
GfptarProgram(Program):
 
     def extract_main(self):
         (gen_tarlist_dict, directory_set,
          member_set, error_num) = self.extract_schedule()
-        self.outdir_url.create_new_dir()
 
         # NOTE: Directories are created in threads
         # self.extract_directories(directory_set)  # slow

From 2b70e3cd1e86fc7e61647ec3fcdb8b327deabb79 Mon Sep 17 00:00:00 2001
From: Takuya Ishibashi
Date: Sun, 15 Sep 2024 13:19:41 +0900
Subject: [PATCH 090/143] gfptar: extract in order of gen (generation ID)

---
 gftool/gfptar/gfptar | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar
index 5f6cb6a18..54178ceec 100755
--- a/gftool/gfptar/gfptar
+++ b/gftool/gfptar/gfptar
@@ -4555,11 +4555,12 @@ class GfptarProgram(Program):
 
         self.gfsched_init()
 
+        # sort by generation ID
         if self.MT_enabled():
-            for gen, tarlist in gen_tarlist_dict.items():
+            for gen, tarlist in sorted(gen_tarlist_dict.items()):
                 self.extract_from_archives_MT(tarlist, member_set)
         else:
-            for gen, tarlist in gen_tarlist_dict.items():
+            for gen, tarlist in sorted(gen_tarlist_dict.items()):
                 self.extract_from_archives(tarlist, member_set)
         if self.progress_enabled:
             self.progress_for_extract(time.time())

From 3fab6b3c82d10dc46c6f214095f4cde6bc5b7cfd Mon Sep 17 00:00:00 2001
From: Takuya Ishibashi
Date: Sun, 15 Sep 2024 22:17:45 +0900
Subject: [PATCH 091/143] gfptar: use dd for local I/O to improve performance
 gfptar: new option: --disable-dd

---
 gftool/gfptar/gfptar | 99 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 4 deletions(-)

diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar
index 54178ceec..2011ade73 100755
--- a/gftool/gfptar/gfptar
+++ b/gftool/gfptar/gfptar
@@ -907,18 +907,29 @@ class GfURLEntry(DBObj):
 
 class GfURL(metaclass=abc.ABCMeta):
     MAXNAMLEN = 255  # SEE ALSO: dirent.h, gfarm/gfs.h (GFS_MAXNAMLEN)
+    USE_DD = False
 
     @classmethod
     def shutup_stderr(cls):
         shutup_stderr()
 
-    @staticmethod
-    def init(url, use_gfarm_command=False, local=False, dry_run=False):
+    @classmethod
+    def use_dd(cls, enable):
+        cls.USE_DD = enable
+
+    @classmethod
+    def init(cls, url, use_gfarm_command=False, local=False, dry_run=False):
         if local:
-            return GfURLLocal(url, dry_run)
+            if cls.USE_DD:
+                return GfURLLocalDD(url, dry_run)
+            else:
+                return GfURLLocal(url, dry_run)
         if GfURLGfarm.is_my_URL(url):
             return GfURLGfarm(url, dry_run)
-        gfurl1 = GfURLLocal(url, dry_run)
+        if cls.USE_DD:
+            gfurl1 = GfURLLocalDD(url, dry_run)
+        else:
+            gfurl1 = GfURLLocal(url, dry_run)
         if not use_gfarm_command:
             return gfurl1
         if dry_run:  # cannot resolve the Gfarm URL from gfarm2fs
@@ -1989,6 +2000,82 @@ class GfURLLocal(GfURL):
         self.chmod(mode, mtime=mtime, user=user, group=group)
 
 
+class GfURLLocalDD(GfURLLocal):
+    def dd_read(self, textmode=False):
+        # dd if=file
+        args = ['dd', 'if=' + self.url_str]
+        logger.debug('%s', args)
+        if textmode:
+            encoding = get_encoding()
+        else:
+            encoding = None
+        return subprocess.Popen(
+            args, shell=False, encoding=encoding, close_fds=True,
+            stdin=subprocess.DEVNULL, stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL)  # shut up stderr
+
+    def dd_write(self, textmode=False, mode=None, mtime=None,
+                 user=None, group=None, hostname=None):
+        class DDWrite():
+            def __init__(self, url, proc, stdin):
+                self.url = url
+                self.proc = proc
+                self.stdin = stdin
+
+            def close(self):
+                self.stdin.close()
+
+            def post(self):
+                # must be called after wait()
+                if mode is not None:
+                    self.url.chmod(mode, mtime=mtime, user=user, group=group)
+                else:
+                    if mtime is not None:
+                        self.url.utime(mtime,
mtime) + self.url.chown(user, group) + + # dd of=file + args = ['dd', 'of=' + self.url_str] + logger.debug('%s', args) + if textmode: + encoding = get_encoding() + else: + encoding = None + proc = subprocess.Popen( + args, shell=False, encoding=encoding, close_fds=True, + stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL) # shut up stderr + return DDWrite(self, proc, proc.stdin) + + @contextmanager + def readopen(self, textmode=False): + proc = self.dd_read(textmode=textmode) + try: + yield proc.stdout + finally: + proc.stdout.close() + ret = proc.wait() + if ret != 0: + raise GfException('{}: returncode={}'.format( + ' '.join(proc.args), ret)) + + @contextmanager + def writeopen(self, textmode=False, mode=0o600, mtime=None, + user=None, group=None, use_fsync=True, hostname=None): + dd_obj = self.dd_write(textmode=textmode, mode=mode, mtime=mtime, + user=user, group=group, hostname=hostname) + proc = dd_obj.proc + try: + yield dd_obj.stdin + finally: + dd_obj.close() + ret = proc.wait() + if ret != 0: + raise GfException('{}: returncode={}'.format( + ' '.join(proc.args), ret)) + dd_obj.post() + + class Compress: TYPE_NO = 'no' TYPE_GZIP = 'gz' @@ -2487,6 +2574,8 @@ class GfptarProgram(Program): self.max_workers = 1 self.bufsize = self.opt['--bufsize'] self.use_fsync = not self.opt['--disable-fsync'] + self.use_dd = not self.opt['--disable-dd'] + GfURL.use_dd(self.use_dd) self.sync_infodb = self.opt['--generate-db'] self.workdir = self.opt['--workdir'] @@ -5576,6 +5665,7 @@ Options: --disable-gfarm-command disable the use of gfreg and gfexport for tar files on gfarm2fs --disable-fsync disable calling fsync() before close() + --disable-dd use Python for local I/O instead of dd command --gfsched-interval=SEC interval of updating candidate hosts to write (for Gfarm URL only) [default: 120] --progress-interval=SEC interval of updating progress [default: 1.0] @@ -5647,6 +5737,7 @@ _schema = Schema({ '--xz-program': Or(str, None), '--disable-gfarm-command': bool, '--disable-fsync': bool, + '--disable-dd': bool, '--gfsched-interval': Use(int), '--generate-db': bool, '--same-owner': bool, From 7d192b7edcb7c9048948d40109ed3a65c4149653 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 15 Sep 2024 23:04:34 +0900 Subject: [PATCH 092/143] gfptar: set conv=fdatasync for dd when not using --disable-fsync --- gftool/gfptar/gfptar | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 2011ade73..a61c4b801 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2014,7 +2014,8 @@ class GfURLLocalDD(GfURLLocal): stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) # shut up stderr - def dd_write(self, textmode=False, mode=None, mtime=None, + def dd_write(self, textmode=False, use_fsync=True, + mode=None, mtime=None, user=None, group=None, hostname=None): class DDWrite(): def __init__(self, url, proc, stdin): @@ -2036,6 +2037,8 @@ class GfURLLocalDD(GfURLLocal): # dd of=file args = ['dd', 'of=' + self.url_str] + if use_fsync: + args.append('conv=fdatasync') logger.debug('%s', args) if textmode: encoding = get_encoding() @@ -2062,7 +2065,8 @@ class GfURLLocalDD(GfURLLocal): @contextmanager def writeopen(self, textmode=False, mode=0o600, mtime=None, user=None, group=None, use_fsync=True, hostname=None): - dd_obj = self.dd_write(textmode=textmode, mode=mode, mtime=mtime, + dd_obj = self.dd_write(textmode=textmode, use_fsync=use_fsync, + mode=mode, mtime=mtime, user=user, 
group=group, hostname=hostname) proc = dd_obj.proc try: From e5a4f9af800486f384a633b1dd08248861eefd8c Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 15 Sep 2024 23:29:01 +0900 Subject: [PATCH 093/143] gfptar: change option: --disable-dd -> --use-dd --- gftool/gfptar/gfptar | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index a61c4b801..1adafcb6c 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2578,7 +2578,7 @@ class GfptarProgram(Program): self.max_workers = 1 self.bufsize = self.opt['--bufsize'] self.use_fsync = not self.opt['--disable-fsync'] - self.use_dd = not self.opt['--disable-dd'] + self.use_dd = self.opt['--use-dd'] GfURL.use_dd(self.use_dd) self.sync_infodb = self.opt['--generate-db'] @@ -5669,7 +5669,7 @@ Options: --disable-gfarm-command disable the use of gfreg and gfexport for tar files on gfarm2fs --disable-fsync disable calling fsync() before close() - --disable-dd use Python for local I/O instead of dd command + --use-dd use dd for local I/O instead of Python --gfsched-interval=SEC interval of updating candidate hosts to write (for Gfarm URL only) [default: 120] --progress-interval=SEC interval of updating progress [default: 1.0] @@ -5741,7 +5741,7 @@ _schema = Schema({ '--xz-program': Or(str, None), '--disable-gfarm-command': bool, '--disable-fsync': bool, - '--disable-dd': bool, + '--use-dd': bool, '--gfsched-interval': Use(int), '--generate-db': bool, '--same-owner': bool, From a3c3bd35cd4137710c5471dd8f1cb81b0587636e Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sun, 15 Sep 2024 23:37:03 +0900 Subject: [PATCH 094/143] gfptar --use-dd: dd bs=64K --- gftool/gfptar/gfptar | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 1adafcb6c..0d08ce511 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2002,8 +2002,8 @@ class GfURLLocal(GfURL): class GfURLLocalDD(GfURLLocal): def dd_read(self, textmode=False): - # dd if=file - args = ['dd', 'if=' + self.url_str] + # dd if=file bs=64K + args = ['dd', 'if=' + self.url_str, 'bs=64K'] logger.debug('%s', args) if textmode: encoding = get_encoding() @@ -2035,8 +2035,8 @@ class GfURLLocalDD(GfURLLocal): self.url.utime(mtime, mtime) self.url.chown(user, group) - # dd of=file - args = ['dd', 'of=' + self.url_str] + # dd of=file bs=64K + args = ['dd', 'of=' + self.url_str, 'bs=64K'] if use_fsync: args.append('conv=fdatasync') logger.debug('%s', args) From 15bf990150fbb88b23ceb72b645201a9c208a9d5 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 16 Sep 2024 19:37:55 +0900 Subject: [PATCH 095/143] gfptar: change the order of "Usage" --- gftool/gfptar/gfptar | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 0d08ce511..59e6ede83 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5708,8 +5708,8 @@ Options: Usage: {f} [options] [--exclude=PATTERN]... -c OUTDIR [-C DIR] [--] MEMBER... {f} [options] [--exclude=PATTERN]... -r OUTDIR [-C DIR] [--] MEMBER... - {f} [options] [--exclude=PATTERN]... --resume OUTDIR [-C DIR] [--] MEMBER... {f} [options] [--exclude=PATTERN]... -u OUTDIR [-C DIR] [--] MEMBER... + {f} [options] [--exclude=PATTERN]... --resume OUTDIR [-C DIR] [--] MEMBER... {f} [options] [--exclude=PATTERN]... -x OUTDIR [--] INDIR [MEMBER...] 
{f} [options] -t DIR {f} [options] --test From fdb7c6ad75c7263c065bc67e6cd7d046680d83f1 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 16 Sep 2024 19:42:32 +0900 Subject: [PATCH 096/143] gfptar: remove --use-dd (not useful) --- gftool/gfptar/gfptar | 99 +------------------------------------------- 1 file changed, 2 insertions(+), 97 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 59e6ede83..6a931f430 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -907,29 +907,18 @@ class GfURLEntry(DBObj): class GfURL(metaclass=abc.ABCMeta): MAXNAMLEN = 255 # SEE ALSO: dirent.h, gfarm/gfs.h (GFS_MAXNAMLEN) - USE_DD = False @classmethod def shutup_stderr(cls): shutup_stderr() - @classmethod - def use_dd(cls, enable): - cls.USE_DD = enable - @classmethod def init(cls, url, use_gfarm_command=False, local=False, dry_run=False): if local: - if cls.USE_DD: - return GfURLLocalDD(url, dry_run) - else: - return GfURLLocal(url, dry_run) + return GfURLLocal(url, dry_run) if GfURLGfarm.is_my_URL(url): return GfURLGfarm(url, dry_run) - if cls.USE_DD: - gfurl1 = GfURLLocalDD(url, dry_run) - else: - gfurl1 = GfURLLocal(url, dry_run) + gfurl1 = GfURLLocal(url, dry_run) if not use_gfarm_command: return gfurl1 if dry_run: # cannot resolve the Gfarm URL from gfarm2fs @@ -2000,86 +1989,6 @@ class GfURLLocal(GfURL): self.chmod(mode, mtime=mtime, user=user, group=group) -class GfURLLocalDD(GfURLLocal): - def dd_read(self, textmode=False): - # dd if=file bs=64K - args = ['dd', 'if=' + self.url_str, 'bs=64K'] - logger.debug('%s', args) - if textmode: - encoding = get_encoding() - else: - encoding = None - return subprocess.Popen( - args, shell=False, encoding=encoding, close_fds=True, - stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL) # shut up stderr - - def dd_write(self, textmode=False, use_fsync=True, - mode=None, mtime=None, - user=None, group=None, hostname=None): - class DDWrite(): - def __init__(self, url, proc, stdin): - self.url = url - self.proc = proc - self.stdin = stdin - - def close(self): - self.stdin.close() - - def post(self): - # must be called after wait() - if mode is not None: - self.url.chmod(mode, mtime=mtime, user=user, group=group) - else: - if mtime is not None: - self.url.utime(mtime, mtime) - self.url.chown(user, group) - - # dd of=file bs=64K - args = ['dd', 'of=' + self.url_str, 'bs=64K'] - if use_fsync: - args.append('conv=fdatasync') - logger.debug('%s', args) - if textmode: - encoding = get_encoding() - else: - encoding = None - proc = subprocess.Popen( - args, shell=False, encoding=encoding, close_fds=True, - stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) # shut up stderr - return DDWrite(self, proc, proc.stdin) - - @contextmanager - def readopen(self, textmode=False): - proc = self.dd_read(textmode=textmode) - try: - yield proc.stdout - finally: - proc.stdout.close() - ret = proc.wait() - if ret != 0: - raise GfException('{}: returncode={}'.format( - ' '.join(proc.args), ret)) - - @contextmanager - def writeopen(self, textmode=False, mode=0o600, mtime=None, - user=None, group=None, use_fsync=True, hostname=None): - dd_obj = self.dd_write(textmode=textmode, use_fsync=use_fsync, - mode=mode, mtime=mtime, - user=user, group=group, hostname=hostname) - proc = dd_obj.proc - try: - yield dd_obj.stdin - finally: - dd_obj.close() - ret = proc.wait() - if ret != 0: - raise GfException('{}: returncode={}'.format( - ' '.join(proc.args), ret)) - dd_obj.post() - - class Compress: TYPE_NO = 
'no'
     TYPE_GZIP = 'gz'
@@ -2578,8 +2487,6 @@ class GfptarProgram(Program):
             self.max_workers = 1
         self.bufsize = self.opt['--bufsize']
         self.use_fsync = not self.opt['--disable-fsync']
-        self.use_dd = self.opt['--use-dd']
-        GfURL.use_dd(self.use_dd)
         self.sync_infodb = self.opt['--generate-db']
         self.workdir = self.opt['--workdir']
 
@@ -5669,7 +5576,6 @@ Options:
     --disable-gfarm-command disable the use of gfreg and gfexport
                             for tar files on gfarm2fs
     --disable-fsync         disable calling fsync() before close()
-    --use-dd                use dd for local I/O instead of Python
     --gfsched-interval=SEC  interval of updating candidate hosts to write
                             (for Gfarm URL only) [default: 120]
     --progress-interval=SEC interval of updating progress [default: 1.0]
@@ -5741,7 +5647,6 @@ _schema = Schema({
     '--xz-program': Or(str, None),
     '--disable-gfarm-command': bool,
     '--disable-fsync': bool,
-    '--use-dd': bool,
     '--gfsched-interval': Use(int),
     '--generate-db': bool,
     '--same-owner': bool,

From c58e08fa50938027f316dc26d36a9ea5c024b8df Mon Sep 17 00:00:00 2001
From: Takuya Ishibashi
Date: Mon, 16 Sep 2024 21:59:37 +0900
Subject: [PATCH 097/143] gfptar --extract: overwrite read-only files
 gfptar: print message to *.lst

---
 gftool/gfptar/gfptar | 53 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar
index 6a931f430..a5a795a24 100755
--- a/gftool/gfptar/gfptar
+++ b/gftool/gfptar/gfptar
@@ -731,6 +731,11 @@ def convert_message(error):
     if hasattr(error, 'filename') and hasattr(error, 'strerror'):
         if error.filename is not None and error.strerror is not None:
             return f'{error.filename}: {error.strerror}'
+    if error.__context__:
+        # "During handling of the above exception, another exception occurred"
+        error = error.__context__
+        # if error.__cause__:
+        #     error = error.__cause__
     if isinstance(error, GfException):  # custom errors
         return str(error)
     else:
@@ -1190,8 +1195,8 @@ class GfURL(metaclass=abc.ABCMeta):
         finally:
             f.close()
 
-    def copy_from(self, inf, bufsize, mode=0o600, mtime=None,
-                  user=None, group=None, use_fsync=True, hostname=None):
+    def copy_from0(self, inf, bufsize, mode, mtime,
+                   user, group, use_fsync, hostname):
         readlen = 0
         with self.writeopen(mode=mode, mtime=mtime,
                             user=user, group=group,
@@ -1205,6 +1210,20 @@ class GfURL(metaclass=abc.ABCMeta):
                 readlen += wlen
         return readlen
 
+    def copy_from(self, inf, bufsize, overwrite=False, mode=0o600, mtime=None,
+                  user=None, group=None, use_fsync=True, hostname=None):
+        try:
+            return self.copy_from0(inf, bufsize, mode, mtime,
+                                   user, group, use_fsync, hostname)
+        except Exception:
+            if not overwrite:
+                raise
+            if self.is_writable():
+                raise
+            self.chmod(mode | 0o200)
+            return self.copy_from0(inf, bufsize, mode, mtime,
+                                   user, group, use_fsync, hostname)
+
     def sha256(self, bufsize=1048576):
         h = hashlib.sha256()
         with self.readopen() as f:
@@ -3116,7 +3135,7 @@ class GfptarProgram(Program):
         # for cmd_create
         tree = [
             (self.D, 'dir1', 0o715, 10, None, True),
-            (self.F, 'dir1/file1', 0o664, 10, None, True),
+            (self.F, 'dir1/file1', 0o644, 10, None, True),
             (self.F, 'dir1/file2', 0o664, 10, None, True),
             (self.S, 'dir1/symlink1', 0o777, 10, 'file1', True),
             (self.F, 'dir1/exclude1.ext1', 0o660, 10, None, True),
@@ -3127,7 +3146,7 @@ class GfptarProgram(Program):
         # for cmd_update
         tree = [
             (self.D, 'dir1', 0o750, 11, None, True),
-            (self.F, 'dir1/file1', 0o660, 11, None, True),
+            (self.F, 'dir1/file1', 0o444, 11, None, True),
             (self.F, 'dir1/file2', 0o664, 9, None, True),  # not updated
             (self.S,
'dir1/symlink1', 0o770, 11, 'file1', True), (self.F, 'dir1/exclude2.ext2', 0o660, 10, None, True), @@ -3150,7 +3169,7 @@ class GfptarProgram(Program): # expected tree tree = [ (self.D, 'dir1', 0o750, 11, None, True), - (self.F, 'dir1/file1', 0o660, 11, None, True), + (self.F, 'dir1/file1', 0o444, 11, None, True), (self.F, 'dir1/file2', 0o664, 10, None, True), (self.S, 'dir1/symlink1', 0o770, 11, 'file1', True), (self.D, 'dir2', 0o700, 10, None, True), @@ -4768,13 +4787,16 @@ class GfptarProgram(Program): inf = tar.extractfile(tarinfo) # io.BufferedReader logger.debug('extract,file(before): %s, %s', outurl_str, outfile) - size = outurl.copy_from(inf, self.bufsize, - mode=tarinfo.mode, - mtime=tarinfo.mtime, - user=user, group=group, - use_fsync=self.use_fsync, - hostname=target_host) - inf.close() + try: + size = outurl.copy_from(inf, self.bufsize, + overwrite=True, + mode=tarinfo.mode, + mtime=tarinfo.mtime, + user=user, group=group, + use_fsync=self.use_fsync, + hostname=target_host) + finally: + inf.close() logger.debug('extract,file: %s, %d', outfile, size) with self.lock(): self.info('extracted(F): {}', outfile) @@ -5060,7 +5082,12 @@ class GfptarProgram(Program): # int("0001") -> 1 dbgz_list.append((int(serial), int(gen), path)) continue - if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst, *.tar.lst + if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst + if self.progress_enabled: + logger.warning('') + logger.warning(f"{ent.path}:" + " This file (*.lst) is no longer required." + " Please remove it.") continue if not tar_pattern.match(base): # ignore not *.tar.* continue From 7036d35cb29f3daa2e021f4c8cf59e4d2c439064 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 16 Sep 2024 22:02:38 +0900 Subject: [PATCH 098/143] gfptar: add comment --- gftool/gfptar/gfptar | 1 + 1 file changed, 1 insertion(+) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index a5a795a24..2065074df 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -1111,6 +1111,7 @@ class GfURL(metaclass=abc.ABCMeta): def dryrun_remove(self): pass + # For test purposes only @abc.abstractmethod def remove_tree(self, remove_readonly=False): raise NotImplementedError From b6fa6d489d12169126dedc7122efdc9c30c971cf Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 16 Sep 2024 22:05:30 +0900 Subject: [PATCH 099/143] gfptar: simplify --- gftool/gfptar/gfptar | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 2065074df..1294ab47f 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -941,7 +941,7 @@ class GfURL(metaclass=abc.ABCMeta): scheme, host, path = self.parse(url) self._scheme = scheme self._host = host - self._path = path + self.path = os.path.normpath(path) self.dry_run = dry_run if dry_run: self.chmod = self.dryrun_chmod @@ -1016,10 +1016,6 @@ class GfURL(metaclass=abc.ABCMeta): def url_str(self): return self._url_str - @property - def path(self): - return os.path.normpath(self._path) - @property def root_url_str(self): # ex. 
gfarm://example.com/a/b/c -> gfarm://example.com From 52f9704e71ed1186ca8cc7290fdca8c62ebb1314 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Mon, 16 Sep 2024 22:38:38 +0900 Subject: [PATCH 100/143] gfptar: new option: --debug-sleep (delete --dummy-sleep) --- gftool/gfptar/gfptar | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 1294ab47f..51977e1eb 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -912,6 +912,7 @@ class GfURLEntry(DBObj): class GfURL(metaclass=abc.ABCMeta): MAXNAMLEN = 255 # SEE ALSO: dirent.h, gfarm/gfs.h (GFS_MAXNAMLEN) + DEBUG_SLEEP = 0 @classmethod def shutup_stderr(cls): @@ -1060,6 +1061,15 @@ class GfURL(metaclass=abc.ABCMeta): def is_my_URL(cls, url): raise NotImplementedError + @classmethod + def set_debug_sleep(cls, sec): + cls.DEBUG_SLEEP = sec + + def debug_sleep(self): + if self.DEBUG_SLEEP > 0: + # time.sleep(0): long time since Python 3.11 + time.sleep(self.DEBUG_SLEEP) + @abc.abstractmethod def chmod(self, mode, mtime=None, user=None, group=None, follow_symlinks=True): @@ -1186,6 +1196,7 @@ class GfURL(metaclass=abc.ABCMeta): @contextmanager def dryrun_writeopen(self, textmode=False, mode=0o600, mtime=None, user=None, group=None, use_fsync=True, hostname=None): + self.debug_sleep() f = open(os.devnull, 'wb') try: yield f @@ -1688,6 +1699,7 @@ class GfURLGfarm(GfURL): @contextmanager def readopen(self, textmode=False): + self.debug_sleep() proc = self.gfexport(textmode=textmode) try: yield proc.stdout @@ -1701,6 +1713,7 @@ class GfURLGfarm(GfURL): @contextmanager def writeopen(self, textmode=False, mode=0o600, mtime=None, user=None, group=None, use_fsync=True, hostname=None): + self.debug_sleep() # TODO XXX add gfreg option for use_fsync gfreg_obj = self.gfreg(textmode=textmode, mode=mode, mtime=mtime, user=user, group=group, hostname=hostname) @@ -1974,6 +1987,7 @@ class GfURLLocal(GfURL): @contextmanager def readopen(self, textmode=False): + self.debug_sleep() if textmode: f = open(self.url_str, 'rt', encoding=get_encoding()) else: @@ -1986,6 +2000,7 @@ class GfURLLocal(GfURL): @contextmanager def writeopen(self, textmode=False, mode=0o600, mtime=None, user=None, group=None, use_fsync=True, hostname=None): + self.debug_sleep() tmpmode = mode | 0o200 # necessary (Permission denied at ex.0o400) fd = os.open(path=self.url_str, flags=(os.O_WRONLY | os.O_CREAT | os.O_TRUNC), @@ -2071,7 +2086,6 @@ class Compress: class GfTarFile(tarfile.TarFile): ATTR_PROC_LIST = '_gfptar_proc_list' # [(proc, fileobj, fileobj), ...] 
ATTR_USE_FSYNC = 'use_fsync' - ATTR_DUMMY_SLEEP = 'dummy_sleep' METHOD_add_entry = 'add_entry' @classmethod @@ -2126,7 +2140,7 @@ class GfTarFile(tarfile.TarFile): @classmethod def create_open(cls, gfurl, compress_type, copybufsize, compress_prog=None, use_fsync=True, target_host=None, - dummy_input=False, dummy_sleep=0): + dummy_input=False): if compress_prog is None: compress_prog = Compress.compress_prog # use Stream (not seekable) @@ -2181,7 +2195,6 @@ class GfTarFile(tarfile.TarFile): setattr(tar, cls.ATTR_USE_FSYNC, use_fsync) if dummy_input: setattr(tar, cls.METHOD_add_entry, tar._add_entry_dummy) - setattr(tar, cls.ATTR_DUMMY_SLEEP, dummy_sleep) else: setattr(tar, cls.METHOD_add_entry, tar._add_entry) return tar @@ -2216,8 +2229,6 @@ class GfTarFile(tarfile.TarFile): if tarinfo is None: # warning, skip return if entry.is_file(): - if self.dummy_sleep > 0: - time.sleep(self.dummy_sleep) with RandomStream(entry.size) as f: self.addfile(tarinfo, fileobj=f) else: @@ -2551,6 +2562,9 @@ class GfptarProgram(Program): self.dry_run = self.opt['--dry-run'] + self.debug_sleep = self.opt['--debug-sleep'] + GfURL.set_debug_sleep(self.debug_sleep) + self.exclude_re_list = [] self.exclude = self.opt['--exclude'] if isinstance(self.exclude, str): @@ -3445,11 +3459,9 @@ class GfptarProgram(Program): tb = traceback.TracebackException.from_exception(exc) logger.info(''.join(tb.format())) - def list_dummy_files(self, base_dir, num, size_min, size_max, dummy_sleep): + def list_dummy_files(self, base_dir, num, size_min, size_max): # defaults files_per_dir = 1000 - # dummy_sleep_per_entry = dummy_sleep / 512 # for each readdir() - dummy_sleep_per_entry = 0 dir_min_depth = 5 dir_max_depth = 5 dir_min_length = 30 @@ -3503,9 +3515,6 @@ class GfptarProgram(Program): size, mtime, linkname) def rand_file(dir_path, idx): - if dummy_sleep_per_entry > 0: - # time.sleep(0): long time since Python 3.11 - time.sleep(dummy_sleep_per_entry) # f = generate_random_filename() f = f'{idx}.txt' path = os.path.join(dir_path, f) @@ -3568,7 +3577,6 @@ class GfptarProgram(Program): self.dummy_input = False self.dummy_size_min = self.opt['--dummy-size-min'] self.dummy_size_max = self.opt['--dummy-size-max'] - self.dummy_sleep = self.opt['--dummy-sleep'] if self.compress_type == Compress.TYPE_NO: self.split_size = self.assumed_size @@ -3678,8 +3686,7 @@ class GfptarProgram(Program): return self.list_dummy_files(gfurl.url_str, self.dummy_num, self.dummy_size_min, - self.dummy_size_max, - self.dummy_sleep) + self.dummy_size_max) else: return gfurl.listdir(recursive=True, first=True, hardlink_warn=self.hardlink_warn) @@ -4276,8 +4283,7 @@ class GfptarProgram(Program): self.bufsize, use_fsync=self.use_fsync, target_host=target_host, - dummy_input=self.dummy_input, - dummy_sleep=self.dummy_sleep) + dummy_input=self.dummy_input) # to reduce memory usage # SEE ALSO: InfoDB.generate_db_and_dbgz() db_name = InfoDB.dbgz_filename(gen, serial) @@ -5628,7 +5634,7 @@ Options: (default: disabled) --dummy-size-min=BYTES minimum size of dummy files [default: 0] --dummy-size-max=BYTES maximum size of dummy files [default: 0] - --dummy-sleep=SEC sleep time per dummy file [default: 0.0] + --debug-sleep=SEC sleep time per file for debug [default: 0.0] --dry-run not create output files -q, --quiet quiet messages -v, --verbose verbose output @@ -5686,7 +5692,7 @@ _schema = Schema({ '--dummy-num': Or(Use(unhumanize_number), None), '--dummy-size-min': Use(unhumanize_number), '--dummy-size-max': Use(unhumanize_number), - '--dummy-sleep': Use(float), + 
'--debug-sleep': Use(float), '--dry-run': bool, '--quiet': bool, '--verbose': bool, From bb9c434f7160dca60c1553dd0b059d836dad5d62 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 18 Sep 2024 06:50:05 +0900 Subject: [PATCH 101/143] gfptar: fix race condition --- gftool/gfptar/gfptar | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 51977e1eb..ba5465104 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -731,14 +731,14 @@ def convert_message(error): if hasattr(error, 'filename') and hasattr(error, 'strerror'): if error.filename is not None and error.strerror is not None: return f'{error.filename}: {error.strerror}' - if error.__context__: - # "During handling of the above exception, another exception occurred" - error = error.__context__ - # if error.__cause__: - # error = error.__cause__ if isinstance(error, GfException): # custom errors return str(error) else: + # "During handling of the above exception, another exception occurred" + if error.__context__: + error = error.__context__ + # if error.__cause__: + # error = error.__cause__ return f'{error.__class__.__name__}: {str(error)}' @@ -1226,6 +1226,8 @@ class GfURL(metaclass=abc.ABCMeta): except Exception: if not overwrite: raise + if not self.exists(): + raise if self.is_writable(): raise self.chmod(mode | 0o200) @@ -1833,7 +1835,12 @@ class GfURLLocal(GfURL): for p in reversed(list(self.parent_iter)): if p.exists(): continue - os.mkdir(p.url_str, mode | 0o700) + try: + os.mkdir(p.url_str, mode | 0o700) + except FileExistsError: # race condition + pass # ignored + except Exception: + raise os.mkdir(self.url_str, mode) else: os.mkdir(self.url_str, mode) @@ -4631,19 +4638,19 @@ class GfptarProgram(Program): if parent_url.path in created_set: # parent exists dir_url.mkdir() created_set.add(dir_url.path) - self.info('created(D): {}', dir_url.path) + self.info('created(D): {}', dir_url.url_str) else: # no parent dir_url.makedirs() created_set.add(dir_url.path) - self.info('created(D): {}', dir_url.path) + self.info('created(D): {}', dir_url.url_str) created_set.add(parent_url.path) - self.info('created(D): {}', parent_url.path) + self.info('created(D): {}', parent_url.url_str) for p in parent_url.parent_iter: path = p.path if path == '.' 
or path == '/': continue created_set.add(path) - self.info('created(D): {}', path) + self.info('created(D): {}', p.url_str) def update_stat_for_directories(self, directory_set): logger.debug('update_stat_for_directories') @@ -4765,7 +4772,7 @@ class GfptarProgram(Program): # create directories if not exist if tarinfo.isdir(): dir_url = outurl - dir_path = outfile + dir_path = outurl.path else: dir_url = outurl.parent dir_path = dir_url.path @@ -4775,7 +4782,7 @@ class GfptarProgram(Program): if not dir_url.exists(): with ignore_exception(True): # may be race condition dir_url.makedirs() # default 0700 - self.info('created(D): {}', dir_path) + self.info('created(D): {}', dir_url.url_str) with self.lock(): self.created_directory_set.add(dir_path) From 25a01a106ea4b3e684077c90b572aa3de9365df8 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 18 Sep 2024 07:22:51 +0900 Subject: [PATCH 102/143] gfptar: update help message --- gftool/gfptar/gfptar | 1 + 1 file changed, 1 insertion(+) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index ba5465104..ae57fadab 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5580,6 +5580,7 @@ Options: -c, --create=OUTDIR create tar files in OUTDIR from MEMBERs -r, --append=OUTDIR append files (create new tar files) -u, --update=OUTDIR append files newer than same entries in tar files + (Large amounts of space in --workdir may be used) --resume=OUTDIR same as --update -x, --extract=OUTDIR extract all members or specified MEMBERs from INDIR to OUTDIR From ccd4b7a0ab292fa63d7eefab181a28f9c8d8d7fd Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 18 Sep 2024 08:15:59 +0900 Subject: [PATCH 103/143] gfptar: update comment --- gftool/gfptar/gfptar | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index ae57fadab..3b318977f 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -927,7 +927,10 @@ class GfURL(metaclass=abc.ABCMeta): gfurl1 = GfURLLocal(url, dry_run) if not use_gfarm_command: return gfurl1 - if dry_run: # cannot resolve the Gfarm URL from gfarm2fs + if dry_run: + # cannot resolve the Gfarm URL from gfarm2fs + # when creating files. + # Because the parent directory is not created. 
return gfurl1 # use_gfarm_command=True: use gf* commands on gfarm2fs gfurl2 = gfurl1.get_gfarm_url_by_gfarm2fs() From ae4918c5d5d52dba50b34ba7811c20f1e499c435 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 18 Sep 2024 09:46:33 +0900 Subject: [PATCH 104/143] gfptar: f' -> f" --- gftool/gfptar/gfptar | 289 +++++++++++++++++++++---------------------- 1 file changed, 142 insertions(+), 147 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 3b318977f..08d8e717a 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -91,7 +91,7 @@ def humanize_number(num, binary_prefix=False): d = n.quantize(Decimal('0.0'), rounding=ROUND_DOWN) else: d = n.quantize(Decimal('0'), rounding=ROUND_DOWN) - return f'{d}{units[scale]}' + return f"{d}{units[scale]}" def unhumanize_number(numstr, binary_prefix=False): @@ -271,7 +271,7 @@ class DBCollection: raise NotImplementedError def clear(self): - self.con.execute(f'DROP TABLE IF EXISTS {self.table_name}') + self.con.execute(f"DROP TABLE IF EXISTS {self.table_name}") self.create_table() def commit(self): @@ -287,26 +287,26 @@ class DBCollection: self.db.unlink() def __len__(self): - res = self.con.execute(f'SELECT COUNT(*) FROM {self.table_name}') + res = self.con.execute(f"SELECT COUNT(*) FROM {self.table_name}") return res.fetchone()[0] class DBDict(DBCollection): def create_table(self): - self.con.execute(f''' + self.con.execute(f""" CREATE TABLE IF NOT EXISTS {self.table_name} (key TEXT PRIMARY KEY, value TEXT) - ''') + """) self.for_dict = True def __setitem__(self, key, value): - self.con.execute(f''' + self.con.execute(f""" INSERT OR REPLACE INTO {self.table_name} (key, value) VALUES (?, ?) - ''', (key, self.obj2txt(value))) + """, (key, self.obj2txt(value))) def __getitem__(self, key): res = self.con.execute( - f'SELECT value FROM {self.table_name} WHERE key = ?', (key,)) + f"SELECT value FROM {self.table_name} WHERE key = ?", (key,)) row = res.fetchone() if row: return self.txt2obj(key, row[0]) @@ -316,13 +316,13 @@ class DBDict(DBCollection): def __delitem__(self, key): if key in self: # __contains__() self.con.execute( - f'DELETE FROM {self.table_name} WHERE key = ?', (key,)) + f"DELETE FROM {self.table_name} WHERE key = ?", (key,)) else: raise KeyError(key) def __contains__(self, key): res = self.con.execute( - f'SELECT 1 FROM {self.table_name} WHERE key = ?', (key,)) + f"SELECT 1 FROM {self.table_name} WHERE key = ?", (key,)) row = res.fetchone() return row is not None @@ -332,17 +332,17 @@ class DBDict(DBCollection): return default def keys(self): - res = self.con.execute(f'SELECT key FROM {self.table_name}') + res = self.con.execute(f"SELECT key FROM {self.table_name}") for row in res: yield row[0] def values(self): - res = self.con.execute(f'SELECT key,value FROM {self.table_name}') + res = self.con.execute(f"SELECT key,value FROM {self.table_name}") for row in res: yield self.txt2obj(row[0], row[1]) def items(self): - res = self.con.execute(f'SELECT key,value FROM {self.table_name}') + res = self.con.execute(f"SELECT key,value FROM {self.table_name}") for row in res: yield row[0], self.txt2obj(row[0], row[1]) @@ -358,7 +358,7 @@ class DBDict(DBCollection): # sort: None, 'ASC', 'DESC' def iterator(self, like=None, sort=None, offset=0, limit=-1): - sql = f'SELECT key,value FROM {self.table_name}' + sql = f"SELECT key,value FROM {self.table_name}" if like is not None: sql += " WHERE key LIKE ? 
ESCAPE '\\'" if sort is not None: @@ -367,8 +367,8 @@ class DBDict(DBCollection): elif sort.upper() == 'DESC': sql += ' ORDER BY key DESC' else: - raise AssertionError(f'unknown sort type: {str(sort)}') - sql += f' LIMIT {limit} OFFSET {offset}' + raise AssertionError(f"unknown sort type: {str(sort)}") + sql += f" LIMIT {limit} OFFSET {offset}" if like is not None: res = self.con.execute(sql, (like,)) else: @@ -387,10 +387,10 @@ class DBDict(DBCollection): class DBSet(DBCollection): def create_table(self): - self.con.execute(f''' + self.con.execute(f""" CREATE TABLE IF NOT EXISTS {self.table_name} (key TEXT PRIMARY KEY) - ''') + """) self.for_dict = False def update(self, others): @@ -398,45 +398,45 @@ class DBSet(DBCollection): self.add(key) def add(self, key): - self.con.execute(f''' + self.con.execute(f""" INSERT OR IGNORE INTO {self.table_name} (key) VALUES (?) - ''', (self.obj2txt(key),)) + """, (self.obj2txt(key),)) def remove(self, key): if key in self: - self.con.execute(f'DELETE FROM {self.table_name} WHERE key = ?', + self.con.execute(f"DELETE FROM {self.table_name} WHERE key = ?", (self.obj2txt(key),)) else: raise KeyError(key) def discard(self, key): - self.con.execute(f'DELETE FROM {self.table_name} WHERE key = ?', + self.con.execute(f"DELETE FROM {self.table_name} WHERE key = ?", (self.obj2txt(key),)) def __delitem__(self, key): self.remove(key) def __contains__(self, key): - res = self.con.execute(f''' + res = self.con.execute(f""" SELECT 1 FROM {self.table_name} WHERE key = ? - ''', (self.obj2txt(key),)) + """, (self.obj2txt(key),)) row = res.fetchone() return row is not None def __iter__(self): - res = self.con.execute(f'SELECT key FROM {self.table_name}') + res = self.con.execute(f"SELECT key FROM {self.table_name}") for row in res: yield self.txt2obj(row[0], row[0]) # sort: None, 'ASC', 'DESC' def iterator(self, sort=None, offset=0, limit=-1): - sql = f'SELECT key FROM {self.table_name}' + sql = f"SELECT key FROM {self.table_name}" if sort is not None: if sort.upper() == 'ASC': sql += ' ORDER BY key ASC' elif sort.upper() == 'DESC': sql += ' ORDER BY key DESC' - sql += f' LIMIT {limit} OFFSET {offset}' + sql += f" LIMIT {limit} OFFSET {offset}" res = self.con.execute(sql) for row in res: yield self.txt2obj(row[0], row[0]) @@ -446,10 +446,10 @@ class DBList(DBCollection): ERRMSG_INDEX = 'list index out of range' def create_table(self): - self.con.execute(f''' + self.con.execute(f""" CREATE TABLE IF NOT EXISTS {self.table_name} (id INTEGER PRIMARY KEY AUTOINCREMENT, value TEXT) - ''') + """) self.for_dict = False def __repr__(self): @@ -459,18 +459,18 @@ class DBList(DBCollection): return str(list(self)) def append(self, obj): - self.con.execute(f''' + self.con.execute(f""" INSERT INTO {self.table_name} (value) VALUES (?) - ''', (self.obj2txt(obj),)) + """, (self.obj2txt(obj),)) def extend(self, lst): for obj in lst: self.append(obj) def __getitem__(self, index): - res = self.con.execute(f''' + res = self.con.execute(f""" SELECT id,value FROM {self.table_name} ORDER BY id LIMIT 1 OFFSET ? - ''', (index,)) + """, (index,)) row = res.fetchone() if row is None: raise IndexError(self.ERRMSG_INDEX) @@ -478,51 +478,51 @@ class DBList(DBCollection): def __setitem__(self, index, value): with self.con: - res = self.con.execute(f''' + res = self.con.execute(f""" SELECT id FROM {self.table_name} ORDER BY id LIMIT 1 OFFSET ? 
- ''', (index,)) + """, (index,)) row = res.fetchone() if row is None: raise IndexError(self.ERRMSG_INDEX) - self.con.execute(f''' + self.con.execute(f""" UPDATE {self.table_name} SET value = ? WHERE id = ? - ''', (self.obj2txt(value), row[0])) + """, (self.obj2txt(value), row[0])) def __delitem__(self, index): with self.con: - res = self.con.execute(f''' + res = self.con.execute(f""" SELECT id FROM {self.table_name} ORDER BY id LIMIT 1 OFFSET ? - ''', (index,)) + """, (index,)) row = res.fetchone() if row is None: raise IndexError(self.ERRMSG_INDEX) - self.con.execute(f''' + self.con.execute(f""" DELETE FROM {self.table_name} WHERE id = ? - ''', (row[0],)) + """, (row[0],)) def __iter__(self): - res = self.con.execute(f''' + res = self.con.execute(f""" SELECT id,value FROM {self.table_name} ORDER BY id ASC - ''') + """) for row in res: yield self.txt2obj(row[0], row[1]) def __reversed__(self): - res = self.con.execute(f''' + res = self.con.execute(f""" SELECT id,value FROM {self.table_name} ORDER BY id DESC - ''') + """) for row in res: yield self.txt2obj(row[0], row[1]) # sort: None, 'ASC', 'DESC' def iterator(self, sort=None, offset=0, limit=-1): - sql = f'SELECT id,value FROM {self.table_name}' + sql = f"SELECT id,value FROM {self.table_name}" if sort is not None: if sort.upper() == 'ASC': sql += ' ORDER BY id ASC' elif sort.upper() == 'DESC': sql += ' ORDER BY id DESC' - sql += f' LIMIT {limit} OFFSET {offset}' + sql += f" LIMIT {limit} OFFSET {offset}" res = self.con.execute(sql) for row in res: yield self.txt2obj(row[0], row[1]) @@ -668,7 +668,7 @@ def execcmd(args, stdin=subprocess.DEVNULL, stderr=subprocess.PIPE, except Exception: logger.debug('cannot decode: err={}', err) pass - msg = f'{str(args)} (exit={ret}): {err}' + msg = f"{str(args)} (exit={ret}): {err}" if err and use_stderr: logger.debug(msg) raise GfException(msg) @@ -688,12 +688,12 @@ def execcmd_readline(args, stdin=subprocess.DEVNULL): def handle_stderr(process): prefix = '' if logger.isEnabledFor(logging.INFO): - prefix = f'({args[0]}) ' + prefix = f"({args[0]}) " while True: eline = process.stderr.readline() if eline: eline = eline.rstrip('\r\n') - logger.warning(f'{prefix}{eline}') + logger.warning(f"{prefix}{eline}") elif process.poll() is not None: break @@ -730,7 +730,7 @@ def am_I_gfarmroot(): def convert_message(error): if hasattr(error, 'filename') and hasattr(error, 'strerror'): if error.filename is not None and error.strerror is not None: - return f'{error.filename}: {error.strerror}' + return f"{error.filename}: {error.strerror}" if isinstance(error, GfException): # custom errors return str(error) else: @@ -739,7 +739,7 @@ def convert_message(error): error = error.__context__ # if error.__cause__: # error = error.__cause__ - return f'{error.__class__.__name__}: {str(error)}' + return f"{error.__class__.__name__}: {str(error)}" class Program(metaclass=abc.ABCMeta): @@ -802,11 +802,11 @@ class GfURLEntry(DBObj): self.size = 0 def __str__(self): - return f'{self.path}, {self.mode:o}, {self.file_type}' + return f"{self.path}, {self.mode:o}, {self.file_type}" def __repr__(self): - return (f'Entry(path={self.path},mode={oct(self.mode)},' - f'user={self.uname},group={self.gname})') + return (f"Entry(path={self.path},mode={oct(self.mode)}," + f"user={self.uname},group={self.gname})") # only path must be specified for key when using DBDict @classmethod @@ -887,7 +887,7 @@ class GfURLEntry(DBObj): b = oct(b & 0o7777) if a != b: raise GfException( - f'{ent1.path} vs {ent2.path}: prop={name}: {a} != {b}') + 
f"{ent1.path} vs {ent2.path}: prop={name}: {a} != {b}") logger.debug('GfURLEntry.compare:prop=%s: PASS', name) def cmpprop(ent1, ent2, properties): @@ -905,7 +905,7 @@ class GfURLEntry(DBObj): url2 = GfURL.init(ent2.path) if not url1.compare_data(url2, bufsize=bufsize): raise GfException( - f'{ent1.path} vs {ent2.path}: different data') + f"{ent1.path} vs {ent2.path}: different data") logger.debug('GfURLEntry.compare:data: PASS') return True @@ -967,7 +967,7 @@ class GfURL(metaclass=abc.ABCMeta): supported_classes = [cls] for c in supported_classes: - if c.SCHEME_LEN > 0 and url.startswith(f'{c.SCHEME}:'): + if c.SCHEME_LEN > 0 and url.startswith(f"{c.SCHEME}:"): # gfarm://host/path -> //host/path hostpath = url[(c.SCHEME_LEN+1):] scheme = c.SCHEME @@ -1026,16 +1026,16 @@ class GfURL(metaclass=abc.ABCMeta): # ex. gfarm:/abc.def/gh -> gfarm: if self._scheme: if self._host: - return f'{self._scheme}:{self._host}' + return f"{self._scheme}:{self._host}" else: - return f'{self._scheme}:' + return f"{self._scheme}:" return '' def subpath(self, fullpath): base = self.url_str if not fullpath.startswith(base): logger.error('subpath: %s, %s', base, fullpath) - raise AssertionError(f'base={base}, fullpath={fullpath}') + raise AssertionError(f"base={base}, fullpath={fullpath}") # logger.debug('subpath: %s, %s', base, fullpath) return fullpath[len(base):].lstrip('/') # relative path @@ -1499,7 +1499,7 @@ class GfURLGfarm(GfURL): if ret == 0: val = int.from_bytes(out, byteorder='big') return val & mode > 0 - raise GfException(f'{self.url_str}: gfarm.effective_perm: {err}') + raise GfException(f"{self.url_str}: gfarm.effective_perm: {err}") def is_readable(self): return self.access(self.R_OK) @@ -1621,8 +1621,8 @@ class GfURLGfarm(GfURL): # ex. gfarm:/home/user1/dir: -> gfarm:/home/user1/dir dirname = line[:-1] if not dirname.startswith(self.url_str): - raise AssertionError(f'{dirname}.startswith(' - f'{self.url_str})') + raise AssertionError(f"{dirname}.startswith(" + f"{self.url_str})") first = False def gfsched(self, is_file=False, write_mode=False, number=None): @@ -2056,7 +2056,7 @@ class Compress: if compress_prog is not None: compress_prog0 = shutil.which(compress_prog) if compress_prog0 is None: - raise GfptarError(f'{compress_prog}: command not found.') + raise GfptarError(f"{compress_prog}: command not found.") compress_prog = compress_prog0 else: if compress_type == cls.TYPE_GZIP: @@ -2361,7 +2361,7 @@ class TestGfptar(unittest.TestCase): def test_DBDict(self): euid = os.geteuid() - tmpdir = tempfile.TemporaryDirectory(prefix=f'gfptar-test-{euid}-', + tmpdir = tempfile.TemporaryDirectory(prefix=f"gfptar-test-{euid}-", dir=None) testdb = DB(os.path.join(tmpdir.name, 'test.db')) d = DBDict(testdb, IntObj, 'test_dict') @@ -2403,7 +2403,7 @@ class TestGfptar(unittest.TestCase): def test_DBSet(self): euid = os.geteuid() - tmpdir = tempfile.TemporaryDirectory(prefix=f'gfptar-test-{euid}-', + tmpdir = tempfile.TemporaryDirectory(prefix=f"gfptar-test-{euid}-", dir=None) testdb = DB(os.path.join(tmpdir.name, 'test.db')) s = DBSet(testdb, IntObj, 'test_set') @@ -2434,7 +2434,7 @@ class TestGfptar(unittest.TestCase): def test_DBList(self): euid = os.geteuid() - tmpdir = tempfile.TemporaryDirectory(prefix=f'gfptar-test-{euid}', + tmpdir = tempfile.TemporaryDirectory(prefix=f"gfptar-test-{euid}", dir=None) testdb = DB(os.path.join(tmpdir.name, 'test.db')) lst = DBList(testdb, IntObj, 'test_list') @@ -2466,18 +2466,18 @@ class CannotBeArchivedError(GfptarError): def __init__(self, files=0, directories=0): 
msg = '' if files >= 1: - msg += f'{files} file' + msg += f"{files} file" if files >= 2: msg += 's' if directories >= 1: if msg: msg += ' and ' - msg += f'{directories} ' + msg += f"{directories} " if directories >= 2: msg += 'directories' else: msg += 'directory' - self.message = f'{msg} cannot be archived' + self.message = f"{msg} cannot be archived" super().__init__(self.message) @@ -2532,7 +2532,7 @@ class GfptarProgram(Program): # dir=None: system default if self.tmpdir: self.tmpdir.cleanup() - self.tmpdir = tempfile.TemporaryDirectory(prefix=f'gfptar-{euid}-', + self.tmpdir = tempfile.TemporaryDirectory(prefix=f"gfptar-{euid}-", dir=self.workdir) self.sig_init() # to clean tmpdir @@ -2623,8 +2623,7 @@ class GfptarProgram(Program): def sig_init(self, message=True): def sig_handler(signum, frame): if message: - # logger.warning(f'Interrupt (signal={signum})') - sys.stderr.write(f'\nInterrupt (signal={signum})\n') + sys.stderr.write(f"\nInterrupt (signal={signum})\n") self.canceled = True signal.signal(signal.SIGHUP, sig_handler) @@ -2875,7 +2874,7 @@ class GfptarProgram(Program): self.workdir_local_url.mkdir() def test_invalid(self, name, src, dst, for_gfarm): - testname = f'invalid-{name}' + testname = f"invalid-{name}" self.test_workdir_set(testname, gfarm=for_gfarm, local=not for_gfarm) if for_gfarm: workdir = self.workdir_gfarm @@ -2909,7 +2908,7 @@ class GfptarProgram(Program): print(testname + ' ... FAIL (unexpected success)') def test_simple(self, name, use_all_files=False): - testname = f'simple-{name}' + testname = f"simple-{name}" self.test_workdir_set(testname) workdir_local = self.workdir_local workdir_gfarm = self.workdir_gfarm @@ -2937,7 +2936,7 @@ class GfptarProgram(Program): count2 = self.test_prepare_srcdir(srcdir_gfarm, readonly, noread, link, longname) if count1 != count2: - logger.error(f'count1({count1}) != count2({count2})') + logger.error(f"count1({count1}) != count2({count2})") logger.error_exit(1, testname + '(count1 != count2) ' + '... FAIL (test_prepare_srcdir is broken)') test1_name = 'test-1-create' @@ -2959,7 +2958,7 @@ class GfptarProgram(Program): except GfptarError as e: if not noread: raise - logger.info(f'IGNORED: {str(e)}') + logger.info(f"IGNORED: {str(e)}") # Gfarm(tar) -> Gfarm self.cmd_extract(test2_dir_gfarm, test1_dir_gfarm, []) # Gfarm -> Local(tar) @@ -2971,20 +2970,20 @@ class GfptarProgram(Program): except GfptarError as e: if not noread: raise - logger.info(f'IGNORED: {str(e)}') + logger.info(f"IGNORED: {str(e)}") # Local(tar) -> Local self.cmd_extract(test4_dir_local, test3_dir_local, []) # --list for gfarm count = self.cmd_list_simple(test1_dir_gfarm) if count != count1: - logger.error(f'count({count}) != count1({count1})') + logger.error(f"count({count}) != count1({count1})") logger.error_exit(1, testname + '(list_simple(test1_dir_gfarm)) ... FAIL') # --verbose --list for local count = self.cmd_list_verbose(test3_dir_local) if count != count1: - logger.error(f'count({count}) != count1({count1})') + logger.error(f"count({count}) != count1({count1})") logger.error_exit(1, testname + '(list_verbose(test3_dir_local)) ... FAIL') @@ -3033,7 +3032,7 @@ class GfptarProgram(Program): except GfptarError as e: if not noread: raise - logger.info(f'IGNORED: {str(e)}') + logger.info(f"IGNORED: {str(e)}") # Gfarm(tar) -> Local self.cmd_extract(test6_dir_local, test5_dir_gfarm, []) @@ -3099,7 +3098,7 @@ class GfptarProgram(Program): logger.error_exit(1, testname + '(not found in archive files) ' + '... 
FAIL (unexpected success)') except GfptarError as e: - logger.info(f'Expected error, ignored: {str(e)}') + logger.info(f"Expected error, ignored: {str(e)}") # check mode=0o700 for parents without tarinfo of file#2 gd1 = GfURL.init(os.path.join(test2_dir_gfarm, 'dir1/readonly')) @@ -3107,11 +3106,11 @@ class GfptarProgram(Program): gd1_mode = gd1.mode() if gd1_mode != 0o700: logger.error_exit( - 1, f'{testname} ... FAIL: gd1.mode={gd1_mode:#o}') + 1, f"{testname} ... FAIL: gd1.mode={gd1_mode:#o}") ld1_mode = ld1.mode() if ld1_mode != 0o700: logger.error_exit( - 1, f'{testname} ... FAIL: ld1.mode={ld1_mode:#o}') + 1, f"{testname} ... FAIL: ld1.mode={ld1_mode:#o}") for f in files: g_member = GfURL.init(os.path.join(test2_dir_gfarm, f)) @@ -3363,7 +3362,7 @@ class GfptarProgram(Program): url.hardlink(srcdir_url.url_join(linkname)) url.chmod(mode, mtime=mtime) else: - raise Exception(f'Unexpected ftype={ftype}') + raise Exception(f"Unexpected ftype={ftype}") if readable: count += 1 for ent in reversed(tree): @@ -3439,7 +3438,7 @@ class GfptarProgram(Program): diff_args += ['-r', dir1, dir2] out, err, ret = execcmd_raw(diff_args) if ret != 0: - logger.error(f'diff -r {dir1} {dir2}: {out}: {err}') + logger.error(f"diff -r {dir1} {dir2}: {out}: {err}") return False logger.debug('diff -r (data check): PASS') return self.test_compare(dir1, dir2, data=data, same_owner=same_owner, @@ -3501,7 +3500,7 @@ class GfptarProgram(Program): file_min_length, file_max_length))) + '.' + suffix def rand_dir(first_dir_index): - first_dir = f'dir{first_dir_index:04d}' + first_dir = f"dir{first_dir_index:04d}" mode = 0o755 file_type = GfURLEntry.TYPE_DIR @@ -3526,7 +3525,7 @@ class GfptarProgram(Program): def rand_file(dir_path, idx): # f = generate_random_filename() - f = f'{idx}.txt' + f = f"{idx}.txt" path = os.path.join(dir_path, f) mode = 0o600 file_type = GfURLEntry.TYPE_FILE @@ -3737,7 +3736,7 @@ class GfptarProgram(Program): tmpdir = self.tmpdir tardb_prefix = os.path.join(tmpdir.name, self.cmd_name) - tardb_fmt = f'_{self.SERIAL_FORMAT}.db' + tardb_fmt = f"_{self.SERIAL_FORMAT}.db" # to reduce memory usage filelist_db = DB(tardb_prefix + tardb_fmt % serial) filelist = DBList(filelist_db, GfURLEntry, self.TABLE_tar_entry) @@ -4101,7 +4100,7 @@ class GfptarProgram(Program): inq.put((self.MSG_PARENT_START, gen, serial, arg)) response = outq.get() if response != self.MSG_CHILD_READY: - logger.error(f'Unexpected response from child process: {response}') + logger.error(f"Unexpected response from child process: {response}") return try: @@ -4122,7 +4121,7 @@ class GfptarProgram(Program): break if result is None: logger.debug('waiting for message from child process:' - f' serial={serial}') + f" serial={serial}") continue if len(result) == 0: logger.warning('unknown result (None) from child process') @@ -4156,13 +4155,13 @@ class GfptarProgram(Program): inq.put(self.MSG_PARENT_ERROR_COMPLETE) if self.verbose or self.debug: raise GfException( - f'{exc_type_name}: {exc_value_str}\n' - f'{"".join(exc_traceback_str)}') + f"{exc_type_name}: {exc_value_str}\n" + f"{''.join(exc_traceback_str)}") else: - raise GfException(f'{exc_type_name}: {exc_value_str}') + raise GfException(f"{exc_type_name}: {exc_value_str}") else: logger.error('Unexpected message from child process:' - f' {result}') + f" {result}") break finally: logger.debug('(parent) subprocess finished: serial=%d', serial) @@ -4235,7 +4234,7 @@ class GfptarProgram(Program): if len(filelist) == 0: tardb.close() tardb.unlink() - logger.warning(f'empty filelist: 
{dbfile}') + logger.warning(f"empty filelist: {dbfile}") return 0, 0, '', '' first = None last = None @@ -4257,10 +4256,10 @@ class GfptarProgram(Program): lastpath = last.subpath(self.basedir_url) outname = '%s..%s%s' % (firstpath, lastpath, self.suffix) - serial_str = f'{self.SERIAL_FORMAT}_' % serial + serial_str = f"{self.SERIAL_FORMAT}_" % serial if gen >= 2: # SEE ALSO: InfoDB.dbgz_filename() - prefix_str = f'g{gen}_{serial_str}' + prefix_str = f"g{gen}_{serial_str}" else: prefix_str = serial_str outname_max = self.outdir_url.MAXNAMLEN \ @@ -4632,7 +4631,7 @@ class GfptarProgram(Program): url_str = self.outdir_url.url_join(d) dir_url = GfURL.init(url_str, dry_run=self.dry_run) logger.debug('extract_directories_fast: ' - f'out_url={self.outdir_url.url_str}, d={d}') + f"out_url={self.outdir_url.url_str}, d={d}") # url.path is normalized if dir_url.path in created_set: logger.debug('skip (already created): %s', url_str) @@ -4751,8 +4750,8 @@ class GfptarProgram(Program): raise except Exception as e: logger.debug('tar.next()', exc_info=e) - logger.warning(f'{target}: SKIPPED: invalid or empty tar: ' - f' {str(e)}') + logger.warning(f"{target}: SKIPPED: invalid or empty tar: " + f" {str(e)}") tarinfo = None if tarinfo is None: break @@ -4875,17 +4874,17 @@ class GfptarProgram(Program): if self.selected_num > 0: selected_num_str = self._humanize(self.selected_num) selected_size_str = self._humanize(self.selected_size) - sys.stdout.write(f'\rschedule: ' - f'{selected_size_str}/{total_size_str}B ' - f'{selected_num_str}/{total_num_str}Ent ' - f'{sec_str} ' - f'{ent_per_sec_str}Ent/s') + sys.stdout.write(f"\rschedule: " + f"{selected_size_str}/{total_size_str}B " + f"{selected_num_str}/{total_num_str}Ent " + f"{sec_str} " + f"{ent_per_sec_str}Ent/s") else: - sys.stdout.write(f'\rschedule: ' - f'{total_size_str}B ' - f'{total_num_str}Ent ' - f'{sec_str} ' - f'{ent_per_sec_str}Ent/s') + sys.stdout.write(f"\rschedule: " + f"{total_size_str}B " + f"{total_num_str}Ent " + f"{sec_str} " + f"{ent_per_sec_str}Ent/s") def progress_for_sync_infodb(self, now): sec = now - self.start_time @@ -4909,12 +4908,12 @@ class GfptarProgram(Program): name = 'sync-db' else: name = 'load-db' - sys.stdout.write(f'\r{name}: ' - f'{current_tar_num_str}/{total_tar_num_str}Tar ' - f'{current_size_str}B ' - f'{current_ent_num_str}Ent ' - f'{sec_str} ' - f'{ent_per_sec_str}Ent/s') + sys.stdout.write(f"\r{name}: " + f"{current_tar_num_str}/{total_tar_num_str}Tar " + f"{current_size_str}B " + f"{current_ent_num_str}Ent " + f"{sec_str} " + f"{ent_per_sec_str}Ent/s") # lock required def progress_for_create(self, now): @@ -4932,7 +4931,7 @@ class GfptarProgram(Program): percent = (percent1 + percent2) / 2 else: percent = percent1 - percent_str = f'{percent:.0f}' + percent_str = f"{percent:.0f}" if sec > 0: bytes_per_sec = self.stored_size / sec ent_per_sec = self.stored_num / sec @@ -4945,12 +4944,12 @@ class GfptarProgram(Program): total_size_str = self._humanize(self.total_size) bytes_per_sec_str = self._humanize(bytes_per_sec) ent_per_sec_str = self._humanize(ent_per_sec) - sys.stdout.write(f'\r{self.cmd_name}: {percent_str}% ' - f'{stored_size_str}/{total_size_str}B ' - f'{stored_num_str}/{total_num_str}Ent ' - f'{sec_str} ' - f'{bytes_per_sec_str}B/s ' - f'{ent_per_sec_str}Ent/s') + sys.stdout.write(f"\r{self.cmd_name}: {percent_str}% " + f"{stored_size_str}/{total_size_str}B " + f"{stored_num_str}/{total_num_str}Ent " + f"{sec_str} " + f"{bytes_per_sec_str}B/s " + f"{ent_per_sec_str}Ent/s") # lock required def 
progress_for_extract(self, now): @@ -4965,7 +4964,7 @@ class GfptarProgram(Program): percent = (percent1 + percent2) / 2 else: percent = percent1 - percent_str = f'{percent:.0f}' + percent_str = f"{percent:.0f}" if sec > 0: bytes_per_sec = self.extracted_size / sec ent_per_sec = self.extracted_num / sec @@ -4978,12 +4977,12 @@ class GfptarProgram(Program): extracted_size_str = self._humanize(self.extracted_size) bytes_per_sec_str = self._humanize(bytes_per_sec) ent_per_sec_str = self._humanize(ent_per_sec) - sys.stdout.write(f'\rextract: {percent_str}% ' - f'{extracted_size_str}/{total_size_str}B ' - f'{extracted_num_str}/{total_num_str}Ent ' - f'{sec_str} ' - f'{bytes_per_sec_str}B/s ' - f'{ent_per_sec_str}Ent/s') + sys.stdout.write(f"\rextract: {percent_str}% " + f"{extracted_size_str}/{total_size_str}B " + f"{extracted_num_str}/{total_num_str}Ent " + f"{sec_str} " + f"{bytes_per_sec_str}B/s " + f"{ent_per_sec_str}Ent/s") def cmd_list_simple(self, indir): return self.cmd_list(indir, verbose=False) @@ -5018,18 +5017,15 @@ class GfptarProgram(Program): name = name + ' -> ' + fattr.linkname dt_object = datetime.datetime.fromtimestamp(fattr.mtime) mtime_str = dt_object.strftime('%Y-%m-%d %H:%M') - # print(f'g{gen}_{serial:04} {fattr.ftype} {fattr.mode:04o}' - # f' {fattr.user:>8}/{fattr.group:<8}' - # f' {fattr.size:9d} {mtime_str} {name}', file=outf) outf.write('g%d_%04d %s %04o %8s/%-8s %9d %s %s\n' % ( gen, serial, fattr.ftype, fattr.mode, fattr.user, fattr.group, fattr.size, mtime_str, name)) def print_simple(serial, gen, path, fattr): if fattr.ftype == InfoDB.TYPE_DIR: - print(f'{fattr.ftype} {path}/', file=outf) + print(f"{fattr.ftype} {path}/", file=outf) else: - print(f'{fattr.ftype} {path}', file=outf) + print(f"{fattr.ftype} {path}", file=outf) if quiet: print_func = print_quiet @@ -5144,15 +5140,15 @@ class GfptarProgram(Program): dbgz_dict[serial] = (gen, dbgz_path) gen_tar = tar_dict.get(serial, None) if gen_tar is None: - logger.error(f'lost *.tar.* tied to {dbgz_path}') + logger.error(f"lost *.tar.* tied to {dbgz_path}") tar_dict[serial] = (gen, None) for serial, gen, tar_path in tar_list: gen_dbgz = dbgz_dict.get(serial, None) if gen_dbgz is None: dbgz_fname = InfoDB.dbgz_filename(gen, serial) - logger.info(f'lost {dbgz_fname} (auto re-creation)' - f' tied to {tar_path}') + logger.info(f"lost {dbgz_fname} (auto re-creation)" + f" tied to {tar_path}") dbgz_dict[serial] = (gen, None) # for progress @@ -5195,8 +5191,7 @@ class GfptarProgram(Program): def sig_handler(signum, frame): if share_cancel.value == 0: - # logger.warning(f'Interrupt (signal={signum}') - sys.stderr.write(f'\nInterrupt (signal={signum})\n') + sys.stderr.write(f"\nInterrupt (signal={signum})\n") share_cancel.value = 1 self.cancel() @@ -5223,7 +5218,7 @@ class GfptarProgram(Program): self.progress_for_sync_infodb(now) yield serial, gen, tar_path, dbgz_path, db_path except Exception as e: - # logger.error(f'{e}') + # logger.error(f"{e}") self.print_trace(e) share_cancel.value = 1 if self.save_e is None: @@ -5306,7 +5301,7 @@ class InfoDB: # gen (>= 1): the generation number # serial (>= 1): the serial number # ex. 
g2_0099_gfptar.db.gz - return f'g{gen}_{int(serial):04}_gfptar.db' + return f"g{gen}_{int(serial):04}_gfptar.db" @classmethod def dbgz_filename(cls, gen, serial): @@ -5402,7 +5397,7 @@ class InfoDB: def sig_handler(signum, frame): if not cls.signal_canceled: pid = os.getpid() - logger.info(f'Interrupt (signal={signum}) (PID={pid})') + logger.info(f"Interrupt (signal={signum}) (PID={pid})") cls.signal_canceled = True signal.signal(signal.SIGHUP, sig_handler) @@ -5467,8 +5462,8 @@ class InfoDB: except MemoryError: raise except Exception as e: - logger.warning(f'{tar_path}: SKIPPED:' - f' invalid or empty tar: {str(e)}') + logger.warning(f"{tar_path}: SKIPPED:" + f" invalid or empty tar: {str(e)}") t = None if t is None: break From d5ccb7555b9befbe3d41bc6eb2b8729265c4c180 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 18 Sep 2024 22:17:19 +0900 Subject: [PATCH 105/143] gfptar: change the format of progress --- gftool/gfptar/gfptar | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 08d8e717a..5f8f70b65 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4871,16 +4871,17 @@ class GfptarProgram(Program): else: ent_per_sec = 0 ent_per_sec_str = self._humanize(ent_per_sec) + name = "schedule" if self.selected_num > 0: selected_num_str = self._humanize(self.selected_num) selected_size_str = self._humanize(self.selected_size) - sys.stdout.write(f"\rschedule: " + sys.stdout.write(f"\r{name}: " f"{selected_size_str}/{total_size_str}B " f"{selected_num_str}/{total_num_str}Ent " f"{sec_str} " f"{ent_per_sec_str}Ent/s") else: - sys.stdout.write(f"\rschedule: " + sys.stdout.write(f"\r{name}: " f"{total_size_str}B " f"{total_num_str}Ent " f"{sec_str} " @@ -4905,15 +4906,19 @@ class GfptarProgram(Program): ent_per_sec = 0 ent_per_sec_str = self._humanize(ent_per_sec) if self.sync_infodb: - name = 'sync-db' + name = 'generate-db' + sys.stdout.write(f"\r{name}: " + f"{current_tar_num_str}/{total_tar_num_str}(DB) " + f"{current_size_str}B " + f"{current_ent_num_str}Ent " + f"{sec_str} " + f"{ent_per_sec_str}Ent/s") else: name = 'load-db' - sys.stdout.write(f"\r{name}: " - f"{current_tar_num_str}/{total_tar_num_str}Tar " - f"{current_size_str}B " - f"{current_ent_num_str}Ent " - f"{sec_str} " - f"{ent_per_sec_str}Ent/s") + sys.stdout.write(f"\r{name}: " + f"{current_tar_num_str}/{total_tar_num_str}(DB) " + f"{sec_str} " + f"{ent_per_sec_str}Ent/s") # lock required def progress_for_create(self, now): @@ -4977,7 +4982,8 @@ class GfptarProgram(Program): extracted_size_str = self._humanize(self.extracted_size) bytes_per_sec_str = self._humanize(bytes_per_sec) ent_per_sec_str = self._humanize(ent_per_sec) - sys.stdout.write(f"\rextract: {percent_str}% " + name = "extract" + sys.stdout.write(f"\r{name}: {percent_str}% " f"{extracted_size_str}/{total_size_str}B " f"{extracted_num_str}/{total_num_str}Ent " f"{sec_str} " From 769fc6e396b08677419a29f4e5ffd83a640af36b Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 18 Sep 2024 23:10:30 +0900 Subject: [PATCH 106/143] gfptar: update "Usage" --- gftool/gfptar/gfptar | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 5f8f70b65..86faf6575 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5584,17 +5584,22 @@ Options: -c, --create=OUTDIR create tar files in OUTDIR from MEMBERs -r, --append=OUTDIR append files (create new tar files) -u, --update=OUTDIR 
append files newer than same entries in tar files - (Large amounts of space in --workdir may be used) + (--workdir with sufficient space is required) + (Approximately 300MiB is used per 1000000 entries) --resume=OUTDIR same as --update -x, --extract=OUTDIR extract all members or specified MEMBERs from INDIR to OUTDIR -t, --list=DIR list the members of DIR (use with --verbose to see more details) -C, --basedir=DIR change to directory for MEMBERs [default: .] - --exclude= Exclude files matching wildcard patterns - (https://docs.python.org/ja/3/library/fnmatch.html) + --exclude=PATTERN Exclude files matching wildcard patterns + (https://docs.python.org/ja/3/library/fnmatch.html) -X, --exclude-from=FILE Exclude files matching wildcard patterns listed in FILE + --workdir=DIR local directory for temporary files + (default: system temporary directory) + --max-entries-per-tar=NUM the number of entries per tar file + to limit memory usage [default: 100k] -j, --jobs=NUM the number of jobs to copy per tar file in parallel [default: 4] -s, --size=BYTES assumed bytes per output file [default: 200Mi] @@ -5625,10 +5630,6 @@ Options: (https://docs.python.org/3/library/codecs.html#standard-encodings) [default: utf-8] --bufsize=BYTES buffer size to copy [default: 1Mi] - --workdir=DIR local directory for temporary files - (default: system temporary directory) - --max-entries-per-tar=NUM the number of entries per tar file - to limit memory usage [default: 100k] --progress-unit=TYPE unit for progress - si: SI prefix - bin: Binary prefix From 6fdd92a8df7ad670dd7e05ef341296f27824f6e7 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 18 Sep 2024 23:36:15 +0900 Subject: [PATCH 107/143] gfptar -v --update: report "Not added" --- gftool/gfptar/gfptar | 85 +++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 86faf6575..f86cfae05 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3763,9 +3763,11 @@ class GfptarProgram(Program): logger.debug('Canceled (listdir 2): serial=%d', serial) break logger.debug('listdir: entry.path=%s', entry.path) - if not is_update_target(entry): - continue # skip if self.exclude_match(entry.path): + logger.info(f"Not added (excluded): {entry.path}") + continue # skip + if not is_update_target(entry): + logger.info(f"Not added (not new): {entry.path}") continue # skip # include length of path @@ -5581,78 +5583,81 @@ Limitations: (For the upper limit, see the output of `ulimit -n -H`) Options: - -c, --create=OUTDIR create tar files in OUTDIR from MEMBERs - -r, --append=OUTDIR append files (create new tar files) - -u, --update=OUTDIR append files newer than same entries in tar files + -c, --create=OUTDIR Create tar files in OUTDIR from MEMBERs + -r, --append=OUTDIR Append files (create new tar files) + -u, --update=OUTDIR Append files newer than same entries in tar files (--workdir with sufficient space is required) (Approximately 300MiB is used per 1000000 entries) - --resume=OUTDIR same as --update - -x, --extract=OUTDIR extract all members or specified MEMBERs + --resume=OUTDIR Same as --update + -x, --extract=OUTDIR Extract all members or specified MEMBERs from INDIR to OUTDIR - -t, --list=DIR list the members of DIR - (use with --verbose to see more details) - -C, --basedir=DIR change to directory for MEMBERs [default: .] + -t, --list=DIR List the members of DIR + (--verbose to see more details) + -C, --basedir=DIR Change to directory for MEMBERs [default: .] 
--exclude=PATTERN Exclude files matching wildcard patterns (https://docs.python.org/ja/3/library/fnmatch.html) - -X, --exclude-from=FILE Exclude files matching wildcard patterns - listed in FILE - --workdir=DIR local directory for temporary files + (Ex. --exclude=*.txt) + (Ex. --exclude=*/dirname/*) + (Ex. --exclude=*/??_abcde.pdf) + -X, --exclude-from=FILE Exclude files matching wildcard patterns listed + in FILE + --workdir=DIR Local directory for temporary files (default: system temporary directory) - --max-entries-per-tar=NUM the number of entries per tar file + --max-entries-per-tar=NUM The number of entries per tar file to limit memory usage [default: 100k] - -j, --jobs=NUM the number of jobs to copy per tar file in parallel + -j, --jobs=NUM The number of jobs to copy per tar file in parallel [default: 4] - -s, --size=BYTES assumed bytes per output file [default: 200Mi] - --ratio=RATIO assumed compression ratio (%) [default: 50] - -T, --type=TYPE compression type and tar archive suffix + -s, --size=BYTES Assumed bytes per output file [default: 200Mi] + --ratio=RATIO Assumed compression ratio (%) [default: 50] + -T, --type=TYPE Compression type and tar archive suffix - gz : use gzip (*.tar.gz) - bz2: use bzip2 (*.tar.bz2) - xz : use xz (*.tar.xz) - no : no compression (*.tar) [default: gz] -I, --use-compress-program=COMMAND - filter data through COMMAND, + Filter data through COMMAND, the command must accept -d option for decompression (ex. lz4, lzip, lzop) --gzip-program=COMMAND gzip command (ex. pigz) [default: gzip] --bzip2-program=COMMAND bzip2 command (ex. pbzip2) [default: bzip2] --xz-program=COMMAND xz command [default: xz] - --generate-db regenerate gfptar*_info.db.gz - --same-owner extract files with the same ownership + --generate-db Regenerate gfptar*_info.db.gz + --same-owner Extract files with the same ownership (for euid=0 on local, or gfarmroot on Gfarm) - --disable-gfarm-command disable the use of gfreg and gfexport + --disable-gfarm-command Disable the use of gfreg and gfexport for tar files on gfarm2fs --disable-fsync disable calling fsync() before close() - --gfsched-interval=SEC interval of updating candidate hosts to write + --gfsched-interval=SEC Interval of updating candidate hosts to write (for Gfarm URL only) [default: 120] - --progress-interval=SEC interval of updating progress [default: 1.0] - --encoding=CODEC codec for filename encoding + --progress-interval=SEC Interval of updating progress [default: 1.0] + --encoding=CODEC Codec for filename encoding (https://docs.python.org/3/library/codecs.html#standard-encodings) [default: utf-8] - --bufsize=BYTES buffer size to copy [default: 1Mi] - --progress-unit=TYPE unit for progress + --bufsize=BYTES Buffer size to copy [default: 1Mi] + --progress-unit=TYPE Unit for progress - si: SI prefix - bin: Binary prefix - raw: no conversion [default: si] - --memory=BYTES upper limit of memory size (bytes) + --memory=BYTES Upper limit of memory size (bytes) (default: no limit) (ex. 
2Gi) - --test run short tests (-q option is recommended) - --test-long run long tests (-q option is recommended) - --test-workdir-local=DIR local directory for test [default: /tmp] + --test Run short tests (-q option is recommended) + --test-long Run long tests (-q option is recommended) + --test-workdir-local=DIR Local directory for test [default: /tmp] --test-workdir-gfarm=DIR Gfarm directory for test [default: gfarm:/tmp] - --dummy-num=NUM the number of dummy input entries for --create + --dummy-num=NUM The number of dummy input entries for --create (ignore MEMBER arguments) (create 1000 files and directories per 1 unit) (default: disabled) - --dummy-size-min=BYTES minimum size of dummy files [default: 0] - --dummy-size-max=BYTES maximum size of dummy files [default: 0] - --debug-sleep=SEC sleep time per file for debug [default: 0.0] - --dry-run not create output files - -q, --quiet quiet messages - -v, --verbose verbose output - -d, --debug debug mode - -?, -h, --help show this help and exit + --dummy-size-min=BYTES Minimum size of dummy files [default: 0] + --dummy-size-max=BYTES Maximum size of dummy files [default: 0] + --debug-sleep=SEC Sleep time per file for debug [default: 0.0] + --dry-run Not create output files + -q, --quiet Quiet messages + -v, --verbose Verbose output + -d, --debug Debug mode + -?, -h, --help Show this help and exit Usage: {f} [options] [--exclude=PATTERN]... -c OUTDIR [-C DIR] [--] MEMBER... From b4f5b7bfbc5217ba61f6d8476ca0d8204fd3943e Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 19 Sep 2024 13:21:08 +0900 Subject: [PATCH 108/143] gfptar --update: change the format of progress --- gftool/gfptar/gfptar | 49 ++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f86cfae05..028d994f3 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3597,6 +3597,8 @@ class GfptarProgram(Program): self.archived_size = 0 self.stored_size = 0 self.stored_num = 0 + self.selected_size = 0 + self.selected_num = 0 self.total_size = 0 self.total_num = 0 self.start_time = time.time() @@ -3762,7 +3764,21 @@ class GfptarProgram(Program): if self.is_canceled(): logger.debug('Canceled (listdir 2): serial=%d', serial) break + + if self.progress_enabled: + now = time.time() + with self.lock(): + if now >= self.next_time: + self.next_time = now + self.progress_interval + self.progress_for_create(now) + logger.debug('listdir: entry.path=%s', entry.path) + # include length of path + this_size = entry.size_all() + with self.lock(): # for progress + self.total_size += this_size + self.total_num += 1 + if self.exclude_match(entry.path): logger.info(f"Not added (excluded): {entry.path}") continue # skip @@ -3770,11 +3786,9 @@ class GfptarProgram(Program): logger.info(f"Not added (not new): {entry.path}") continue # skip - # include length of path - this_size = entry.size_all() with self.lock(): # for progress - self.total_size += this_size - self.total_num += 1 + self.selected_size += this_size + self.selected_num += 1 if filelist_num > 0 \ and (filelist_size + this_size > self.split_size @@ -3800,13 +3814,6 @@ class GfptarProgram(Program): filelist_num += 1 filelist_size += this_size - # progress for listing before starting threads - if serial == 1 and self.progress_enabled: - now = time.time() - if now >= self.next_time: - self.next_time = now + self.progress_interval - self.progress_for_create(now) - if has_error is not None: break # from listdir_switch() except 
MemoryError as e2: @@ -3858,12 +3865,12 @@ class GfptarProgram(Program): print('compression ratio: %.2f%% (%d/%d)' % (100 * self.archived_size / self.stored_size, self.archived_size, self.stored_size)) + if self.selected_num == 0: + print('No files were updated.') if has_error is not None: raise has_error if self.is_canceled(): raise self.error_canceled() - if self.total_num == 0: - print('No files were updated.') def create_job_init(self): if self.MT_enabled(): @@ -4929,12 +4936,12 @@ class GfptarProgram(Program): if self.listing: percent_str = '?' else: - if self.total_num > 0: - percent1 = self.stored_num * 100 / self.total_num + if self.selected_num > 0: + percent1 = self.stored_num * 100 / self.selected_num else: percent1 = 0 - if self.total_size > 0: - percent2 = self.stored_size * 100 / self.total_size + if self.selected_size > 0: + percent2 = self.stored_size * 100 / self.selected_size percent = (percent1 + percent2) / 2 else: percent = percent1 @@ -4946,14 +4953,16 @@ class GfptarProgram(Program): bytes_per_sec = 0 ent_per_sec = 0 stored_num_str = self._humanize(self.stored_num) + sel_num_str = self._humanize(self.selected_num) total_num_str = self._humanize(self.total_num) stored_size_str = self._humanize(self.stored_size) - total_size_str = self._humanize(self.total_size) + sel_size_str = self._humanize(self.selected_size) + # total_size_str = self._humanize(self.total_size) bytes_per_sec_str = self._humanize(bytes_per_sec) ent_per_sec_str = self._humanize(ent_per_sec) sys.stdout.write(f"\r{self.cmd_name}: {percent_str}% " - f"{stored_size_str}/{total_size_str}B " - f"{stored_num_str}/{total_num_str}Ent " + f"{stored_size_str}/{sel_size_str}B " + f"{stored_num_str}/{sel_num_str}/{total_num_str}Ent " f"{sec_str} " f"{bytes_per_sec_str}B/s " f"{ent_per_sec_str}Ent/s") From 8e6f1eb92dd73aa2b6f5fbd7f25cdfaab9344fcd Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 19 Sep 2024 19:02:18 +0900 Subject: [PATCH 109/143] gfptar: rename *.tmp on gfarm2fs when outdir is gfarm2fs --- gftool/gfptar/gfptar | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 028d994f3..2db0117db 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4284,20 +4284,27 @@ class GfptarProgram(Program): offset += 1 # ex.: home/user1/dir -> home_user1_dir outname = prefix_str + outname.replace('/', '_') - outname_path = self.outdir_url.url_join(outname) - outname_path_tmp = outname_path + self.TMP_SUFFIX - outurl_tmp = GfURL.init(outname_path_tmp, - use_gfarm_command=self.use_gfarm_command, - dry_run=self.dry_run) - target_host = self.select_a_target_host(outurl_tmp, serial) + out_urlstr = self.outdir_url.url_join(outname) + tmp_urlstr = out_urlstr + self.TMP_SUFFIX + tmp_url = GfURL.init(tmp_urlstr, + use_gfarm_command=self.use_gfarm_command, + dry_run=self.dry_run) + if tmp_url.is_gfarm(): + # direct access when outdir is gfarm2fs + tmp_url_direct = GfURL.init(tmp_urlstr, + use_gfarm_command=False, + dry_run=self.dry_run) + else: + tmp_url_direct = tmp_url + target_host = self.select_a_target_host(tmp_url, serial) def remove(gfurl): if gfurl.exists(): gfurl.remove() - atexit.register(remove, outurl_tmp) - remove(outurl_tmp) - tar_tmp = GfTarFile.create_open(outurl_tmp, self.compress_type, + atexit.register(remove, tmp_url_direct) + remove(tmp_url_direct) + tar_tmp = GfTarFile.create_open(tmp_url, self.compress_type, self.bufsize, use_fsync=self.use_fsync, target_host=target_host, @@ -4392,18 
+4399,18 @@ class GfptarProgram(Program): if self.dry_run: tar_size = 1024 ** 2 # no particular reason else: - tar_size = outurl_tmp.size() - outurl_tmp.rename(outname_path) + tar_size = tmp_url.size() + tmp_url_direct.rename(out_urlstr) atexit.unregister(remove) - out_db_path = self.outdir_url.url_join(db_name) - InfoDB.compress_copy(infodb.db.filename, out_db_path, + dbgz_urlstr = self.outdir_url.url_join(db_name) + InfoDB.compress_copy(infodb.db.filename, dbgz_urlstr, self.bufsize, self.use_fsync, self.dry_run) # for DEBUG # raise Exception('unexpected raise') - return tar_size, cannot_be_archived, outname_path, out_db_path + return tar_size, cannot_be_archived, out_urlstr, dbgz_urlstr def error_canceled(self): return GfptarError('Canceled') @@ -5655,6 +5662,7 @@ Options: --test-long Run long tests (-q option is recommended) --test-workdir-local=DIR Local directory for test [default: /tmp] --test-workdir-gfarm=DIR Gfarm directory for test [default: gfarm:/tmp] + (A path in gfarm2fs can be specified) --dummy-num=NUM The number of dummy input entries for --create (ignore MEMBER arguments) (create 1000 files and directories per 1 unit) From 2530cd0dac04cf61d601c208bd3c139914e229bb Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 20 Sep 2024 14:52:19 +0900 Subject: [PATCH 110/143] gfptar: fix a bug of not reporting cannot_be_archived --- gftool/gfptar/gfptar | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 2db0117db..0036336b8 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2477,6 +2477,7 @@ class CannotBeArchivedError(GfptarError): msg += 'directories' else: msg += 'directory' + msg += '(readdir)' self.message = f"{msg} cannot be archived" super().__init__(self.message) @@ -3608,7 +3609,7 @@ class GfptarProgram(Program): self.gfsched_init() self.cannot_be_archived = 0 - self.create_job_init() # before creating threads + self.create_job_init() # before starting threads self.fattr_dict_list = [] def cmd_create(self, outdir, basedir, infiles): @@ -3890,6 +3891,7 @@ class GfptarProgram(Program): class Started(): pass + # create Process()es before starting threads self.worker_ident_dict = {} for i in range(self.max_workers): inq = multiprocessing.Queue() @@ -4138,7 +4140,7 @@ class GfptarProgram(Program): msg = result[0] if msg == self.MSG_CHILD_PROGRESS: logger.debug('MSG_CHILD_PROGRESS') - status, num, size, path_list = result + _msg, num, size, path_list = result with self.lock(): for entry_path in path_list: self.info('archived: {}', entry_path) @@ -4151,15 +4153,16 @@ class GfptarProgram(Program): + self.progress_interval self.progress_for_create(now) elif msg == self.MSG_CHILD_DONE: - (status, tar_size, cannot_be_archived, + (_msg, tar_size, cannot_be_archived, out_tar_path, out_dbgz_path) = result with self.lock(): self.archived_size += tar_size + self.cannot_be_archived += cannot_be_archived self.info('created({}): {}', self.suffix, out_tar_path) self.info('created(.db.gz): {}', out_dbgz_path) break elif msg == self.MSG_CHILD_ERROR: - (status, exc_type_name, exc_value_str, + (_msg, exc_type_name, exc_value_str, exc_traceback_str) = result inq.put(self.MSG_PARENT_ERROR_COMPLETE) if self.verbose or self.debug: From ebc6701ac7ebc64ed3af4c4ad81e9603f5f00629 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 25 Sep 2024 21:41:38 +0900 Subject: [PATCH 111/143] gfptar --extract,--append,--update: select --type automatically --- gftool/gfptar/gfptar | 149 
++++++++++++++++++++++++++++--------------- 1 file changed, 96 insertions(+), 53 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 0036336b8..525a7b784 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2035,10 +2035,13 @@ class Compress: TYPE_GZIP = 'gz' TYPE_BZIP2 = 'bz2' TYPE_XZ = 'xz' + compress_type = None + prog_for_create = None + prog_for_extract = None @classmethod - def set_default(cls, compress_type, compress_prog, - gzip_prog, bzip2_prog, xz_prog): + def init(cls, compress_type, compress_prog, + gzip_prog, bzip2_prog, xz_prog): if gzip_prog is not None: cls.gzip_prog = shutil.which(gzip_prog) else: @@ -2058,22 +2061,39 @@ class Compress: if compress_prog0 is None: raise GfptarError(f"{compress_prog}: command not found.") compress_prog = compress_prog0 + cls.prog_for_create = compress_prog + cls.prog_for_extract = compress_prog else: + # auto selection if compress_type == cls.TYPE_GZIP: compress_prog = cls.gzip_prog elif compress_type == cls.TYPE_BZIP2: compress_prog = cls.bzip2_prog elif compress_type == cls.TYPE_XZ: compress_prog = cls.xz_prog - # else: - # compress_prog = None - cls.compress_prog = compress_prog # may be None + else: + compress_prog = None + cls.prog_for_create = compress_prog + cls.prog_for_extract = None # use get_compress_prog() + logger.debug('compress_type=%s', cls.compress_type) - logger.debug('compress_prog=%s', cls.compress_prog) + logger.debug('prog_for_create=%s', cls.prog_for_create) + logger.debug('prog_for_extract=%s', cls.prog_for_extract) logger.debug('gzip_prog=%s', cls.gzip_prog) logger.debug('bzip2_prog=%s', cls.bzip2_prog) logger.debug('xz_prog=%s', cls.xz_prog) + @classmethod + def get_compress_prog(cls, name): + if name.endswith('.gz'): + return cls.gzip_prog + elif name.endswith('.bz2'): + return cls.bzip2_prog + elif name.endswith('.xz'): + return cls.xz_prog + else: + return None + @classmethod def compress(cls, compress_prog, outf): args = [compress_prog] @@ -2101,12 +2121,16 @@ class GfTarFile(tarfile.TarFile): @classmethod def extract_open(cls, gfurl, copybufsize, compress_prog=None): if compress_prog is None: - compress_prog = Compress.compress_prog + compress_prog = Compress.prog_for_extract + if compress_prog is None: + compress_prog = Compress.get_compress_prog(gfurl.url_str) + logger.debug('extract_open: compress_prog=%s', compress_prog) # use Stream (not seekable) - if compress_prog is not None: - openmode = 'r|' - else: + if compress_prog is None \ + and Compress.compress_type != Compress.TYPE_NO: openmode = 'r|*' # any type (gz, bz2, xz) + else: + openmode = 'r|' if not gfurl.exists(): raise FileNotFoundError(gfurl.url_str) # list of tuple(proc, closeable obj, synchronizable obj) @@ -2148,15 +2172,15 @@ class GfTarFile(tarfile.TarFile): return tar @classmethod - def create_open(cls, gfurl, compress_type, copybufsize, compress_prog=None, + def create_open(cls, gfurl, copybufsize, compress_prog=None, use_fsync=True, target_host=None, dummy_input=False): if compress_prog is None: - compress_prog = Compress.compress_prog + compress_prog = Compress.prog_for_create # use Stream (not seekable) if compress_prog is None \ - and compress_type != Compress.TYPE_NO: - openmode = 'w|' + compress_type + and Compress.compress_type != Compress.TYPE_NO: + openmode = 'w|' + Compress.compress_type else: openmode = 'w|' if gfurl.exists(): @@ -2566,8 +2590,8 @@ class GfptarProgram(Program): self.gzip_prog = self.opt['--gzip-program'] self.xz_prog = self.opt['--xz-program'] self.bzip2_prog = 
self.opt['--bzip2-program'] - Compress.set_default(self.compress_type, self.compress_prog, - self.gzip_prog, self.bzip2_prog, self.xz_prog) + Compress.init(self.compress_type, self.compress_prog, + self.gzip_prog, self.bzip2_prog, self.xz_prog) if Compress.gzip_prog is None: raise GfException('gzip: command not found. Please install gzip.') @@ -2777,6 +2801,8 @@ class GfptarProgram(Program): save_opt_gendb = self.opt['--generate-db'] save_opt_verbose = self.opt['--verbose'] + self.opt['--use-compress-program'] = None + # create tar per one entry self.opt['--size'] = 0 @@ -2823,10 +2849,10 @@ class GfptarProgram(Program): 'pigz': 'gz', 'bzip2': 'bz2', 'xz': 'xz', - # 'pbzip2': 'bz2', - # 'lzip': 'lz', - # 'lzop': 'lzo', - # 'lz4': 'lz4', + 'pbzip2': 'bz2', + 'lzip': 'lz', + 'lzop': 'lzo', + 'lz4': 'lz4', } else: pattern_compress_prog = { @@ -3096,7 +3122,7 @@ class GfptarProgram(Program): # error try: self.cmd_extract(test6_dir_local, test3_dir_local, ['abcde']) - logger.error_exit(1, testname + '(not found in archive files) ' + + logger.error_exit(1, testname + '(not found in tar files) ' + '... FAIL (unexpected success)') except GfptarError as e: logger.info(f"Expected error, ignored: {str(e)}") @@ -3588,13 +3614,6 @@ class GfptarProgram(Program): self.dummy_size_min = self.opt['--dummy-size-min'] self.dummy_size_max = self.opt['--dummy-size-max'] - if self.compress_type == Compress.TYPE_NO: - self.split_size = self.assumed_size - self.suffix = '.tar' - else: - self.split_size = self.assumed_size * 100 / self.ratio - self.suffix = '.tar.' + self.compress_type - self.archived_size = 0 self.stored_size = 0 self.stored_num = 0 @@ -3620,7 +3639,24 @@ class GfptarProgram(Program): self.cmd_create_common(serial, gen, infiles) def error_type_mismatch(self, tar_path): - return GfptarError(f"Tar compression type mismatch: {tar_path}") + return GfptarError(f"Compression type mismatch: {tar_path}") + + def suffix_check(self, suffix, tar_path): + if suffix is None: + suffix = os.path.splitext(tar_path)[1] + dottype = suffix.split('.') + if len(dottype) == 2: + self.compress_type = dottype[1] + if self.compress_type == 'tar': + self.compress_type = Compress.TYPE_NO + if self.progress_enabled: + logger.info('') + logger.info(f"recognized type is {self.compress_type}") + Compress.init(self.compress_type, self.compress_prog, + self.gzip_prog, self.bzip2_prog, self.xz_prog) + elif not tar_path.endswith(suffix): + raise self.error_type_mismatch(tar_path) + return suffix def cmd_append(self, outdir, basedir, infiles): self.cmd_create_init('append', outdir, basedir) @@ -3628,15 +3664,14 @@ class GfptarProgram(Program): raise FileNotFoundError(outdir) max_serial = 0 max_gen = 0 - suffix = '.' + self.compress_type + suffix = None tar_list, dbgz_list = self.list_tar_list_dbgz_from_dir(self.outdir_url) for serial, gen, tar_path in tar_list: - if not tar_path.endswith(suffix): - raise self.error_type_mismatch(tar_path) if serial > max_serial: max_serial = serial if gen > max_gen: max_gen = gen + suffix = self.suffix_check(suffix, tar_path) self.cmd_create_common(max_serial + 1, max_gen + 1, infiles) def cmd_update(self, outdir, basedir, infiles): @@ -3646,14 +3681,12 @@ class GfptarProgram(Program): broken_count = 0 max_serial = 0 max_gen = 0 - suffix = '.' 
+ self.compress_type + suffix = None for serial, gen, tar_path, dbgz_path, db_path in \ self.list_tar_dbgz_db(self.outdir_url, sort=True, progress=self.progress_enabled): if self.is_canceled(): raise self.error_canceled() - if not tar_path.endswith(suffix): - raise self.error_type_mismatch(tar_path) if serial > max_serial: max_serial = serial if gen > max_gen: @@ -3661,6 +3694,7 @@ class GfptarProgram(Program): if dbgz_path is None or tar_path is None: broken_count += 1 continue + suffix = self.suffix_check(suffix, tar_path) db = DB(db_path) fattr_dict = DBDict(db, FileAttr2, InfoDB.TABLE_ENTRY) self.fattr_dict_list.append(fattr_dict) @@ -3669,9 +3703,16 @@ class GfptarProgram(Program): self.cmd_create_common(max_serial + 1, max_gen + 1, infiles) if broken_count > 0: raise GfptarError("Some .tar.* or .db.gz are broken:" - f" error={broken_count}") + f" num={broken_count}") def cmd_create_common(self, serial, gen, infiles): + if self.compress_type == Compress.TYPE_NO: + self.split_size = self.assumed_size + self.suffix = '.tar' + else: + self.split_size = self.assumed_size * 100 / self.ratio + self.suffix = '.tar.' + self.compress_type + infiles_checked = set() for infile in infiles: # infile_url = GfURL.init(infile) @@ -4307,8 +4348,7 @@ class GfptarProgram(Program): atexit.register(remove, tmp_url_direct) remove(tmp_url_direct) - tar_tmp = GfTarFile.create_open(tmp_url, self.compress_type, - self.bufsize, + tar_tmp = GfTarFile.create_open(tmp_url, self.bufsize, use_fsync=self.use_fsync, target_host=target_host, dummy_input=self.dummy_input) @@ -4583,7 +4623,7 @@ class GfptarProgram(Program): logger.debug('check member_check_dict: %s, %s', member, found) if not found: raise GfptarError('The specified file is not found' - ' in archive files: ' + member) + ' in tar files: ' + member) member_check_dict.clear() archive_dict.clear() return gen_tarlist_dict, directory_set, member_set, error_num @@ -4621,7 +4661,7 @@ class GfptarProgram(Program): self.db.unlink() if error_num > 0: raise GfptarError("Some .tar.* or .db.gz are broken:" - f" error={error_num}") + f" num={error_num}") def extract_directories(self, directory_set): logger.debug('extract_directories') @@ -5089,7 +5129,7 @@ class GfptarProgram(Program): os.remove(db_path) if error_num > 0: raise GfptarError("Some .tar.* or .db.gz are broken:" - f" error={error_num}") + f" num={error_num}") def sort_tar_or_db_list(self, tar_or_db_list): def id_key(value): @@ -5121,7 +5161,7 @@ class GfptarProgram(Program): if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst if self.progress_enabled: logger.warning('') - logger.warning(f"{ent.path}:" + logger.warning(f"NOTICE: {ent.path}:" " This file (*.lst) is no longer required." " Please remove it.") continue @@ -5606,7 +5646,7 @@ Options: -r, --append=OUTDIR Append files (create new tar files) -u, --update=OUTDIR Append files newer than same entries in tar files (--workdir with sufficient space is required) - (Approximately 300MiB is used per 1000000 entries) + (About 300MiB is used per 1000000 entries) --resume=OUTDIR Same as --update -x, --extract=OUTDIR Extract all members or specified MEMBERs from INDIR to OUTDIR @@ -5615,30 +5655,33 @@ Options: -C, --basedir=DIR Change to directory for MEMBERs [default: .] --exclude=PATTERN Exclude files matching wildcard patterns (https://docs.python.org/ja/3/library/fnmatch.html) - (Ex. --exclude=*.txt) - (Ex. --exclude=*/dirname/*) - (Ex. --exclude=*/??_abcde.pdf) + (ex. --exclude=*.txt) + (ex. --exclude=*/dirname/*) + (ex. 
--exclude=*/??_abcde.pdf) -X, --exclude-from=FILE Exclude files matching wildcard patterns listed in FILE --workdir=DIR Local directory for temporary files (default: system temporary directory) - --max-entries-per-tar=NUM The number of entries per tar file - to limit memory usage [default: 100k] -j, --jobs=NUM The number of jobs to copy per tar file in parallel [default: 4] - -s, --size=BYTES Assumed bytes per output file [default: 200Mi] + -s, --size=BYTES Maximum assumed size per output file + [default: 200Mi] --ratio=RATIO Assumed compression ratio (%) [default: 50] - -T, --type=TYPE Compression type and tar archive suffix + --max-entries-per-tar=NUM The maximum number of entries per tar file + to limit memory usage [default: 100k] + -T, --type=TYPE Compression type and tar file suffix - gz : use gzip (*.tar.gz) - bz2: use bzip2 (*.tar.bz2) - xz : use xz (*.tar.xz) - no : no compression (*.tar) + - (other): --use-compress-program is required [default: gz] -I, --use-compress-program=COMMAND - Filter data through COMMAND, - the command must accept -d option for decompression + Command to compress or decompress tar files + when using --type other than gz, bz2 or xz + (The command must accept -d option to decompress) (ex. lz4, lzip, lzop) - --gzip-program=COMMAND gzip command (ex. pigz) [default: gzip] + --gzip-program=COMMAND gzip command (ex. pigz) [default: gzip] --bzip2-program=COMMAND bzip2 command (ex. pbzip2) [default: bzip2] --xz-program=COMMAND xz command [default: xz] --generate-db Regenerate gfptar*_info.db.gz From 04bf4456560c7f884df9adb345bd894e627881a1 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 25 Sep 2024 23:25:09 +0900 Subject: [PATCH 112/143] gfptar: about XZ_OPT --- gftool/gfptar/gfptar | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 525a7b784..e842065fd 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5683,7 +5683,8 @@ Options: (ex. lz4, lzip, lzop) --gzip-program=COMMAND gzip command (ex. pigz) [default: gzip] --bzip2-program=COMMAND bzip2 command (ex. pbzip2) [default: bzip2] - --xz-program=COMMAND xz command [default: xz] + --xz-program=COMMAND xz command (ex. environment XZ_OPT="-T4" for speed) + [default: xz] --generate-db Regenerate gfptar*_info.db.gz --same-owner Extract files with the same ownership (for euid=0 on local, or gfarmroot on Gfarm) From 4fdd8f192c1108150b8d7c77dfd00fd0c2fde26f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 25 Sep 2024 23:28:37 +0900 Subject: [PATCH 113/143] gfptar: show tempfile.gettempdir() in help --- gftool/gfptar/gfptar | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index e842065fd..1db4c934a 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5661,7 +5661,7 @@ Options: -X, --exclude-from=FILE Exclude files matching wildcard patterns listed in FILE --workdir=DIR Local directory for temporary files - (default: system temporary directory) + (default: {tempdir}) -j, --jobs=NUM The number of jobs to copy per tar file in parallel [default: 4] -s, --size=BYTES Maximum assumed size per output file @@ -5734,7 +5734,7 @@ Usage: {f} [options] --test -C DIR MEMBER... 
{f} [options] --test-long {f} -h | --help -""".format(f=progname) +""".format(f=progname, tempdir=tempfile.gettempdir()) _schema = Schema({ From 4c3c612d8f0b8028e697955fe2d2e1a791e619a0 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Wed, 25 Sep 2024 23:55:10 +0900 Subject: [PATCH 114/143] gfptar: not use atexit in child processes --- gftool/gfptar/gfptar | 45 ++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 1db4c934a..4e07536d4 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4340,14 +4340,26 @@ class GfptarProgram(Program): dry_run=self.dry_run) else: tmp_url_direct = tmp_url - target_host = self.select_a_target_host(tmp_url, serial) def remove(gfurl): if gfurl.exists(): gfurl.remove() - atexit.register(remove, tmp_url_direct) remove(tmp_url_direct) + try: + return self.create_a_tar_process2(gen, serial, tardb, + input_queue, output_queue, + tmp_url, tmp_url_direct, + filelist, out_urlstr) + except Exception: + remove(tmp_url_direct) + raise + + def create_a_tar_process2(self, gen, serial, tardb, + input_queue, output_queue, + tmp_url, tmp_url_direct, + filelist, out_urlstr): + target_host = self.select_a_target_host(tmp_url, serial) tar_tmp = GfTarFile.create_open(tmp_url, self.bufsize, use_fsync=self.use_fsync, target_host=target_host, @@ -4444,7 +4456,6 @@ class GfptarProgram(Program): else: tar_size = tmp_url.size() tmp_url_direct.rename(out_urlstr) - atexit.unregister(remove) dbgz_urlstr = self.outdir_url.url_join(db_name) InfoDB.compress_copy(infodb.db.filename, dbgz_urlstr, @@ -5419,20 +5430,22 @@ class InfoDB: gfurl.remove() remove(dbgz_url_tmp) - atexit.register(remove, dbgz_url_tmp) - with dbgz_url_tmp.writeopen(use_fsync=use_fsync) as outf: - proc = Compress.compress(Compress.gzip_prog, outf) - with db_url.readopen() as inf: - shutil.copyfileobj(inf, proc.stdin, bufsize) - proc.stdin.close() - ret = proc.wait() - if ret != 0: - raise GfException('{}: returncode={}'.format( - ' '.join(proc.args), ret)) + try: + with dbgz_url_tmp.writeopen(use_fsync=use_fsync) as outf: + proc = Compress.compress(Compress.gzip_prog, outf) + with db_url.readopen() as inf: + shutil.copyfileobj(inf, proc.stdin, bufsize) + proc.stdin.close() + ret = proc.wait() + if ret != 0: + raise GfException('{}: returncode={}'.format( + ' '.join(proc.args), ret)) + # atomic operation to avoid leaving junk files + dbgz_url_tmp.rename(out_dbgz_path) + except Exception: + remove(dbgz_url_tmp) + raise - # atomic operation to avoid leaving junk files - dbgz_url_tmp.rename(out_dbgz_path) - atexit.unregister(remove) logger.debug('created(.db.gz): %s', out_dbgz_path) if move: db_url.remove() From 021c24476fae8123a465ee45a5859d77d735dc5c Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 26 Sep 2024 01:13:01 +0900 Subject: [PATCH 115/143] gfptar: wording --- gftool/gfptar/gfptar | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 4e07536d4..50fc1947e 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5173,7 +5173,7 @@ class GfptarProgram(Program): if self.progress_enabled: logger.warning('') logger.warning(f"NOTICE: {ent.path}:" - " This file (*.lst) is no longer required." + " This file (*.lst) is no longer needed." " Please remove it.") continue if not tar_pattern.match(base): # ignore not *.tar.* @@ -5616,13 +5616,13 @@ Example of --create (Gfarm to Gfarm): ... 
gfarm:/home/user1/out/0010_dir_test9000.data..dir_test9999.data.tar.gz gfarm:/home/user1/out/g1_0010_gfptar.db.gz - Contents of gMM_NN_gfptar.db.gz file (sqlite3 and gzip): + Contents of gMM_NN_gfptar.db.gz file (SQLite3 and gzip): MM: the generation number for each append operation NN: the serial number table 'path_entry': map of path name to JSON string - json.dumps([ int(file_mode), int(mtime), - int(user_unique_id), int(group_unique_id), - int(size), symlink_path, file_type(D,F,S) ] + json.dumps([ file_mode (int), mtime (int), + user_unique_id (int), group_unique_id (int), + size (int), symlink_path, file_type (D,F,S) ] file_type 'D': directory file_type 'F': file file_type 'S': symbolic link @@ -5680,8 +5680,11 @@ Options: -s, --size=BYTES Maximum assumed size per output file [default: 200Mi] --ratio=RATIO Assumed compression ratio (%) [default: 50] - --max-entries-per-tar=NUM The maximum number of entries per tar file - to limit memory usage [default: 100k] + --max-entries-per-tar=NUM + The maximum number of entries per tar file + (Too many entries in a tar may use a lot of memory, + and may work slowly) + [default: 100k] -T, --type=TYPE Compression type and tar file suffix - gz : use gzip (*.tar.gz) - bz2: use bzip2 (*.tar.bz2) From e3aaf416694132086cff63e1a826107a025712b9 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 26 Sep 2024 01:48:16 +0900 Subject: [PATCH 116/143] gfptar: clarify --- gftool/gfptar/gfptar | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 50fc1947e..3a1a8d334 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2549,7 +2549,7 @@ class GfptarProgram(Program): self.max_workers = 1 self.bufsize = self.opt['--bufsize'] self.use_fsync = not self.opt['--disable-fsync'] - self.sync_infodb = self.opt['--generate-db'] + self.gen_db = self.opt['--generate-db'] self.workdir = self.opt['--workdir'] euid = os.geteuid() @@ -4957,7 +4957,7 @@ class GfptarProgram(Program): f"{sec_str} " f"{ent_per_sec_str}Ent/s") - def progress_for_sync_infodb(self, now): + def progress_for_infodb(self, now): sec = now - self.start_time sec_str = format_seconds(sec, minhour=True) current_tar_num_str = self._humanize(self.current_tar_num) @@ -4975,7 +4975,7 @@ class GfptarProgram(Program): else: ent_per_sec = 0 ent_per_sec_str = self._humanize(ent_per_sec) - if self.sync_infodb: + if self.gen_db: name = 'generate-db' sys.stdout.write(f"\r{name}: " f"{current_tar_num_str}/{total_tar_num_str}(DB) " @@ -4987,8 +4987,7 @@ class GfptarProgram(Program): name = 'load-db' sys.stdout.write(f"\r{name}: " f"{current_tar_num_str}/{total_tar_num_str}(DB) " - f"{sec_str} " - f"{ent_per_sec_str}Ent/s") + f"{sec_str} ") # lock required def progress_for_create(self, now): @@ -5282,7 +5281,7 @@ class GfptarProgram(Program): self.current_size = 0 if progress: now = time.time() - self.progress_for_sync_infodb(now) + self.progress_for_infodb(now) def result_one(one): future, gen, serial, tar_path, dbgz_path = one @@ -5293,7 +5292,7 @@ class GfptarProgram(Program): self.current_tar_num += 1 if progress: now = time.time() - self.progress_for_sync_infodb(now) + self.progress_for_infodb(now) yield serial, gen, tar_path, dbgz_path, db_path except Exception as e: # logger.error(f"{e}") @@ -5327,11 +5326,11 @@ class GfptarProgram(Program): yield serial, gen, tar_path, dbgz_path, db_path continue if dbgz_path is None: - update = True + gen_db = True else: - update = self.sync_infodb + gen_db = 
self.gen_db arg = (lock, share_cancel, self.tmpdir.name, - update, gen, serial, tar_path, + gen_db, gen, serial, tar_path, self.bufsize, self.use_fsync, self.dry_run) # InfoDB.generate_db_and_dbgz is classmethod, # because ProcessPoolExecutor cannot serialize @@ -5346,7 +5345,7 @@ class GfptarProgram(Program): if progress: now = time.time() - self.progress_for_sync_infodb(now) + self.progress_for_infodb(now) sys.stdout.write('\n') if self.save_e: raise self.save_e @@ -5464,8 +5463,8 @@ class InfoDB: proc.stdout.close() ret = proc.wait() if ret != 0: - raise GfException('{}: returncode={}'.format( - ' '.join(proc.args), ret)) + raise GfException('{} (from {} to {}): returncode={}'.format( + ' '.join(proc.args), in_dbgz_path, out_db_path, ret)) return db_url signal_initialized = False @@ -5490,7 +5489,7 @@ class InfoDB: # return db_path, dbgz_path, ent_num, size @classmethod def generate_db_and_dbgz(cls, args): - (lock, share_cancel, db_dir, update, gen, serial, + (lock, share_cancel, db_dir, gen_db, gen, serial, tar_path, bufsize, use_fsync, dry_run) = args if share_cancel.value != 0: # logger.debug('Canceled (2)') @@ -5508,9 +5507,9 @@ class InfoDB: num = 0 size = 0 - if not update and dbgz_url.exists(): - logger.debug('not update, decompress dbgz: %s', dbgz_path) - use_fsync2 = False # workdir + if not gen_db and dbgz_url.exists(): + logger.debug('not generate, decompress dbgz: %s', dbgz_path) + use_fsync2 = False # disable fsync for workdir InfoDB.decompress_copy(dbgz_path, db_path, bufsize, use_fsync2) # NOTE: slow # infodb = InfoDB(db_path) From bea848166c32980795d804149489554fd3f4e7a0 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 26 Sep 2024 02:05:40 +0900 Subject: [PATCH 117/143] gfptar: change the progress format --- gftool/gfptar/gfptar | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 3a1a8d334..7bc2d52d9 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4947,15 +4947,15 @@ class GfptarProgram(Program): selected_size_str = self._humanize(self.selected_size) sys.stdout.write(f"\r{name}: " f"{selected_size_str}/{total_size_str}B " - f"{selected_num_str}/{total_num_str}Ent " + f"{selected_num_str}/{total_num_str}E " f"{sec_str} " - f"{ent_per_sec_str}Ent/s") + f"{ent_per_sec_str}E/s") else: sys.stdout.write(f"\r{name}: " f"{total_size_str}B " - f"{total_num_str}Ent " + f"{total_num_str}E " f"{sec_str} " - f"{ent_per_sec_str}Ent/s") + f"{ent_per_sec_str}E/s") def progress_for_infodb(self, now): sec = now - self.start_time @@ -4980,9 +4980,9 @@ class GfptarProgram(Program): sys.stdout.write(f"\r{name}: " f"{current_tar_num_str}/{total_tar_num_str}(DB) " f"{current_size_str}B " - f"{current_ent_num_str}Ent " + f"{current_ent_num_str}E " f"{sec_str} " - f"{ent_per_sec_str}Ent/s") + f"{ent_per_sec_str}E/s") else: name = 'load-db' sys.stdout.write(f"\r{name}: " @@ -5022,10 +5022,10 @@ class GfptarProgram(Program): ent_per_sec_str = self._humanize(ent_per_sec) sys.stdout.write(f"\r{self.cmd_name}: {percent_str}% " f"{stored_size_str}/{sel_size_str}B " - f"{stored_num_str}/{sel_num_str}/{total_num_str}Ent " + f"{stored_num_str}/{sel_num_str}/{total_num_str}E " f"{sec_str} " f"{bytes_per_sec_str}B/s " - f"{ent_per_sec_str}Ent/s") + f"{ent_per_sec_str}E/s") # lock required def progress_for_extract(self, now): @@ -5056,10 +5056,10 @@ class GfptarProgram(Program): name = "extract" sys.stdout.write(f"\r{name}: {percent_str}% " f"{extracted_size_str}/{total_size_str}B " - 
f"{extracted_num_str}/{total_num_str}Ent " + f"{extracted_num_str}/{total_num_str}E " f"{sec_str} " f"{bytes_per_sec_str}B/s " - f"{ent_per_sec_str}Ent/s") + f"{ent_per_sec_str}E/s") def cmd_list_simple(self, indir): return self.cmd_list(indir, verbose=False) From fc8b3a33193272b7e61206c2f3b35c510ec715fb Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 26 Sep 2024 02:48:33 +0900 Subject: [PATCH 118/143] gfptar: change the progress format --- gftool/gfptar/gfptar | 52 ++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 7bc2d52d9..be728faf7 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -75,8 +75,8 @@ def humanize_number(num, binary_prefix=False): else: # SI prefix units = ['', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'] base = Decimal(1000) - if num < base: - return str(int(num)) + if num < base and isinstance(num, int): + return str(num) n = Decimal(num) ulen = len(units) - 1 scale = 0 @@ -84,11 +84,12 @@ def humanize_number(num, binary_prefix=False): n /= base scale += 1 if n < 10: - d = n.quantize(Decimal('0.000'), rounding=ROUND_DOWN) - elif n < 100: d = n.quantize(Decimal('0.00'), rounding=ROUND_DOWN) - elif n < 1000: + elif n < 100: d = n.quantize(Decimal('0.0'), rounding=ROUND_DOWN) + elif n < 1000: + d = n.quantize(Decimal('0'), rounding=ROUND_DOWN) + d = f" {d}" else: d = n.quantize(Decimal('0'), rounding=ROUND_DOWN) return f"{d}{units[scale]}" @@ -2326,16 +2327,20 @@ class TestGfptar(unittest.TestCase): return suite def test_humanize(self): - self.assertEqual(humanize_number(0.1), '0') - self.assertEqual(humanize_number(1.9), '1') + self.assertEqual(humanize_number(0.001), '0.00') + self.assertEqual(humanize_number(0.1), '0.10') + self.assertEqual(humanize_number(10.11), '10.1') + self.assertEqual(humanize_number(100.11), ' 100') self.assertEqual(humanize_number(1023, binary_prefix=True), '1023') - self.assertEqual(humanize_number(1024, binary_prefix=True), '1.000Ki') + self.assertEqual(humanize_number(1024, binary_prefix=True), '1.00Ki') + self.assertEqual(humanize_number(1), '1') self.assertEqual(humanize_number(999), '999') - self.assertEqual(humanize_number(1000), '1.000k') - self.assertEqual(humanize_number(99999), '99.99k') - self.assertEqual(humanize_number(100000), '100.0k') - self.assertEqual(humanize_number(1900000), '1.900M') - self.assertEqual(humanize_number(2000000), '2.000M') + self.assertEqual(humanize_number(1000), '1.00k') + self.assertEqual(humanize_number(99999), '99.9k') + self.assertEqual(humanize_number(100000), ' 100k') + self.assertEqual(humanize_number(999999), ' 999k') + self.assertEqual(humanize_number(1000000), '1.00M') + self.assertEqual(humanize_number(6100000000000000000), '6.10E') def test_unhumanize(self): self.assertEqual(unhumanize_number('999'), 999) @@ -2764,6 +2769,7 @@ class GfptarProgram(Program): def test_main_short(self): self.test_init() + # self.test_unit() self.test_opt_pattern(full=False) self.test_member() self.test_update_append_exclude() @@ -4947,15 +4953,15 @@ class GfptarProgram(Program): selected_size_str = self._humanize(self.selected_size) sys.stdout.write(f"\r{name}: " f"{selected_size_str}/{total_size_str}B " - f"{selected_num_str}/{total_num_str}E " + f"{selected_num_str}/{total_num_str}Ent " f"{sec_str} " - f"{ent_per_sec_str}E/s") + f"{ent_per_sec_str}Ent/s") else: sys.stdout.write(f"\r{name}: " f"{total_size_str}B " - f"{total_num_str}E " + f"{total_num_str}Ent " f"{sec_str} " - 
f"{ent_per_sec_str}E/s") + f"{ent_per_sec_str}Ent/s") def progress_for_infodb(self, now): sec = now - self.start_time @@ -4980,9 +4986,9 @@ class GfptarProgram(Program): sys.stdout.write(f"\r{name}: " f"{current_tar_num_str}/{total_tar_num_str}(DB) " f"{current_size_str}B " - f"{current_ent_num_str}E " + f"{current_ent_num_str}Ent " f"{sec_str} " - f"{ent_per_sec_str}E/s") + f"{ent_per_sec_str}Ent/s") else: name = 'load-db' sys.stdout.write(f"\r{name}: " @@ -5022,10 +5028,10 @@ class GfptarProgram(Program): ent_per_sec_str = self._humanize(ent_per_sec) sys.stdout.write(f"\r{self.cmd_name}: {percent_str}% " f"{stored_size_str}/{sel_size_str}B " - f"{stored_num_str}/{sel_num_str}/{total_num_str}E " + f"{stored_num_str}/{sel_num_str}/{total_num_str}Ent " f"{sec_str} " f"{bytes_per_sec_str}B/s " - f"{ent_per_sec_str}E/s") + f"{ent_per_sec_str}Ent/s") # lock required def progress_for_extract(self, now): @@ -5056,10 +5062,10 @@ class GfptarProgram(Program): name = "extract" sys.stdout.write(f"\r{name}: {percent_str}% " f"{extracted_size_str}/{total_size_str}B " - f"{extracted_num_str}/{total_num_str}E " + f"{extracted_num_str}/{total_num_str}Ent " f"{sec_str} " f"{bytes_per_sec_str}B/s " - f"{ent_per_sec_str}E/s") + f"{ent_per_sec_str}Ent/s") def cmd_list_simple(self, indir): return self.cmd_list(indir, verbose=False) From 515d6cf55d9e98ea88a49a3a1dbf9d4114a61948 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 26 Sep 2024 04:15:16 +0900 Subject: [PATCH 119/143] gfptar --update: create temporary DB for performance --- gftool/gfptar/gfptar | 46 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index be728faf7..f77934da7 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -248,6 +248,21 @@ class FileAttr2(DBObj): array[4], array[5], array[6]) +class FileAttr3(DBObj): + def __init__(self, mtime): + self.mtime = mtime + + @classmethod + def dumps(cls, obj, for_dict): + array = [obj.mtime] + return json.dumps(array, separators=(',', ':')) + + @classmethod + def loads(cls, key, txt, for_dict): + array = json.loads(txt) + return cls(array[0]) + + # Abstract class DBCollection: def __init__(self, db, obj_cls, table_name, clear=False): @@ -3688,6 +3703,10 @@ class GfptarProgram(Program): max_serial = 0 max_gen = 0 suffix = None + mtime_db_idx = 0 + fattr_dict = None + num_entries = 0 + MAX_ENTRIES = 10000000 # limit for performance of SQLite3 for serial, gen, tar_path, dbgz_path, db_path in \ self.list_tar_dbgz_db(self.outdir_url, sort=True, progress=self.progress_enabled): @@ -3701,9 +3720,22 @@ class GfptarProgram(Program): broken_count += 1 continue suffix = self.suffix_check(suffix, tar_path) - db = DB(db_path) - fattr_dict = DBDict(db, FileAttr2, InfoDB.TABLE_ENTRY) - self.fattr_dict_list.append(fattr_dict) + if fattr_dict is None or num_entries >= MAX_ENTRIES: + tmpdb_path = os.path.join(self.tmpdir.name, + f"mtime-{mtime_db_idx}.db") + db = DB(tmpdb_path) + fattr_dict = DBDict(db, FileAttr3, 'mtime') + self.fattr_dict_list.append(fattr_dict) + mtime_db_idx += 1 + num_entries = 0 + for path, fattr in InfoDB.list_entries_from_db( + db_path, resolve_ugmap=False): + if self.is_canceled(): + raise self.error_canceled() + fattr_dict[path] = FileAttr3(fattr.mtime) + num_entries += 1 + logger.debug('os.remove: %s', db_path) + os.remove(db_path) self.start_time = time.time() self.next_time = self.start_time + self.progress_interval self.cmd_create_common(max_serial + 1, max_gen + 1, 
infiles) @@ -3763,20 +3795,20 @@ class GfptarProgram(Program): if fattr is None: continue elif int(entry.mtime) > int(fattr.mtime): # sec. - logger.debug(f"is_update_target2: path={path}:" + logger.debug(f"is_update_target0: path={path}:" f" entry.mtime({entry.mtime}) >" f" fattr.mtime({fattr.mtime})") return True # NOTE: compare only mtime # elif fattr.size != entry.size: - # logger.debug(f"is_update_target2: path={path}:" + # logger.debug(f"is_update_target0: path={path}:" # f" fattr.size({fattr.size})" # f" != entry.size({entry.size})") # return True else: return False # not found - logger.debug(f"is_update_target2: path={path}: not found (True)") + logger.debug(f"is_update_target0: path={path}: not found (True)") return True if len(self.fattr_dict_list) > 0: @@ -4307,7 +4339,7 @@ class GfptarProgram(Program): if entry.is_file(): last = entry break - if last is None or first == last: + if last is None or first.path == last.path: firstpath = first.subpath(self.basedir_url) outname = '%s%s' % (firstpath, self.suffix) else: From c1a04d1d3ba5a779b9a6f2d6516a8263bd05e7e1 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 26 Sep 2024 17:00:02 +0900 Subject: [PATCH 120/143] gfptar: new option: --merge-db-for-update --- gftool/gfptar/gfptar | 141 +++++++++++++++++++++++++------------------ 1 file changed, 81 insertions(+), 60 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f77934da7..8f4dff8e1 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -248,19 +248,17 @@ class FileAttr2(DBObj): array[4], array[5], array[6]) -class FileAttr3(DBObj): +class Mtime(DBObj): def __init__(self, mtime): self.mtime = mtime @classmethod def dumps(cls, obj, for_dict): - array = [obj.mtime] - return json.dumps(array, separators=(',', ':')) + return str(obj.mtime) @classmethod def loads(cls, key, txt, for_dict): - array = json.loads(txt) - return cls(array[0]) + return cls(int(txt)) # Abstract @@ -2570,6 +2568,7 @@ class GfptarProgram(Program): self.bufsize = self.opt['--bufsize'] self.use_fsync = not self.opt['--disable-fsync'] self.gen_db = self.opt['--generate-db'] + self.merge_db_for_update = self.opt['--merge-db-for-update'] self.workdir = self.opt['--workdir'] euid = os.geteuid() @@ -3703,10 +3702,12 @@ class GfptarProgram(Program): max_serial = 0 max_gen = 0 suffix = None + + # self.merge_db_for_update is True mtime_db_idx = 0 fattr_dict = None num_entries = 0 - MAX_ENTRIES = 10000000 # limit for performance of SQLite3 + MAX_ENTRIES = self.merge_db_for_update for serial, gen, tar_path, dbgz_path, db_path in \ self.list_tar_dbgz_db(self.outdir_url, sort=True, progress=self.progress_enabled): @@ -3720,22 +3721,29 @@ class GfptarProgram(Program): broken_count += 1 continue suffix = self.suffix_check(suffix, tar_path) - if fattr_dict is None or num_entries >= MAX_ENTRIES: - tmpdb_path = os.path.join(self.tmpdir.name, - f"mtime-{mtime_db_idx}.db") - db = DB(tmpdb_path) - fattr_dict = DBDict(db, FileAttr3, 'mtime') + if MAX_ENTRIES > 0: + # limit for performance of SQLite3 + if fattr_dict is None or num_entries >= MAX_ENTRIES: + tmpdb_path = os.path.join(self.tmpdir.name, + f"mtime-{mtime_db_idx}.db") + db = DB(tmpdb_path) + fattr_dict = DBDict(db, Mtime, 'mtime') + self.fattr_dict_list.append(fattr_dict) + mtime_db_idx += 1 + num_entries = 0 + for path, fattr in InfoDB.list_entries_from_db( + db_path, resolve_ugmap=False): + if self.is_canceled(): + raise self.error_canceled() + fattr_dict[path] = Mtime(fattr.mtime) + num_entries += 1 + logger.debug('os.remove: %s', 
db_path) + os.remove(db_path) + else: + db = DB(db_path) + fattr_dict = DBDict(db, FileAttr2, InfoDB.TABLE_ENTRY) self.fattr_dict_list.append(fattr_dict) - mtime_db_idx += 1 - num_entries = 0 - for path, fattr in InfoDB.list_entries_from_db( - db_path, resolve_ugmap=False): - if self.is_canceled(): - raise self.error_canceled() - fattr_dict[path] = FileAttr3(fattr.mtime) - num_entries += 1 - logger.debug('os.remove: %s', db_path) - os.remove(db_path) + self.start_time = time.time() self.next_time = self.start_time + self.progress_interval self.cmd_create_common(max_serial + 1, max_gen + 1, infiles) @@ -5692,46 +5700,47 @@ Limitations: (For the upper limit, see the output of `ulimit -n -H`) Options: - -c, --create=OUTDIR Create tar files in OUTDIR from MEMBERs - -r, --append=OUTDIR Append files (create new tar files) - -u, --update=OUTDIR Append files newer than same entries in tar files - (--workdir with sufficient space is required) + -c, --create=OUTDIR Create tar files in OUTDIR from MEMBERs. + -r, --append=OUTDIR Append files. (create new tar files) + -u, --update=OUTDIR Append files newer than same entries in tar files. + (Sufficient space for --workdir is required) (About 300MiB is used per 1000000 entries) --resume=OUTDIR Same as --update -x, --extract=OUTDIR Extract all members or specified MEMBERs - from INDIR to OUTDIR - -t, --list=DIR List the members of DIR + from INDIR to OUTDIR. + -t, --list=DIR List the members of DIR. (--verbose to see more details) -C, --basedir=DIR Change to directory for MEMBERs [default: .] - --exclude=PATTERN Exclude files matching wildcard patterns - (https://docs.python.org/ja/3/library/fnmatch.html) + --exclude=PATTERN Exclude files matching wildcard patterns. (ex. --exclude=*.txt) (ex. --exclude=*/dirname/*) (ex. --exclude=*/??_abcde.pdf) + (https://docs.python.org/ja/3/library/fnmatch.html) -X, --exclude-from=FILE Exclude files matching wildcard patterns listed - in FILE - --workdir=DIR Local directory for temporary files + in FILE. + --workdir=DIR Local directory for temporary files. (default: {tempdir}) - -j, --jobs=NUM The number of jobs to copy per tar file in parallel + -j, --jobs=NUM The number of jobs to copy per tar file + in parallel. [default: 4] - -s, --size=BYTES Maximum assumed size per output file + -s, --size=BYTES Maximum assumed size per output file. [default: 200Mi] - --ratio=RATIO Assumed compression ratio (%) [default: 50] + --ratio=RATIO Assumed compression ratio (%). [default: 50] --max-entries-per-tar=NUM - The maximum number of entries per tar file - (Too many entries in a tar may use a lot of memory, - and may work slowly) + The maximum number of entries per tar file. + Too many entries in a tar may use a lot of memory, + and may work slowly. [default: 100k] - -T, --type=TYPE Compression type and tar file suffix - - gz : use gzip (*.tar.gz) - - bz2: use bzip2 (*.tar.bz2) - - xz : use xz (*.tar.xz) - - no : no compression (*.tar) - - (other): --use-compress-program is required + -T, --type=TYPE Compression type and tar file suffix. + gz : use gzip (*.tar.gz) + bz2: use bzip2 (*.tar.bz2) + xz : use xz (*.tar.xz) + no : no compression (*.tar) + (other): --use-compress-program is required [default: gz] -I, --use-compress-program=COMMAND Command to compress or decompress tar files - when using --type other than gz, bz2 or xz + when using --type other than gz, bz2 or xz. (The command must accept -d option to decompress) (ex. lz4, lzip, lzop) --gzip-program=COMMAND gzip command (ex. 
pigz) [default: gzip] @@ -5739,18 +5748,29 @@ Options: --xz-program=COMMAND xz command (ex. environment XZ_OPT="-T4" for speed) [default: xz] --generate-db Regenerate gfptar*_info.db.gz - --same-owner Extract files with the same ownership + --same-owner Extract files with the same ownership. (for euid=0 on local, or gfarmroot on Gfarm) --disable-gfarm-command Disable the use of gfreg and gfexport - for tar files on gfarm2fs - --disable-fsync disable calling fsync() before close() - --gfsched-interval=SEC Interval of updating candidate hosts to write + for tar files on gfarm2fs. + --disable-fsync disable calling fsync() before close(). + --gfsched-interval=SEC Interval of updating candidate hosts to write. (for Gfarm URL only) [default: 120] - --progress-interval=SEC Interval of updating progress [default: 1.0] - --encoding=CODEC Codec for filename encoding + --progress-interval=SEC Interval of updating progress. [default: 1.0] + --encoding=CODEC Codec for filename encoding. (https://docs.python.org/3/library/codecs.html#standard-encodings) [default: utf-8] - --bufsize=BYTES Buffer size to copy [default: 1Mi] + --bufsize=BYTES Buffer size to copy. [default: 1Mi] + --merge-db-for-update=NUM + The maximum number of entries in temporary DB + for --update. If the specified number is large, + *.db.gz file are merged to fewer files + on --workdir. If there are a lot of *.db.gz + files, using this option will improve comparison + performance for --update. + However, if there are a lot of entries + in tar files, the merge cost may be higher than + file comparison cost. + (0 = not merged) [default: 1M] --progress-unit=TYPE Unit for progress - si: SI prefix - bin: Binary prefix @@ -5763,18 +5783,18 @@ Options: --test-workdir-local=DIR Local directory for test [default: /tmp] --test-workdir-gfarm=DIR Gfarm directory for test [default: gfarm:/tmp] (A path in gfarm2fs can be specified) - --dummy-num=NUM The number of dummy input entries for --create - (ignore MEMBER arguments) - (create 1000 files and directories per 1 unit) - (default: disabled) + --dummy-num=NUM The number of dummy input entries for --create. + (MEMBER arguments are ignored.) + (1000 files are created per directory) + (default: disabled) (ex. 1M) --dummy-size-min=BYTES Minimum size of dummy files [default: 0] --dummy-size-max=BYTES Maximum size of dummy files [default: 0] --debug-sleep=SEC Sleep time per file for debug [default: 0.0] - --dry-run Not create output files - -q, --quiet Quiet messages - -v, --verbose Verbose output - -d, --debug Debug mode - -?, -h, --help Show this help and exit + --dry-run Not create output files. + -q, --quiet Quiet messages. + -v, --verbose Verbose output. + -d, --debug Debug mode. + -h, --help Show this help and exit. Usage: {f} [options] [--exclude=PATTERN]... -c OUTDIR [-C DIR] [--] MEMBER... 
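Note: the --update path in the two patches above copies only (path, mtime) pairs out of each per-tar *.db.gz into a bounded number of temporary SQLite files on --workdir, and then compares mtimes against those. A minimal standalone sketch of that idea follows; it uses sqlite3 with a flat table instead of gfptar's DB/DBDict classes and JSON 'path_entry' rows, so the function and table names here are illustrative assumptions only.

    # Illustration only (not gfptar's actual DB/DBDict classes).
    # Assumption: 'entries' yields (path, mtime) pairs already read from
    # the per-tar *.db.gz files; the real files store JSON attributes in
    # a 'path_entry' table instead of this flat layout.
    import os
    import sqlite3

    def aggregate_mtimes(entries, workdir, max_entries=100_000_000):
        """Spread (path, mtime) pairs over a bounded number of SQLite files."""
        dbs = []
        cur = None
        count = 0
        for path, mtime in entries:
            if cur is None or count >= max_entries:
                cur = sqlite3.connect(os.path.join(workdir, f'mtime-{len(dbs)}.db'))
                cur.execute('CREATE TABLE mtime (path TEXT PRIMARY KEY, mtime INTEGER)')
                dbs.append(cur)
                count = 0
            cur.execute('INSERT OR REPLACE INTO mtime VALUES (?, ?)', (path, int(mtime)))
            count += 1
        for db in dbs:
            db.commit()
        return dbs

    def is_newer_than_archived(path, mtime, dbs):
        """--update keeps an input file only if it is newer than the archived copy."""
        for db in dbs:
            row = db.execute('SELECT mtime FROM mtime WHERE path = ?', (path,)).fetchone()
            if row is not None:
                return int(mtime) > int(row[0])
        return True  # not archived yet, so it must be added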
@@ -5813,6 +5833,7 @@ _schema = Schema({ '--disable-gfarm-command': bool, '--disable-fsync': bool, '--gfsched-interval': Use(int), + '--merge-db-for-update': Use(unhumanize_number), '--generate-db': bool, '--same-owner': bool, '--workdir': Or(str, None), From 7d6385327d06e4adc4f75a009bcc8b1f00042ce0 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 26 Sep 2024 21:15:57 +0900 Subject: [PATCH 121/143] gfptar: remove -h option --- gftool/gfptar/gfptar | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 8f4dff8e1..dcd6356ec 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5794,7 +5794,7 @@ Options: -q, --quiet Quiet messages. -v, --verbose Verbose output. -d, --debug Debug mode. - -h, --help Show this help and exit. + -?, --help Show this help and exit. Usage: {f} [options] [--exclude=PATTERN]... -c OUTDIR [-C DIR] [--] MEMBER... @@ -5806,7 +5806,7 @@ Usage: {f} [options] --test {f} [options] --test -C DIR MEMBER... {f} [options] --test-long - {f} -h | --help + {f} -? | --help """.format(f=progname, tempdir=tempfile.gettempdir()) From e7b45543323bacaa231c579b7fca38331f54e563 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 27 Sep 2024 15:48:08 +0900 Subject: [PATCH 122/143] gfptar: update --help --- gftool/gfptar/gfptar | 58 ++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index dcd6356ec..f6337690d 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5648,45 +5648,50 @@ progname = os.path.basename(__file__) __doc__ = """ gfptar - archive files in parallel -Example of --create (Gfarm to Gfarm): +Example of --create (Local to Gfarm): Command line: - gfptar -c gfarm:/home/user1/out -C gfarm:/home/user1 ./dir - Input files (any files): - gfarm:/home/user1/dir/test0000.data + gfptar -c gfarm:/home/gfuser1/out1 -C /home/user1 ./dir1 + + Input files (in /home/user1): + dir1/test0000.data ... - gfarm:/home/user1/dir/test9999.data + dir1/test9999.data + Output files: - gfarm:/home/user1/out/0001_dir_test0000.data..dir_test0999.data.tar.gz - gfarm:/home/user1/out/g1_0001_gfptar.db.gz + gfarm:/home/gfuser1/out1/0001_dir_test0000.data..dir_test0999.data.tar.gz + gfarm:/home/gfuser1/out1/g1_0001_gfptar.db.gz ... - gfarm:/home/user1/out/0010_dir_test9000.data..dir_test9999.data.tar.gz - gfarm:/home/user1/out/g1_0010_gfptar.db.gz - Contents of gMM_NN_gfptar.db.gz file (SQLite3 and gzip): - MM: the generation number for each append operation - NN: the serial number - table 'path_entry': map of path name to JSON string - json.dumps([ file_mode (int), mtime (int), - user_unique_id (int), group_unique_id (int), - size (int), symlink_path, file_type (D,F,S) ] - file_type 'D': directory - file_type 'F': file - file_type 'S': symbolic link - table 'user_map' : map of unique id (not uid) to user name - table 'group_map': map of unique id (not gid) to group name + gfarm:/home/gfuser1/out1/0010_dir_test9000.data..dir_test9999.data.tar.gz + gfarm:/home/gfuser1/out1/g1_0010_gfptar.db.gz Example of --extract (Gfarm to Local): Command line: - gfptar -x /home/user1/out2 gfarm:/home/user1/gfptar-dir + gfptar -x /home/user1/dir2 gfarm:/home/gfuser1/out1 + Output files: - /home/user1/out2/dir/test0000.data + /home/user1/dir2/dir1/test0000.data ... 
- /home/user1/out2/dir/test9999.data + /home/user1/dir2/dir1/test9999.data + +Contents of gMM_NN_gfptar.db.gz file (SQLite3 and gzip): + MM: the generation number for each append operation + NN: the serial number + table 'path_entry': map of path name to JSON string + json.dumps([ file_mode (int), mtime (int), + user_unique_id (int), group_unique_id (int), + size (int), symlink_path, file_type (D,F,S) ] + file_type 'D': directory + file_type 'F': file + file_type 'S': symbolic link + table 'user_map' : map of unique id (not uid) to user name + table 'group_map': map of unique id (not gid) to group name SI prefix or Binary prefix: SI prefix: - 1k = 10^3 = 1000^1 (kilo) (not K) - 1M = 10^6 = 1000^2 (mega) - ... + Binary prefix: - 1Ki = 2^10 = 1024^1 (kibi) - 1Mi = 2^20 = 1024^2 (mebi) @@ -5696,8 +5701,9 @@ Limitations: - Hard links are not preserved. - File names cannot include newline characters. - Subsecond (less than a second) for mtime is not preserved. - - --update: If many *.db.gz files exists, input files may not be appended. - (For the upper limit, see the output of `ulimit -n -H`) + - --merge-db-for-update=0 --update: If many *.db.gz files exists, + input files may not be appended. (For the upper limit, see the + output of `ulimit -n -H`) Options: -c, --create=OUTDIR Create tar files in OUTDIR from MEMBERs. From 69e2fcf2ff07e9e75ced52307b6a8c55ac494ea3 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 27 Sep 2024 16:25:05 +0900 Subject: [PATCH 123/143] gfptar: update --help --- gftool/gfptar/gfptar | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index f6337690d..962eb637a 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5777,10 +5777,10 @@ Options: in tar files, the merge cost may be higher than file comparison cost. (0 = not merged) [default: 1M] - --progress-unit=TYPE Unit for progress - - si: SI prefix - - bin: Binary prefix - - raw: no conversion + --progress-unit=TYPE Unit type for progress + si : SI prefix + bin: Binary prefix + raw: no conversion [default: si] --memory=BYTES Upper limit of memory size (bytes) (default: no limit) (ex. 2Gi) From fc2576f85ef6b63d073ae02803fc9eaa25efc981 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 27 Sep 2024 17:27:11 +0900 Subject: [PATCH 124/143] gfptar: --merge-db-for-update=100M --- gftool/gfptar/gfptar | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 962eb637a..881fc3386 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5773,10 +5773,10 @@ Options: on --workdir. If there are a lot of *.db.gz files, using this option will improve comparison performance for --update. - However, if there are a lot of entries - in tar files, the merge cost may be higher than - file comparison cost. - (0 = not merged) [default: 1M] + However, when there are few input files and + many existing files in tar files, the merge cost + may be higher than file comparison cost. 
+ (0 = not merged) [default: 100M] --progress-unit=TYPE Unit type for progress si : SI prefix bin: Binary prefix From 3f7d3a9e0a742e0535374b33966b0dea519d983f Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 27 Sep 2024 17:28:09 +0900 Subject: [PATCH 125/143] gfptar: update manpages --- doc/docbook/en/ref/man1/gfptar.1.docbook | 70 ++++++++++++++++++---- doc/docbook/ja/ref/man1/gfptar.1.docbook | 75 +++++++++++++++++++----- 2 files changed, 120 insertions(+), 25 deletions(-) diff --git a/doc/docbook/en/ref/man1/gfptar.1.docbook b/doc/docbook/en/ref/man1/gfptar.1.docbook index 585d98f8a..c425d2422 100644 --- a/doc/docbook/en/ref/man1/gfptar.1.docbook +++ b/doc/docbook/en/ref/man1/gfptar.1.docbook @@ -6,7 +6,7 @@ -29 Jan 2023 +27 Sep 2024 gfptar @@ -43,6 +43,22 @@ -t indir + + gfptar + OPTIONS + -r outdir + -C basedir + member + + + + gfptar + OPTIONS + -u outdir + -C basedir + member + + @@ -51,9 +67,9 @@ gfptar - creates archive files to Gfarm, or extracts files from archive files on Gfarm. - The archive files consist of multiple tar files and list files of archived members in a directory. - Creation or extraction is performed in parallel for each tar file in a directory for archive. + creates archive files to Gfarm or local file system, or extracts the archive files. + The archive files consist of multiple tar files and database files in a directory. + Creation or extraction is performed in parallel for each tar file. @@ -63,7 +79,8 @@ outdir - basedir member... + [ basedir] + member... @@ -99,6 +116,31 @@ + + outdir + [ basedir] + member... + + + + Append archive files in outdir directory. + + + + + + outdir + [ basedir] + member... + + + + Append archive files with newer files + in existing outdir directory. + + + + @@ -136,6 +178,12 @@ OPTIONS + + + The primary options are listed below. + For additional options, please refer to gfptar --help. + + @@ -161,13 +209,13 @@ for each group of files divided by the assumed size calculated considering the assumed compression ratio (--ratio option). - The default value is 200M (200 * 1024^2 = 209715200) (200 MiB). + The default value is 200Mi (200 * 1024^2 = 209715200) (200 mebibytes). -RATIO +RATIO Specifies the assumed compression ratio in percentage for . @@ -188,9 +236,6 @@ "xz" means xz type (suffix is .tar.xz). "no" means no compression. - - This option is unnecessary for --extract (determine type automatically). - The default value is gz. @@ -236,7 +281,7 @@ - + Displays a list of command options. @@ -292,6 +337,9 @@ SEE ALSO + + gfptar --help + gfreg1 diff --git a/doc/docbook/ja/ref/man1/gfptar.1.docbook b/doc/docbook/ja/ref/man1/gfptar.1.docbook index b3ba0b1b5..904f782e1 100644 --- a/doc/docbook/ja/ref/man1/gfptar.1.docbook +++ b/doc/docbook/ja/ref/man1/gfptar.1.docbook @@ -6,7 +6,7 @@ -29 Jan 2023 +27 Sep 2024 gfptar @@ -43,6 +43,22 @@ -t indir + + gfptar + OPTIONS + -r outdir + -C basedir + member + + + + gfptar + OPTIONS + -u outdir + -C basedir + member + + @@ -51,9 +67,9 @@ gfptar -は、アーカイブファイルをGfarm上に作成します。また、そのアーカイブファイルからファイルを抽出します。 -作成されるアーカイブファイルは、ディレクトリの中に複数のtarファイルと複数のファイル一覧のファイルで構成されます。 -アーカイブの作成または抽出処理を、アーカイブ用ディレクトリ内のtarファイルごとに並列でおこないます。 +は、アーカイブファイルをGfarmまたはローカルファイルシステムに作成します。または、そのアーカイブファイルからファイルを抽出します。 +作成されるアーカイブファイルは、一つのディレクトリの中に、複数のtarファイルと複数のデータベースファイルで構成されます。 +アーカイブの作成または抽出処理を、tarファイルごとに並列でおこないます。 @@ -63,7 +79,8 @@ outdir - basedir member... + [ basedir] + member... @@ -100,6 +117,30 @@ + + outdir + [ basedir] + member... 
+ + + + outdir に、既存アーカイブファイル群を含むディレクトリを指定し、アーカイブファイルを追加します。 + + + + + + outdir + [ basedir] + member... + + + + outdir に、アーカイブファイル群を含むディレクトリを指定し、既存ファイルよりも新しいファイルのみを格納したアーカイブファイル追加します。 + + + + @@ -137,6 +178,12 @@ OPTIONS + + + 主なオプションは以下の通りです。 + その他オプションは gfptar --help を参照してください。 + + @@ -160,13 +207,13 @@ 仮定圧縮率(別オプション --ratio)を考慮して計算された想定サイズで分割されたファイル群ごとにアーカイブファイルを作成します。 - デフォルト値は 200M (200 * 1024^2 = 209715200) (200 メビバイト) です。 + デフォルト値は 200Mi (200 * 1024^2 = 209715200) (200 メビバイト) です。 -RATIO +RATIO アーカイブを作成時に、分割されるアーカイブファイルの仮定圧縮率(%)を指定します。 @@ -181,14 +228,11 @@ TYPE - アーカイブを作成時に、圧縮する形式を指定します。 + アーカイブ新規作成時(--create)の圧縮形式を指定します。 gz は gzip、bz2 は bzip2、xz は xz 形式で圧縮されます。 - 拡張子もそれぞれ tar.gz, tar.bz2, tar,xz になります。 + 拡張子はそれぞれ tar.gz, tar.bz2, tar.xz になります。 no を指定すると圧縮しません。 - - 抽出時にこのオプションは使用されず、形式が自動で認識されます。 - デフォルト値は gz です。 @@ -201,7 +245,7 @@ アーカイブを作成時または抽出時に、圧縮・伸張するための外部コマンドを指定します。 作成されるアーカイブファイルの拡張子を オプションで指定します。 - 抽出時にはそのコマンドに -d オプションが指定されます。 + 抽出時にはその外部コマンドに -d オプションが指定されます。 @@ -234,7 +278,7 @@ - + コマンドオプションを出力します。 @@ -292,6 +336,9 @@ SEE ALSO + + gfptar --help + gfreg1 From 75872ea8325be5722c250aec9485176af5276cec Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 1 Oct 2024 17:10:46 +0900 Subject: [PATCH 126/143] gfptar: prevent the progress message from being shortened --- gftool/gfptar/gfptar | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 881fc3386..bf5514797 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -65,7 +65,9 @@ def format_seconds(seconds, minhour=False): return f"{seconds:.0f}s({minutes:.1f}m)" else: hours = seconds / 3600 - return f"{seconds:.0f}s({hours:.1f}h)" + # Prevent the string from being shortened + # ex. '1.1h' -> ' 1.1h' + return f"{seconds:.0f}s({hours:4.1f}h)" def humanize_number(num, binary_prefix=False): @@ -84,13 +86,18 @@ def humanize_number(num, binary_prefix=False): n /= base scale += 1 if n < 10: + # ex. '1.23' d = n.quantize(Decimal('0.00'), rounding=ROUND_DOWN) elif n < 100: + # ex. '12.3' d = n.quantize(Decimal('0.0'), rounding=ROUND_DOWN) elif n < 1000: d = n.quantize(Decimal('0'), rounding=ROUND_DOWN) + # Prevent the string from being shortened + # ex. ' 123' d = f" {d}" else: + # ex. 
'1234' d = n.quantize(Decimal('0'), rounding=ROUND_DOWN) return f"{d}{units[scale]}" From 37d3831520158202b3b04588228e9594fb696176 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 4 Oct 2024 22:12:55 +0900 Subject: [PATCH 127/143] - gfptar: new option: --disable-tempdb-aggregation - gfptar: new option: --max-entries-for-tempdb-aggregation - gfptar: delete --merge-db-for-update --- gftool/gfptar/gfptar | 87 +++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 33 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index bf5514797..1fcd03e8d 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -634,6 +634,21 @@ def logger_init_without_lock(name, loglevel=logging.WARNING, return logger +def get_max_openfiles(): + _, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE) + return hard_lim + + +def update_max_openfiles(): + soft_lim, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE) + logger.info(f"current soft RLIMIT_NOFILE: {soft_lim}") + logger.info(f"current hard RLIMIT_NOFILE: {hard_lim}") + new_soft_lim = hard_lim + resource.setrlimit(resource.RLIMIT_NOFILE, (new_soft_lim, hard_lim)) + soft_lim, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE) + logger.info(f"new soft RLIMIT_NOFILE: {soft_lim}") + + _encoding = sys.getfilesystemencoding() @@ -2575,8 +2590,10 @@ class GfptarProgram(Program): self.bufsize = self.opt['--bufsize'] self.use_fsync = not self.opt['--disable-fsync'] self.gen_db = self.opt['--generate-db'] - self.merge_db_for_update = self.opt['--merge-db-for-update'] - + self.enable_tempdb_aggregation \ + = not self.opt['--disable-tempdb-aggregation'] + self.max_tempdb_aggregation \ + = self.opt['--max-entries-for-tempdb-aggregation'] self.workdir = self.opt['--workdir'] euid = os.geteuid() # Temporary files are removed when the process exits. 
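The two helpers added above deal with the per-process file-descriptor limit: with --update, every existing per-tar DB file may need to be open at the same time, so update_max_openfiles() raises the soft RLIMIT_NOFILE to the hard limit, and get_max_openfiles() reports that hard limit in the help text. The following is a minimal standalone sketch of the same setrlimit pattern, assuming only the standard library; the helper names, the mtime-N.db naming and the figure of 2000 files are illustrative, not code from this patch series.

    import os
    import resource
    import sqlite3
    import tempfile

    def raise_nofile_soft_limit():
        # An unprivileged process may raise its soft limit up to the hard
        # limit; this mirrors what update_max_openfiles() does above.
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
        return soft, hard

    def open_many_dbs(count, directory):
        # Keep many SQLite files open at once, roughly what --update needs
        # while it compares mtimes against every existing per-tar DB.
        return [sqlite3.connect(os.path.join(directory, f"mtime-{i}.db"))
                for i in range(count)]

    if __name__ == '__main__':
        old_soft, new_soft = raise_nofile_soft_limit()
        print(f"soft RLIMIT_NOFILE: {old_soft} -> {new_soft}")
        with tempfile.TemporaryDirectory() as workdir:
            # 2000 usually exceeds the common default soft limit of 1024,
            # so this step would typically fail without the raise above.
            conns = open_many_dbs(2000, workdir)
            print(f"{len(conns)} databases open")
            for conn in conns:
                conn.close()
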
@@ -2603,14 +2620,6 @@ class GfptarProgram(Program): if self.memory_limit is not None: self.set_memory_limit(self.memory_limit) - soft_lim, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE) - logger.info(f"current soft RLIMIT_NOFILE: {soft_lim}") - logger.info(f"current hard RLIMIT_NOFILE: {hard_lim}") - new_soft_lim = hard_lim - resource.setrlimit(resource.RLIMIT_NOFILE, (new_soft_lim, hard_lim)) - soft_lim, hard_lim = resource.getrlimit(resource.RLIMIT_NOFILE) - logger.info(f"new soft RLIMIT_NOFILE: {soft_lim}") - self.compress_type = self.opt['--type'] self.compress_prog = self.opt['--use-compress-program'] self.gzip_prog = self.opt['--gzip-program'] @@ -3656,7 +3665,7 @@ class GfptarProgram(Program): self.cannot_be_archived = 0 self.create_job_init() # before starting threads - self.fattr_dict_list = [] + self.fattr_dict_list = [] # to compare mtime for --update def cmd_create(self, outdir, basedir, infiles): self.cmd_create_init('create', outdir, basedir) @@ -3705,16 +3714,23 @@ class GfptarProgram(Program): self.cmd_create_init('update', outdir, basedir) if not self.outdir_url.exists(): raise FileNotFoundError(outdir) + + # To open all DB files at the same time + update_max_openfiles() + broken_count = 0 max_serial = 0 max_gen = 0 suffix = None + fattr_dict = None - # self.merge_db_for_update is True mtime_db_idx = 0 - fattr_dict = None num_entries = 0 - MAX_ENTRIES = self.merge_db_for_update + if self.enable_tempdb_aggregation: + MAX_ENTRIES = self.max_tempdb_aggregation + else: + MAX_ENTRIES = 0 + for serial, gen, tar_path, dbgz_path, db_path in \ self.list_tar_dbgz_db(self.outdir_url, sort=True, progress=self.progress_enabled): @@ -3729,7 +3745,6 @@ class GfptarProgram(Program): continue suffix = self.suffix_check(suffix, tar_path) if MAX_ENTRIES > 0: - # limit for performance of SQLite3 if fattr_dict is None or num_entries >= MAX_ENTRIES: tmpdb_path = os.path.join(self.tmpdir.name, f"mtime-{mtime_db_idx}.db") @@ -5664,7 +5679,7 @@ Example of --create (Local to Gfarm): ... dir1/test9999.data - Output files: + Output files in archive directory: gfarm:/home/gfuser1/out1/0001_dir_test0000.data..dir_test0999.data.tar.gz gfarm:/home/gfuser1/out1/g1_0001_gfptar.db.gz ... @@ -5675,6 +5690,9 @@ Example of --extract (Gfarm to Local): Command line: gfptar -x /home/user1/dir2 gfarm:/home/gfuser1/out1 + Input archive directory: + gfarm:/home/gfuser1/out1 + Output files: /home/user1/dir2/dir1/test0000.data ... @@ -5708,9 +5726,6 @@ Limitations: - Hard links are not preserved. - File names cannot include newline characters. - Subsecond (less than a second) for mtime is not preserved. - - --merge-db-for-update=0 --update: If many *.db.gz files exists, - input files may not be appended. (For the upper limit, see the - output of `ulimit -n -H`) Options: -c, --create=OUTDIR Create tar files in OUTDIR from MEMBERs. @@ -5765,7 +5780,7 @@ Options: (for euid=0 on local, or gfarmroot on Gfarm) --disable-gfarm-command Disable the use of gfreg and gfexport for tar files on gfarm2fs. - --disable-fsync disable calling fsync() before close(). + --disable-fsync Disable calling fsync() before close(). --gfsched-interval=SEC Interval of updating candidate hosts to write. (for Gfarm URL only) [default: 120] --progress-interval=SEC Interval of updating progress. [default: 1.0] @@ -5773,17 +5788,20 @@ Options: (https://docs.python.org/3/library/codecs.html#standard-encodings) [default: utf-8] --bufsize=BYTES Buffer size to copy. 
[default: 1Mi] - --merge-db-for-update=NUM - The maximum number of entries in temporary DB - for --update. If the specified number is large, - *.db.gz file are merged to fewer files - on --workdir. If there are a lot of *.db.gz - files, using this option will improve comparison - performance for --update. - However, when there are few input files and - many existing files in tar files, the merge cost - may be higher than file comparison cost. - (0 = not merged) [default: 100M] + --disable-tempdb-aggregation + This option only effective with --update option. + When there are few files in MEMBERs, this option + may improve the execution speed. + However, if more than about {max_openfiles} + tar files in archive directory exist, + the execution with this option will fail. + --max-entries-for-tempdb-aggregation=NUM + This option only effective with --update option. + A larger value is generally recommended + for this option. However, if the value is + excessively large and there are too many entries + in archive directory, performance may degrade. + [default: 100M] --progress-unit=TYPE Unit type for progress si : SI prefix bin: Binary prefix @@ -5820,7 +5838,9 @@ Usage: {f} [options] --test -C DIR MEMBER... {f} [options] --test-long {f} -? | --help -""".format(f=progname, tempdir=tempfile.gettempdir()) +""".format(f=progname, + tempdir=tempfile.gettempdir(), + max_openfiles=get_max_openfiles()) _schema = Schema({ @@ -5846,7 +5866,8 @@ _schema = Schema({ '--disable-gfarm-command': bool, '--disable-fsync': bool, '--gfsched-interval': Use(int), - '--merge-db-for-update': Use(unhumanize_number), + '--disable-tempdb-aggregation': bool, + '--max-entries-for-tempdb-aggregation': Use(unhumanize_number), '--generate-db': bool, '--same-owner': bool, '--workdir': Or(str, None), From 3fd530207f50d05f3d2e04ba9b8b5ee7ea47cf30 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 4 Oct 2024 22:54:27 +0900 Subject: [PATCH 128/143] gfptar --update,--append: "Compression type mismatch" when different --type is specified (There is no need to specify --type in such case) --- gftool/gfptar/gfptar | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 1fcd03e8d..85fcc4678 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2620,7 +2620,12 @@ class GfptarProgram(Program): if self.memory_limit is not None: self.set_memory_limit(self.memory_limit) - self.compress_type = self.opt['--type'] + self.compress_type_specified = self.opt['--type'] + if self.compress_type_specified is None: + self.compress_type = Compress.TYPE_GZIP + else: + self.compress_type = self.compress_type_specified + self.compress_prog = self.opt['--use-compress-program'] self.gzip_prog = self.opt['--gzip-program'] self.xz_prog = self.opt['--xz-program'] @@ -3674,8 +3679,8 @@ class GfptarProgram(Program): gen = 1 self.cmd_create_common(serial, gen, infiles) - def error_type_mismatch(self, tar_path): - return GfptarError(f"Compression type mismatch: {tar_path}") + def error_type_mismatch(self, suffix, tar_path): + return GfptarError(f"Compression type mismatch ({suffix}): {tar_path}") def suffix_check(self, suffix, tar_path): if suffix is None: @@ -3691,7 +3696,7 @@ class GfptarProgram(Program): Compress.init(self.compress_type, self.compress_prog, self.gzip_prog, self.bzip2_prog, self.xz_prog) elif not tar_path.endswith(suffix): - raise self.error_type_mismatch(tar_path) + raise self.error_type_mismatch(suffix, tar_path) return suffix def 
cmd_append(self, outdir, basedir, infiles): @@ -3700,7 +3705,7 @@ class GfptarProgram(Program): raise FileNotFoundError(outdir) max_serial = 0 max_gen = 0 - suffix = None + suffix = self.compress_type_specified # may be None tar_list, dbgz_list = self.list_tar_list_dbgz_from_dir(self.outdir_url) for serial, gen, tar_path in tar_list: if serial > max_serial: @@ -3721,7 +3726,7 @@ class GfptarProgram(Program): broken_count = 0 max_serial = 0 max_gen = 0 - suffix = None + suffix = self.compress_type_specified # may be None fattr_dict = None mtime_db_idx = 0 @@ -5765,7 +5770,7 @@ Options: xz : use xz (*.tar.xz) no : no compression (*.tar) (other): --use-compress-program is required - [default: gz] + (default: gz) -I, --use-compress-program=COMMAND Command to compress or decompress tar files when using --type other than gz, bz2 or xz. @@ -5856,7 +5861,7 @@ _schema = Schema({ '--encoding': str, '--size': Use(unhumanize_number), '--bufsize': Use(unhumanize_number), - '--type': str, + '--type': Or(str, None), '--ratio': Use(int), '--jobs': Use(int), '--use-compress-program': Or(str, None), From 929225217e47eaf4f3f777e47f30a6501d970c55 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 4 Oct 2024 23:08:45 +0900 Subject: [PATCH 129/143] gfptar: use list_entries_from_db(sort=None) --- gftool/gfptar/gfptar | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 85fcc4678..60d05b3fd 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3759,7 +3759,7 @@ class GfptarProgram(Program): mtime_db_idx += 1 num_entries = 0 for path, fattr in InfoDB.list_entries_from_db( - db_path, resolve_ugmap=False): + db_path, resolve_ugmap=False, sort=None): if self.is_canceled(): raise self.error_canceled() fattr_dict[path] = Mtime(fattr.mtime) @@ -4624,7 +4624,7 @@ class GfptarProgram(Program): error_num += 1 continue for path, fattr in InfoDB.list_entries_from_db( - db_path, resolve_ugmap=False): + db_path, resolve_ugmap=False, sort=None): if self.is_canceled(): raise self.error_canceled() size = fattr.size @@ -5640,7 +5640,7 @@ class InfoDB: return db_path, dbgz_path, num, size @staticmethod - def list_entries_from_db(db_path, resolve_ugmap=False): + def list_entries_from_db(db_path, resolve_ugmap=False, sort='ASC'): db = DB(db_path) has_error = False try: @@ -5655,7 +5655,7 @@ class InfoDB: user_dict_mem[k] = v for k, v in group_dict.items(): group_dict_mem[k] = v - for path, fattr in fattr_dict.iterator(sort='ASC'): + for path, fattr in fattr_dict.iterator(sort=sort): if resolve_ugmap: # unique id -> name fattr.user = user_dict_mem.get(fattr.user, '???') From 8b71c6570584b8e63f100f6720fb094bc5da5d92 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 4 Oct 2024 23:15:30 +0900 Subject: [PATCH 130/143] gfptar --list without --verbose: use resolve_ugmap=False --- gftool/gfptar/gfptar | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 60d05b3fd..5fa4857f9 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -5179,14 +5179,17 @@ class GfptarProgram(Program): if quiet: print_func = print_quiet + resolve_ugmap = False elif verbose: print_func = print_verbose + resolve_ugmap = True else: print_func = print_simple + resolve_ugmap = False count = 0 for serial, gen, ent_path, fattr in self.list_entries( - indir_url, resolve_ugmap=True): + indir_url, resolve_ugmap=resolve_ugmap): if self.is_canceled(): break print_func(serial, gen, ent_path, fattr) From 
e2fc6b1e3fb7dda168310c3ccab75f236b96828e Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 4 Oct 2024 23:36:36 +0900 Subject: [PATCH 131/143] gfptar: delete --resume --- gftool/gfptar/gfptar | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 5fa4857f9..80fe97c42 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2733,8 +2733,6 @@ class GfptarProgram(Program): self.cmd_append(outdir, basedir, infiles) outdir = self.opt['--update'] - if not outdir: - outdir = self.opt['--resume'] if outdir: basedir = self.opt['--basedir'] infiles = self.opt['MEMBER'] @@ -5737,12 +5735,14 @@ Limitations: Options: -c, --create=OUTDIR Create tar files in OUTDIR from MEMBERs. - -r, --append=OUTDIR Append files. (create new tar files) - -u, --update=OUTDIR Append files newer than same entries in tar files. + -r, --append=OUTDIR Append files to new tar files. + -u, --update=OUTDIR Append only files (to new tar files) newer than + same entries in existing tar files. + This option can also be used to resume operation + after interruption (including errors). (Sufficient space for --workdir is required) - (About 300MiB is used per 1000000 entries) - --resume=OUTDIR Same as --update - -x, --extract=OUTDIR Extract all members or specified MEMBERs + (About 300MiB is used per existing 1000000 entries) + -x, --extract=OUTDIR Extract all entries or specified MEMBERs from INDIR to OUTDIR. -t, --list=DIR List the members of DIR. (--verbose to see more details) @@ -5839,7 +5839,6 @@ Usage: {f} [options] [--exclude=PATTERN]... -c OUTDIR [-C DIR] [--] MEMBER... {f} [options] [--exclude=PATTERN]... -r OUTDIR [-C DIR] [--] MEMBER... {f} [options] [--exclude=PATTERN]... -u OUTDIR [-C DIR] [--] MEMBER... - {f} [options] [--exclude=PATTERN]... --resume OUTDIR [-C DIR] [--] MEMBER... {f} [options] [--exclude=PATTERN]... -x OUTDIR [--] INDIR [MEMBER...] 
{f} [options] -t DIR {f} [options] --test @@ -5857,7 +5856,6 @@ _schema = Schema({ '--create': Or(str, None), '--append': Or(str, None), '--update': Or(str, None), - '--resume': Or(str, None), '--basedir': Or(str, None), '--exclude': Or([str], None), '--exclude-from': Or(str, None), From 05aa29acf3214e8cd140b8ffc9f9e435c8a81084 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Fri, 4 Oct 2024 23:38:39 +0900 Subject: [PATCH 132/143] gfptar: clarify --- gftool/gfptar/gfptar | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 80fe97c42..53be21ae0 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3774,7 +3774,7 @@ class GfptarProgram(Program): self.cmd_create_common(max_serial + 1, max_gen + 1, infiles) if broken_count > 0: raise GfptarError("Some .tar.* or .db.gz are broken:" - f" num={broken_count}") + f" count={broken_count}") def cmd_create_common(self, serial, gen, infiles): if self.compress_type == Compress.TYPE_NO: @@ -4743,7 +4743,7 @@ class GfptarProgram(Program): self.db.unlink() if error_num > 0: raise GfptarError("Some .tar.* or .db.gz are broken:" - f" num={error_num}") + f" count={error_num}") def extract_directories(self, directory_set): logger.debug('extract_directories') @@ -5213,7 +5213,7 @@ class GfptarProgram(Program): os.remove(db_path) if error_num > 0: raise GfptarError("Some .tar.* or .db.gz are broken:" - f" num={error_num}") + f" count={error_num}") def sort_tar_or_db_list(self, tar_or_db_list): def id_key(value): From 504bd11f73f047b77fc86bc98a0aeac452906837 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 5 Oct 2024 00:12:57 +0900 Subject: [PATCH 133/143] gfptar --create: check the existence of input MEMBERs first --- gftool/gfptar/gfptar | 70 ++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 53be21ae0..0975744de 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2571,6 +2571,7 @@ class GfptarProgram(Program): self.tmpdir = None self.test_mode = False self.worker_list = [] # (started, process, inq, outq) + self.progress_enabled = True def options_init(self): self.log_init() @@ -3626,7 +3627,7 @@ class GfptarProgram(Program): TABLE_tar_entry = 'tar_entry' - def cmd_create_init(self, cmd_name, outdir, basedir): + def cmd_create_init(self, cmd_name, outdir, basedir, infiles): self.cmd_name = cmd_name self.options_init() logger.debug('create start: outdir=%s, basedir=%s', outdir, basedir) @@ -3670,12 +3671,38 @@ class GfptarProgram(Program): self.create_job_init() # before starting threads self.fattr_dict_list = [] # to compare mtime for --update + self.infiles_checked = set() + for infile in infiles: + # infile_url = GfURL.init(infile) + # if not infile_url.is_local(): + # raise GfException('specifying a relative path is required ' + # 'instead of a URL: ' + infile) + # NOTE: allow members named gfarm:* (ex. gfarm:/tmp/gfarm:abc) + infile_url = GfURL.init(infile, local=True) + infile = infile_url.path # normalize and ignore scheme + # normalized: ex. .///abc -> ./abc + infile = infile.lstrip('/') # relative path only + if infile.startswith('./'): + infile = infile[2:] + if infile == '' or infile == '.': + raise GfException('specifying current directory (.) ' + + 'as members is not allowed: ' + infile) + if infile == '..' or infile.startswith('../'): + raise GfException('specifying parent directory (..) 
' + + 'as members is not allowed: ' + infile) + + url_str = os.path.join(self.basedir_url.url_str, infile) + url = GfURL.init(url_str) + if not url.exists(): + raise FileNotFoundError(url_str) + self.infiles_checked.add(infile) + def cmd_create(self, outdir, basedir, infiles): - self.cmd_create_init('create', outdir, basedir) + self.cmd_create_init('create', outdir, basedir, infiles) self.outdir_url.create_new_dir() serial = 1 gen = 1 - self.cmd_create_common(serial, gen, infiles) + self.cmd_create_common(serial, gen) def error_type_mismatch(self, suffix, tar_path): return GfptarError(f"Compression type mismatch ({suffix}): {tar_path}") @@ -3698,7 +3725,7 @@ class GfptarProgram(Program): return suffix def cmd_append(self, outdir, basedir, infiles): - self.cmd_create_init('append', outdir, basedir) + self.cmd_create_init('append', outdir, basedir, infiles) if not self.outdir_url.exists(): raise FileNotFoundError(outdir) max_serial = 0 @@ -3711,10 +3738,10 @@ class GfptarProgram(Program): if gen > max_gen: max_gen = gen suffix = self.suffix_check(suffix, tar_path) - self.cmd_create_common(max_serial + 1, max_gen + 1, infiles) + self.cmd_create_common(max_serial + 1, max_gen + 1) def cmd_update(self, outdir, basedir, infiles): - self.cmd_create_init('update', outdir, basedir) + self.cmd_create_init('update', outdir, basedir, infiles) if not self.outdir_url.exists(): raise FileNotFoundError(outdir) @@ -3771,12 +3798,12 @@ class GfptarProgram(Program): self.start_time = time.time() self.next_time = self.start_time + self.progress_interval - self.cmd_create_common(max_serial + 1, max_gen + 1, infiles) + self.cmd_create_common(max_serial + 1, max_gen + 1) if broken_count > 0: raise GfptarError("Some .tar.* or .db.gz are broken:" f" count={broken_count}") - def cmd_create_common(self, serial, gen, infiles): + def cmd_create_common(self, serial, gen): if self.compress_type == Compress.TYPE_NO: self.split_size = self.assumed_size self.suffix = '.tar' @@ -3784,27 +3811,6 @@ class GfptarProgram(Program): self.split_size = self.assumed_size * 100 / self.ratio self.suffix = '.tar.' + self.compress_type - infiles_checked = set() - for infile in infiles: - # infile_url = GfURL.init(infile) - # if not infile_url.is_local(): - # raise GfException('specifying a relative path is required ' - # 'instead of a URL: ' + infile) - # NOTE: allow members named gfarm:* (ex. gfarm:/tmp/gfarm:abc) - infile_url = GfURL.init(infile, local=True) - infile = infile_url.path # normalize and ignore scheme - # normalized: ex. .///abc -> ./abc - infile = infile.lstrip('/') # relative path only - if infile.startswith('./'): - infile = infile[2:] - if infile == '' or infile == '.': - raise GfException('specifying current directory (.) ' - + 'as members is not allowed: ' + infile) - if infile == '..' or infile.startswith('../'): - raise GfException('specifying parent directory (..) 
' - + 'as members is not allowed: ' + infile) - infiles_checked.add(infile) - def listdir_switch(gfurl): if self.dummy_num is not None: return self.list_dummy_files(gfurl.url_str, @@ -3864,7 +3870,7 @@ class GfptarProgram(Program): if self.progress_enabled: self.progress_for_create(self.start_time) - for infile in infiles_checked: + for infile in self.infiles_checked: if self.is_canceled(): logger.debug('Canceled (listdir 1): serial=%d', serial) break @@ -3938,6 +3944,8 @@ class GfptarProgram(Program): break # from loop of infiles_checked except Exception as e2: cannot_read_dir += 1 + if self.progress_enabled: + logger.warning('') logger.warning('%s: error while reading directory (%s)', gfurl.url_str, convert_message(e2)) self.print_trace(e2) @@ -3978,7 +3986,7 @@ class GfptarProgram(Program): print('compression ratio: %.2f%% (%d/%d)' % (100 * self.archived_size / self.stored_size, self.archived_size, self.stored_size)) - if self.selected_num == 0: + if self.selected_num == 0 and self.cmd_name == 'update': print('No files were updated.') if has_error is not None: raise has_error From c2edbe230c92ac7a0bbe57e450bb3151293a3598 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 5 Oct 2024 12:56:01 +0900 Subject: [PATCH 134/143] gfptar --extract: call sync() instead of applying fsync() to each local file --- gftool/gfptar/gfptar | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 0975744de..1d7108d9c 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3418,7 +3418,7 @@ class GfptarProgram(Program): with url.writeopen(textmode=True, mode=mode, mtime=mtime, - use_fsync=self.use_fsync) as f: + use_fsync=False) as f: f.write(path + str(mtime)) elif ftype == self.D: url.mkdir() @@ -4577,6 +4577,15 @@ class GfptarProgram(Program): db_file = os.path.join(tmpdir.name, 'extract.db') logger.debug('db_file=%s', db_file) + # If outdir is gfarm2fs, the path will be converted into Gfarm URL. 
+ outdir_url_gf = GfURL.init(outdir, use_gfarm_command=True) + if outdir_url_gf.is_local(): + self.use_fsync_for_x = False # to improve performance + self.outdir_is_local = True + else: + self.use_fsync_for_x = self.use_fsync + self.outdir_is_local = False + # to reduce memory usage self.db = DB(db_file, check_same_thread=False) @@ -4749,6 +4758,13 @@ class GfptarProgram(Program): self.update_stat_for_directories(directory_set) self.db.close() self.db.unlink() + + # use sync() instead of applying fsync() to each local file + if self.outdir_is_local: + logger.debug("sync() START: %f", time.time()) + os.sync() + logger.debug("sync() END : %f", time.time()) + if error_num > 0: raise GfptarError("Some .tar.* or .db.gz are broken:" f" count={error_num}") @@ -4954,7 +4970,7 @@ class GfptarProgram(Program): mode=tarinfo.mode, mtime=tarinfo.mtime, user=user, group=group, - use_fsync=self.use_fsync, + use_fsync=self.use_fsync_for_x, hostname=target_host) finally: inf.close() From b7f959fb34376dc1c8a6e2f9a58bdc0b5978bd8e Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 5 Oct 2024 15:51:40 +0900 Subject: [PATCH 135/143] gfptar --extract: fix freezing when a write error occurs --- gftool/gfptar/gfptar | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 1d7108d9c..c6a194e3c 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -767,15 +767,22 @@ def convert_message(error): if hasattr(error, 'filename') and hasattr(error, 'strerror'): if error.filename is not None and error.strerror is not None: return f"{error.filename}: {error.strerror}" - if isinstance(error, GfException): # custom errors - return str(error) - else: + + def to_str(error): + if isinstance(error, GfException): # custom errors + return f"gfptar error: {str(error)}" + else: + return f"{error.__class__.__name__}: {str(error)}" + + message = to_str(error) + + if error.__context__: # "During handling of the above exception, another exception occurred" - if error.__context__: - error = error.__context__ - # if error.__cause__: - # error = error.__cause__ - return f"{error.__class__.__name__}: {str(error)}" + message += " (Possible cause: " + to_str(error.__context__) + ")" + if error.__cause__: + # "The above exception was the direct cause of the following exception" + message += " (Direct cause: " + to_str(error.__cause__) + ")" + return message class Program(metaclass=abc.ABCMeta): @@ -2280,8 +2287,12 @@ class GfTarFile(tarfile.TarFile): if use_fsync and sync_obj: sync_obj.flush() os.fsync(sync_obj.fileno()) + for proc_tuple in proc_list: + proc, close_obj, sync_obj = proc_tuple if close_obj: close_obj.close() + for proc_tuple in proc_list: + proc, close_obj, sync_obj = proc_tuple if proc is not None: logger.debug('close external process for tar: %s', str(proc.args)) @@ -2289,6 +2300,8 @@ class GfTarFile(tarfile.TarFile): if ret != 0: raise GfException('{}: returncode={}'.format( ' '.join(proc.args), ret)) + for proc_tuple in proc_list: + proc, close_obj, sync_obj = proc_tuple if close_obj: post_func = getattr(close_obj, 'post', None) if post_func: @@ -4192,9 +4205,9 @@ class GfptarProgram(Program): def create_a_tar_thread(self, gen, serial, dbfile): try: self.create_a_tar_thread0(gen, serial, dbfile) - except Exception: + except Exception as e: if self.is_canceled(): - raise self.error_canceled() + raise self.error_canceled() from e else: raise @@ -4881,9 +4894,10 @@ class GfptarProgram(Program): def 
extract_from_a_tar(self, index, target, member_set): try: self.extract_from_a_tar0(index, target, member_set) - except Exception: + except Exception as e: if self.is_canceled(): - raise self.error_canceled() + self.cancel() + raise self.error_canceled() from e else: raise From dd86b13c0a78a452efff05f9427617d9a6b0f195 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Sat, 5 Oct 2024 16:33:15 +0900 Subject: [PATCH 136/143] gfptar: show Progress even if --debug or --verbose are used --- gftool/gfptar/gfptar | 106 ++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 42 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index c6a194e3c..e0d558744 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -2729,7 +2729,7 @@ class GfptarProgram(Program): return _schema def _progress_enabled(self): - return not self.debug and not self.verbose and not self.quiet + return not self.quiet def run(self): try: @@ -3728,8 +3728,8 @@ class GfptarProgram(Program): self.compress_type = dottype[1] if self.compress_type == 'tar': self.compress_type = Compress.TYPE_NO - if self.progress_enabled: - logger.info('') + if self.verbose: + self.progress_newline() logger.info(f"recognized type is {self.compress_type}") Compress.init(self.compress_type, self.compress_prog, self.gzip_prog, self.bzip2_prog, self.xz_prog) @@ -3957,8 +3957,7 @@ class GfptarProgram(Program): break # from loop of infiles_checked except Exception as e2: cannot_read_dir += 1 - if self.progress_enabled: - logger.warning('') + self.progress_newline() logger.warning('%s: error while reading directory (%s)', gfurl.url_str, convert_message(e2)) self.print_trace(e2) @@ -3989,7 +3988,7 @@ class GfptarProgram(Program): if self.progress_enabled: self.progress_for_create(time.time()) - sys.stdout.write('\n') + self.progress_newline() if self.cannot_be_archived > 0 or cannot_read_dir > 0: e = CannotBeArchivedError(self.cannot_be_archived, cannot_read_dir) logger.warning(str(e)) @@ -4725,7 +4724,7 @@ class GfptarProgram(Program): self.progress_for_schedule(now) if self.progress_enabled: self.progress_for_schedule(time.time()) - sys.stdout.write('\n') + self.progress_newline() if self.search_target: # update for progress self.total_num = self.selected_num @@ -4766,7 +4765,7 @@ class GfptarProgram(Program): self.extract_from_archives(tarlist, member_set) if self.progress_enabled: self.progress_for_extract(time.time()) - sys.stdout.write('\n') + self.progress_newline() self.update_stat_for_directories(directory_set) self.db.close() @@ -5040,6 +5039,26 @@ class GfptarProgram(Program): def _humanize_raw(self, n): return int(n) + def progress_print(self, msg): + if self.debug: + logger.debug('Progress: ' + msg) + elif self.verbose: + self.info('Progress: ' + msg) + else: + sys.stdout.write("\r") + sys.stdout.write(msg) + sys.stdout.write(" ") + + def progress_newline(self): + if self.debug: + pass + elif self.verbose: + pass + elif self.quiet: + pass + else: + sys.stdout.write("\n") + def progress_for_schedule(self, now): sec = now - self.start_time sec_str = format_seconds(sec, minhour=True) @@ -5054,17 +5073,18 @@ class GfptarProgram(Program): if self.selected_num > 0: selected_num_str = self._humanize(self.selected_num) selected_size_str = self._humanize(self.selected_size) - sys.stdout.write(f"\r{name}: " - f"{selected_size_str}/{total_size_str}B " - f"{selected_num_str}/{total_num_str}Ent " - f"{sec_str} " - f"{ent_per_sec_str}Ent/s") + msg = (f"{name}: " + f"{selected_size_str}/{total_size_str}B " + 
f"{selected_num_str}/{total_num_str}Ent " + f"{sec_str} " + f"{ent_per_sec_str}Ent/s") else: - sys.stdout.write(f"\r{name}: " - f"{total_size_str}B " - f"{total_num_str}Ent " - f"{sec_str} " - f"{ent_per_sec_str}Ent/s") + msg = (f"{name}: " + f"{total_size_str}B " + f"{total_num_str}Ent " + f"{sec_str} " + f"{ent_per_sec_str}Ent/s") + self.progress_print(msg) def progress_for_infodb(self, now): sec = now - self.start_time @@ -5086,17 +5106,18 @@ class GfptarProgram(Program): ent_per_sec_str = self._humanize(ent_per_sec) if self.gen_db: name = 'generate-db' - sys.stdout.write(f"\r{name}: " - f"{current_tar_num_str}/{total_tar_num_str}(DB) " - f"{current_size_str}B " - f"{current_ent_num_str}Ent " - f"{sec_str} " - f"{ent_per_sec_str}Ent/s") + msg = (f"{name}: " + f"{current_tar_num_str}/{total_tar_num_str}(DB) " + f"{current_size_str}B " + f"{current_ent_num_str}Ent " + f"{sec_str} " + f"{ent_per_sec_str}Ent/s") else: name = 'load-db' - sys.stdout.write(f"\r{name}: " - f"{current_tar_num_str}/{total_tar_num_str}(DB) " - f"{sec_str} ") + msg = (f"{name}: " + f"{current_tar_num_str}/{total_tar_num_str}(DB) " + f"{sec_str} ") + self.progress_print(msg) # lock required def progress_for_create(self, now): @@ -5129,12 +5150,13 @@ class GfptarProgram(Program): # total_size_str = self._humanize(self.total_size) bytes_per_sec_str = self._humanize(bytes_per_sec) ent_per_sec_str = self._humanize(ent_per_sec) - sys.stdout.write(f"\r{self.cmd_name}: {percent_str}% " - f"{stored_size_str}/{sel_size_str}B " - f"{stored_num_str}/{sel_num_str}/{total_num_str}Ent " - f"{sec_str} " - f"{bytes_per_sec_str}B/s " - f"{ent_per_sec_str}Ent/s") + msg = (f"{self.cmd_name}: {percent_str}% " + f"{stored_size_str}/{sel_size_str}B " + f"{stored_num_str}/{sel_num_str}/{total_num_str}Ent " + f"{sec_str} " + f"{bytes_per_sec_str}B/s " + f"{ent_per_sec_str}Ent/s") + self.progress_print(msg) # lock required def progress_for_extract(self, now): @@ -5163,12 +5185,13 @@ class GfptarProgram(Program): bytes_per_sec_str = self._humanize(bytes_per_sec) ent_per_sec_str = self._humanize(ent_per_sec) name = "extract" - sys.stdout.write(f"\r{name}: {percent_str}% " - f"{extracted_size_str}/{total_size_str}B " - f"{extracted_num_str}/{total_num_str}Ent " - f"{sec_str} " - f"{bytes_per_sec_str}B/s " - f"{ent_per_sec_str}Ent/s") + msg = (f"{name}: {percent_str}% " + f"{extracted_size_str}/{total_size_str}B " + f"{extracted_num_str}/{total_num_str}Ent " + f"{sec_str} " + f"{bytes_per_sec_str}B/s " + f"{ent_per_sec_str}Ent/s") + self.progress_print(msg) def cmd_list_simple(self, indir): return self.cmd_list(indir, verbose=False) @@ -5281,8 +5304,7 @@ class GfptarProgram(Program): dbgz_list.append((int(serial), int(gen), path)) continue if ent.path.endswith(self.LIST_SUFFIX): # ignore *.lst - if self.progress_enabled: - logger.warning('') + self.progress_newline() logger.warning(f"NOTICE: {ent.path}:" " This file (*.lst) is no longer needed." 
" Please remove it.") @@ -5458,7 +5480,7 @@ class GfptarProgram(Program): if progress: now = time.time() self.progress_for_infodb(now) - sys.stdout.write('\n') + self.progress_newline() if self.save_e: raise self.save_e if cancel: From 355bc5552b1deeb18d67c6beec0527be5e96e2c0 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 8 Oct 2024 07:35:20 +0900 Subject: [PATCH 137/143] gfptar --extract: progress for chmod --- gftool/gfptar/gfptar | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index e0d558744..69b7c0634 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4834,6 +4834,11 @@ class GfptarProgram(Program): def update_stat_for_directories(self, directory_set): logger.debug('update_stat_for_directories') self.clear_canceled() + + self.start_time = time.time() + self.next_time = self.start_time + self.progress_interval + total = len(directory_set) + i = 0 # DESC: process from leaves for d in directory_set.iterator(sort='DESC'): if self.is_canceled(): @@ -4851,6 +4856,15 @@ class GfptarProgram(Program): else: dir_url.chmod(tarinfo.mode, mtime=tarinfo.mtime) self.info('update_stat: {}, mode={}', d, oct(tarinfo.mode)) + i += 1 + if self.progress_enabled: + now = time.time() + if now >= self.next_time: + self.next_time = now + self.progress_interval + self.progress_for_chmod(now, i, total) + if self.progress_enabled: + self.progress_for_chmod(now, i, total) + self.progress_newline() def extract_from_archives(self, tarlist, member_set): self.lock_init(False) @@ -5193,6 +5207,27 @@ class GfptarProgram(Program): f"{ent_per_sec_str}Ent/s") self.progress_print(msg) + def progress_for_chmod(self, now, current, total): + sec = now - self.start_time + sec_str = format_seconds(sec, minhour=True) + if total > 0: + percent = current * 100 / total + else: + percent = 0 + percent_str = f"{percent:.0f}" + if sec > 0: + ent_per_sec = current / sec + else: + ent_per_sec = 0 + current_str = self._humanize(current) + total_str = self._humanize(total) + ent_per_sec_str = self._humanize(ent_per_sec) + msg = (f"chmod: {percent_str}% " + f"{current_str}/{total_str}Ent " + f"{sec_str} " + f"{ent_per_sec_str}Ent/s") + self.progress_print(msg) + def cmd_list_simple(self, indir): return self.cmd_list(indir, verbose=False) From 62a105c43b0ea43112d1d50625aa3a79f4cafa36 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 8 Oct 2024 08:59:30 +0900 Subject: [PATCH 138/143] gfptar: improve performance of makedirs() on local fs --- gftool/gfptar/gfptar | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 69b7c0634..bda888881 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -1873,6 +1873,11 @@ class GfURLLocal(GfURL): def mkdir(self, mode=0o700, parents=False): if parents: + try: + os.mkdir(self.url_str, mode) + return # OK + except FileExistsError: + return # OK # Reference: https://docs.python.org/3/library/os.html#os.makedirs # Changed in version 3.7: The mode argument no longer # affects the file permission bits of newly created From daac11a4cb40872c81c6bbb575813278d2e61441 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 8 Oct 2024 09:01:38 +0900 Subject: [PATCH 139/143] gfptar: sequel to "improve performance of makedirs() on local fs" --- gftool/gfptar/gfptar | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index bda888881..df0928f36 100755 --- a/gftool/gfptar/gfptar +++ 
b/gftool/gfptar/gfptar @@ -1878,6 +1878,8 @@ class GfURLLocal(GfURL): return # OK except FileExistsError: return # OK + except Exception: + pass # Reference: https://docs.python.org/3/library/os.html#os.makedirs # Changed in version 3.7: The mode argument no longer # affects the file permission bits of newly created From c176dd14f2470b4692587a8172772fad5067b2ad Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 8 Oct 2024 09:59:23 +0900 Subject: [PATCH 140/143] gfptar --extract: improve performance --- gftool/gfptar/gfptar | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index df0928f36..967e7c0f3 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -4939,6 +4939,7 @@ class GfptarProgram(Program): def extract_from_a_tar1(self, index, target, member_set, tar): with self.lock(): members_num = len(member_set) + is_local = self.outdir_url.is_local() while True: if self.is_canceled(): logger.debug('Canceled (extract 2): name=%s', target) @@ -4970,22 +4971,31 @@ class GfptarProgram(Program): outurl_str = self.outdir_url.url_join(outfile) outurl = GfURL.init(outurl_str, dry_run=self.dry_run) - # create directories if not exist - if tarinfo.isdir(): - dir_url = outurl - dir_path = outurl.path - else: - dir_url = outurl.parent - dir_path = dir_url.path - with self.lock(): - dir_exists = dir_path in self.created_directory_set - if not dir_exists: - if not dir_url.exists(): - with ignore_exception(True): # may be race condition + if is_local: + if tarinfo.isdir(): + dir_url = outurl + else: + dir_url = outurl.parent + with ignore_exception(True): # may be race condition + if not dir_url.exists(): dir_url.makedirs() # default 0700 - self.info('created(D): {}', dir_url.url_str) + else: # speed up directory creation for Gfarm using SQLite3 + # create directories if not exist + if tarinfo.isdir(): + dir_url = outurl + dir_path = outurl.path + else: + dir_url = outurl.parent + dir_path = dir_url.path with self.lock(): - self.created_directory_set.add(dir_path) + dir_exists = dir_path in self.created_directory_set + if not dir_exists: + if not dir_url.exists(): + with ignore_exception(True): # may be race condition + dir_url.makedirs() # default 0700 + self.info('created(D): {}', dir_url.url_str) + with self.lock(): + self.created_directory_set.add(dir_path) if tarinfo.isfile(): target_host = self.select_a_target_host(outurl, index) From efc60e0b2158e0cfff21ef621ea887685c72288a Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 8 Oct 2024 11:51:36 +0900 Subject: [PATCH 141/143] gfptar --update: change --max-entries-for-tempdb-aggregation (0 means unlimited) --- gftool/gfptar/gfptar | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 967e7c0f3..4a18be459 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -3779,7 +3779,7 @@ class GfptarProgram(Program): if self.enable_tempdb_aggregation: MAX_ENTRIES = self.max_tempdb_aggregation else: - MAX_ENTRIES = 0 + MAX_ENTRIES = -1 for serial, gen, tar_path, dbgz_path, db_path in \ self.list_tar_dbgz_db(self.outdir_url, sort=True, @@ -3794,8 +3794,9 @@ class GfptarProgram(Program): broken_count += 1 continue suffix = self.suffix_check(suffix, tar_path) - if MAX_ENTRIES > 0: - if fattr_dict is None or num_entries >= MAX_ENTRIES: + if MAX_ENTRIES >= 0: + if fattr_dict is None or \ + (MAX_ENTRIES != 0 and num_entries >= MAX_ENTRIES): tmpdb_path = 
os.path.join(self.tmpdir.name, f"mtime-{mtime_db_idx}.db") db = DB(tmpdb_path) @@ -5916,12 +5917,13 @@ Options: tar files in archive directory exist, the execution with this option will fail. --max-entries-for-tempdb-aggregation=NUM + The maximum number of entries per working DB file. This option only effective with --update option. - A larger value is generally recommended - for this option. However, if the value is - excessively large and there are too many entries - in archive directory, performance may degrade. - [default: 100M] + Normally, you don't need to change this value. + If a smaller value is specified, performance + may degrade. + (0: unlimited (using one file as working DB)) + [default: 0] --progress-unit=TYPE Unit type for progress si : SI prefix bin: Binary prefix From e1d30147a99fa8f21140a1cef131d248e97f607b Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Tue, 8 Oct 2024 18:21:58 +0900 Subject: [PATCH 142/143] gfptar: new environment variables: GFPTAR_ORJSON, GFPTAR_UJSON --- gftool/gfptar/gfptar | 51 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 4a18be459..10fc1f729 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -12,6 +12,10 @@ # (required: python3-pip) # pip3 install --user docopt schema +# Envrionment variables: +# GFPTAR_ORJSON=True : Use orjson instead of json +# GFPTAR_UJSON=True : Use ujson instead of json + # Coding style check: # flake8 ./gfptar @@ -42,7 +46,6 @@ import string import random from decimal import Decimal, ROUND_DOWN import sqlite3 -import json import tempfile import multiprocessing import queue @@ -56,7 +59,39 @@ from docopt import docopt from schema import Schema, Use, Or -# library +def str2bool(s): + return s.upper() in ['TRUE', '1', 'ON', 'ENABLE', 'ENABLED'] + + +USE_ORJSON = str2bool(os.getenv('GFPTAR_ORJSON', 'False')) +USE_UJSON = str2bool(os.getenv('GFPTAR_UJSON', 'False')) + +try: + if USE_ORJSON: + import orjson as json + + def _dumps(obj): # compact + return json.dumps(obj).decode() + + json_dumps = _dumps + else: + raise ImportError() +except ImportError: + try: + if USE_UJSON: + import ujson as json + json_dumps = json.dumps # compact + else: + raise ImportError() + except ImportError: + import json + + def _dumps_compact(obj): + return json.dumps(obj, separators=(',', ':')) + + json_dumps = _dumps_compact + + def format_seconds(seconds, minhour=False): if minhour is False: return f"{seconds:.0f}s" @@ -176,7 +211,7 @@ class DBObj: class JsonObj(DBObj): @classmethod def dumps(cls, obj, for_dict): - return json.dumps(obj, separators=(',', ':')) + return json_dumps(obj) @classmethod def loads(cls, key, txt, for_dict): @@ -224,7 +259,7 @@ class FileAttr1(DBObj): @classmethod def dumps(cls, obj, for_dict): array = [obj.mode, obj.mtime, obj.user, obj.group] - return json.dumps(array, separators=(',', ':')) + return json_dumps(array) @classmethod def loads(cls, key, txt, for_dict): @@ -246,7 +281,7 @@ class FileAttr2(DBObj): def dumps(cls, obj, for_dict): array = [obj.mode, obj.mtime, obj.user, obj.group, obj.size, obj.linkname, obj.ftype] - return json.dumps(array, separators=(',', ':')) + return json_dumps(array) @classmethod def loads(cls, key, txt, for_dict): @@ -861,7 +896,7 @@ class GfURLEntry(DBObj): # save path to key when using dict, so don't save path to value if not for_dict: array.append(obj.path) # [7] - return json.dumps(array, separators=(',', ':')) + return json_dumps(array) @classmethod def 
loads(cls, key, txt, for_dict): @@ -1392,10 +1427,6 @@ class GfURL(metaclass=abc.ABCMeta): return True -def str2bool(s): - return s.upper() in ['TRUE', '1', 'ON', 'ENABLE', 'ENABLED'] - - USE_GFMKDIR_PLUS = str2bool(os.getenv('GFMKDIR_PLUS', 'True')) USE_GFCHMOD_PLUS = str2bool(os.getenv('GFCHMOD_PLUS', 'True')) USE_GFREG_PLUS = str2bool(os.getenv('GFREG_PLUS', 'True')) From a818f677b7509d56f7f0c77027870895cae45451 Mon Sep 17 00:00:00 2001 From: Takuya Ishibashi Date: Thu, 31 Oct 2024 22:23:20 +0900 Subject: [PATCH 143/143] gfptar: use orjson if it is installed --- INSTALL.en | 4 +++ INSTALL.ja | 4 +++ gftool/gfptar/gfptar | 70 ++++++++++++++++++++++++-------------------- 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/INSTALL.en b/INSTALL.en index 44d9420d2..a9ae08a56 100644 --- a/INSTALL.en +++ b/INSTALL.en @@ -134,4 +134,8 @@ postgresql, postgresql-client (required to execute config-gfarm) libfuse-dev (required to build gfarm2fs) python3, python3-docopt, python3-schema (required to execute gfptar) +* pip (Python Package Installer) Package + +orjson (If it is installed, use it to accelerate gfptar) + $Id$ diff --git a/INSTALL.ja b/INSTALL.ja index 27fd95c29..75896b66c 100644 --- a/INSTALL.ja +++ b/INSTALL.ja @@ -141,4 +141,8 @@ postgresql, postgresql-client (config-gfarm の実行に必要) libfuse-dev (gfarm2fs の構築に必要) python3, python3-docopt, python3-schema (gfptar の実行に必要) +* pip (Python Package Installer) パッケージ + +orjson (インストールされていれば gfptar を高速化) + $Id$ diff --git a/gftool/gfptar/gfptar b/gftool/gfptar/gfptar index 10fc1f729..f41f84fbb 100755 --- a/gftool/gfptar/gfptar +++ b/gftool/gfptar/gfptar @@ -11,10 +11,9 @@ # --- Or, install to ~/.local just for user's environment --- # (required: python3-pip) # pip3 install --user docopt schema - -# Envrionment variables: -# GFPTAR_ORJSON=True : Use orjson instead of json -# GFPTAR_UJSON=True : Use ujson instead of json +# +# --- To use orjson --- +# pip3 install --user orjson # Coding style check: # flake8 ./gfptar @@ -63,33 +62,31 @@ def str2bool(s): return s.upper() in ['TRUE', '1', 'ON', 'ENABLE', 'ENABLED'] -USE_ORJSON = str2bool(os.getenv('GFPTAR_ORJSON', 'False')) -USE_UJSON = str2bool(os.getenv('GFPTAR_UJSON', 'False')) +USE_ORJSON = str2bool(os.getenv('GFPTAR_USE_ORJSON', 'True')) + +json_dumps = None +json_loads = None +json_type = None try: if USE_ORJSON: - import orjson as json + import orjson - def _dumps(obj): # compact - return json.dumps(obj).decode() - - json_dumps = _dumps - else: - raise ImportError() + json_dumps = orjson.dumps # byte array + json_loads = orjson.loads + json_type = "orjson" except ImportError: - try: - if USE_UJSON: - import ujson as json - json_dumps = json.dumps # compact - else: - raise ImportError() - except ImportError: - import json + pass + +if json_type is None: + import json - def _dumps_compact(obj): - return json.dumps(obj, separators=(',', ':')) + def _dumps_compact(obj): + return json.dumps(obj, separators=(',', ':')).encode() - json_dumps = _dumps_compact + json_dumps = _dumps_compact + json_loads = json.loads + json_type = "json" def format_seconds(seconds, minhour=False): @@ -215,7 +212,7 @@ class JsonObj(DBObj): @classmethod def loads(cls, key, txt, for_dict): - return json.loads(txt) + return json_loads(txt) # Example of DBObj @@ -263,7 +260,7 @@ class FileAttr1(DBObj): @classmethod def loads(cls, key, txt, for_dict): - array = json.loads(txt) + array = json_loads(txt) return cls(array[0], array[1], array[2], array[3]) @@ -285,7 +282,7 @@ class FileAttr2(DBObj): @classmethod def 
loads(cls, key, txt, for_dict): - array = json.loads(txt) + array = json_loads(txt) return cls(array[0], array[1], array[2], array[3], array[4], array[5], array[6]) @@ -316,6 +313,7 @@ class DBCollection: self.create_table() def obj2txt(self, obj): + # byte array return self.obj_cls.dumps(obj, self.for_dict) def txt2obj(self, key, txt): @@ -351,7 +349,7 @@ class DBDict(DBCollection): def create_table(self): self.con.execute(f""" CREATE TABLE IF NOT EXISTS {self.table_name} - (key TEXT PRIMARY KEY, value TEXT) + (key TEXT PRIMARY KEY, value BLOB) """) self.for_dict = True @@ -504,7 +502,7 @@ class DBList(DBCollection): def create_table(self): self.con.execute(f""" CREATE TABLE IF NOT EXISTS {self.table_name} - (id INTEGER PRIMARY KEY AUTOINCREMENT, value TEXT) + (id INTEGER PRIMARY KEY AUTOINCREMENT, value BLOB) """) self.for_dict = False @@ -900,7 +898,7 @@ class GfURLEntry(DBObj): @classmethod def loads(cls, key, txt, for_dict): - o = json.loads(txt) + o = json_loads(txt) if for_dict: path = key else: @@ -2632,6 +2630,8 @@ class GfptarProgram(Program): logger.debug('USE_GFCHMOD_PLUS: %s', USE_GFCHMOD_PLUS) logger.debug('USE_GFREG_PLUS: %s', USE_GFREG_PLUS) + logger.debug(f'JSON library: {json_type}') + set_encoding(self.opt['--encoding']) self.jobs = self.opt['--jobs'] # SEE ALSO: MT_enabled() @@ -4902,6 +4902,7 @@ class GfptarProgram(Program): self.next_time = now + self.progress_interval self.progress_for_chmod(now, i, total) if self.progress_enabled: + now = time.time() self.progress_for_chmod(now, i, total) self.progress_newline() @@ -5851,7 +5852,7 @@ Example of --extract (Gfarm to Local): Contents of gMM_NN_gfptar.db.gz file (SQLite3 and gzip): MM: the generation number for each append operation NN: the serial number - table 'path_entry': map of path name to JSON string + table 'path_entry': map of path name to JSON string (BLOB) json.dumps([ file_mode (int), mtime (int), user_unique_id (int), group_unique_id (int), size (int), symlink_path, file_type (D,F,S) ] @@ -5877,6 +5878,13 @@ Limitations: - File names cannot include newline characters. - Subsecond (less than a second) for mtime is not preserved. +Envrionment variables: + - GFPTAR_USE_ORJSON: Use orjson instead of json. + (1 ... enable, 0 ... disable) (default: 1) + - GFMKDIR_PLUS + - GFCHMOD_PLUS + - GFREG_PLUS + Options: -c, --create=OUTDIR Create tar files in OUTDIR from MEMBERs. -r, --append=OUTDIR Append files to new tar files.
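The JSON changes in this final patch come down to a codec choice made once at import time: prefer orjson when it is installed and GFPTAR_USE_ORJSON is not disabled, otherwise fall back to the standard json module with compact separators, and normalize both codecs to bytes output so the serialized records can be stored in the BLOB columns. Below is a minimal standalone sketch of that selection pattern under the same assumptions; the function name pick_json and the sample record values are illustrative only.

    import os

    def pick_json():
        # Prefer orjson (dumps() already returns bytes) when installed and
        # enabled through GFPTAR_USE_ORJSON; otherwise fall back to the
        # stdlib json module, encoding compact output to bytes so callers
        # never care which codec is active.
        enabled = os.getenv('GFPTAR_USE_ORJSON', 'True').upper() in (
            'TRUE', '1', 'ON', 'ENABLE', 'ENABLED')
        if enabled:
            try:
                import orjson
                return orjson.dumps, orjson.loads, 'orjson'
            except ImportError:
                pass
        import json

        def dumps_compact(obj):
            return json.dumps(obj, separators=(',', ':')).encode()

        return dumps_compact, json.loads, 'json'

    if __name__ == '__main__':
        dumps, loads, name = pick_json()
        # A FileAttr2-style record: mode, mtime, user id, group id, size,
        # symlink target, file type; the values here are made up.
        blob = dumps([0o644, 1700000000, 1, 1, 4096, '', 'F'])
        print(name, blob, loads(blob))
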