From 257c93169709039ef4ca54e8359aace31c62b0c8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 3 Aug 2021 10:38:26 -0500 Subject: [PATCH] Remove public API of indexing module. (#580) * Remove public API for indexing module. * Remove index command from signac.__main__. * Remove tests for methods and classes that have been removed. * Remove index from CLI help message. * Make indexing features adhere to pydocstyle. * Fix typo. * Remove index shell test. --- doc/api.rst | 2 +- doc/cli-help.txt | 4 +- setup.cfg | 4 +- signac/__init__.py | 23 - signac/__main__.py | 26 +- signac/contrib/__init__.py | 32 -- signac/contrib/indexing.py | 929 ++++++------------------------------- signac/contrib/project.py | 4 +- tests/test_indexing.py | 386 ++------------- tests/test_project.py | 4 +- tests/test_shell.py | 24 - 11 files changed, 196 insertions(+), 1242 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 6bfbfe40b..3a8679ca2 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -162,7 +162,7 @@ Top-level functions .. automodule:: signac :members: :show-inheritance: - :exclude-members: Project,Collection,RegexFileCrawler,MainCrawler,MasterCrawler,SignacProjectCrawler,JSONDict,H5Store,H5StoreManager + :exclude-members: Project,Collection,JSONDict,H5Store,H5StoreManager Submodules diff --git a/doc/cli-help.txt b/doc/cli-help.txt index fb2f44f08..032d3f801 100644 --- a/doc/cli-help.txt +++ b/doc/cli-help.txt @@ -1,12 +1,12 @@ usage: signac [-h] [--debug] [--version] [-v] [-y] - {init,project,job,statepoint,diff,document,rm,move,clone,index,find,view,schema,shell,sync,import,export,update-cache,config} + {init,project,job,statepoint,diff,document,rm,move,clone,find,view,schema,shell,sync,import,export,update-cache,config} ... signac aids in the management, access and analysis of large-scale computational investigations. positional arguments: - {init,project,job,statepoint,diff,document,rm,move,clone,index,find,view,schema,shell,sync,import,export,update-cache,config} + {init,project,job,statepoint,diff,document,rm,move,clone,find,view,schema,shell,sync,import,export,update-cache,config} optional arguments: -h, --help show this help message and exit diff --git a/setup.cfg b/setup.cfg index 346fb2592..4d9b59907 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ select = E,F,W ignore = E123,E126,E203,E226,E241,E704,W503,W504 [pydocstyle] -match = ^((?!\.sync-zenodo-metadata|setup|benchmark|indexing).)*\.py$ +match = ^((?!\.sync-zenodo-metadata|setup|benchmark).)*\.py$ match-dir = ^((?!\.|tests|configobj).)*$ ignore-decorators = "deprecated" add-ignore = D105, D107, D203, D204, D213 @@ -35,7 +35,7 @@ omit = [tool:pytest] filterwarnings = - ignore: .*[The indexing module | get_statepoint | Use of.+as key] is deprecated.*: DeprecationWarning + ignore: .*[get_statepoint | Use of.+as key] is deprecated.*: DeprecationWarning [bumpversion:file:setup.py] diff --git a/signac/__init__.py b/signac/__init__.py index 76c2ea489..5d475ec49 100644 --- a/signac/__init__.py +++ b/signac/__init__.py @@ -12,21 +12,10 @@ from . 
import cite, contrib, errors, sync, testing, warnings from .contrib import ( Collection, - MainCrawler, - MasterCrawler, Project, - RegexFileCrawler, - SignacProjectCrawler, TemporaryProject, - export, - export_one, - export_pymongo, - export_to_mirror, - fetch, get_job, get_project, - index, - index_files, init_project, ) from .core.h5store import H5Store, H5StoreManager @@ -56,19 +45,7 @@ "init_project", "get_job", "diff_jobs", - "fetch", - "export_one", - "export", - "export_to_mirror", "Collection", - "export_pymongo", - "fs", - "index_files", - "index", - "RegexFileCrawler", - "MainCrawler", - "MasterCrawler", - "SignacProjectCrawler", "buffered", "is_buffered", "get_buffer_size", diff --git a/signac/__main__.py b/signac/__main__.py index 8a293d6c8..de80685b0 100644 --- a/signac/__main__.py +++ b/signac/__main__.py @@ -29,7 +29,7 @@ else: READLINE = True -from . import Project, get_project, index, init_project +from . import Project, get_project, init_project from .common import config from .common.configobj import Section, flatten_errors from .contrib.filterparse import parse_filter_arg @@ -277,16 +277,6 @@ def main_clone(args): _print_err(f"Cloned '{job}' to '{dst_project}'.") -def main_index(args): - """Handle index subcommand.""" - _print_err(f"Compiling main index for path '{os.path.realpath(args.root)}'...") - if args.tags: - args.tags = set(args.tags) - _print_err("Provided tags: {}".format(", ".join(sorted(args.tags)))) - for doc in index(root=args.root, tags=args.tags, raise_on_error=args.debug): - print(json.dumps(doc)) - - def main_find(args): """Handle find subcommand.""" project = get_project() @@ -1197,18 +1187,6 @@ def main(): ) parser_clone.set_defaults(func=main_clone) - parser_index = subparsers.add_parser("index") - parser_index.add_argument( - "root", - nargs="?", - default=".", - help="Specify the root path from where the main index is to be compiled.", - ) - parser_index.add_argument( - "-t", "--tags", nargs="+", help="Specify tags for this main index compilation." - ) - parser_index.set_defaults(func=main_index) - parser_find = subparsers.add_parser( "find", description="""All filter arguments may be provided either directly in JSON @@ -1486,7 +1464,7 @@ def main(): "--ignore-times", action="store_true", dest="deep", - help="Never rely on file meta data such as the size or the modification time " + help="Never rely on file metadata such as the size or the modification time " "when determining file differences.", ) sync_group.add_argument( diff --git a/signac/contrib/__init__.py b/signac/contrib/__init__.py index 51fa69295..487b7e32e 100644 --- a/signac/contrib/__init__.py +++ b/signac/contrib/__init__.py @@ -5,49 +5,17 @@ import logging -from . 
import indexing from .collection import Collection -from .indexing import ( - BaseCrawler, - JSONCrawler, - MainCrawler, - MasterCrawler, - RegexFileCrawler, - SignacProjectCrawler, - export, - export_one, - export_pymongo, - export_to_mirror, - fetch, - fetched, - index, - index_files, -) from .project import Project, TemporaryProject, get_job, get_project, init_project logger = logging.getLogger(__name__) __all__ = [ - "indexing", "Project", "TemporaryProject", "get_project", "init_project", "get_job", - "BaseCrawler", - "RegexFileCrawler", - "JSONCrawler", - "SignacProjectCrawler", - "MainCrawler", - "MasterCrawler", - "fetch", - "fetched", - "export_one", - "export", - "export_to_mirror", - "export_pymongo", - "index_files", - "index", "Collection", ] diff --git a/signac/contrib/indexing.py b/signac/contrib/indexing.py index 2c0fced6d..ee93aa9b7 100644 --- a/signac/contrib/indexing.py +++ b/signac/contrib/indexing.py @@ -1,22 +1,18 @@ # Copyright (c) 2017 The Regents of the University of Michigan # All rights reserved. # This software is licensed under the BSD 3-Clause License. +"""Indexing features.""" + import errno import hashlib -import importlib.machinery import json import logging import math import os import re import warnings -from collections import defaultdict -from time import sleep - -from deprecation import deprecated from ..common import errors -from ..version import __version__ from .hashing import calc_id from .utility import walkdepth @@ -27,70 +23,56 @@ KEY_PATH = "root" KEY_PAYLOAD = "format" -""" -THIS MODULE IS DEPRECATED! -""" - -@deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", -) -def md5(file): - "Calculate and return the md5 hash value for the file data." +def _compute_file_md5(file): + """Calculate and return the md5 hash value for the file data.""" m = hashlib.md5() for chunk in iter(lambda: file.read(4096), b""): m.update(chunk) return m.hexdigest() -def _is_blank_module(module): - with open(module.__file__) as file: - return not bool(file.read().strip()) - - -# this class is deprecated -class BaseCrawler: - """Crawl through `root` and index all files. +class _BaseCrawler: + """Crawl through ``root`` and index all files. - The crawler creates an index on data, which can be exported - to a database for easier access.""" + The crawler creates an index on data, which can be exported to a database + for easier access. + """ tags = None - @deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", - ) def __init__(self, root): - """Initialize a BaseCrawler instance. + """Initialize a _BaseCrawler instance. + + Parameters + ---------- + root : str + The path to the root directory to crawl through. - :param root: The path to the root directory to crawl through. - :type root: str""" + """ self.root = os.path.expanduser(root) self.tags = set() if self.tags is None else set(self.tags) def docs_from_file(self, dirpath, fn): """Implement this method to generate documents from files. - :param dirpath: The path of the file, relative to `root`. - :type dirpath: str - :param fn: The filename. - :type fn: str - :yields: Index documents. + Parameters + ---------- + dirpath : str + The path of the file, relative to ``root``. + fn : str + The filename. + + Yields + ------ + dict + Index documents. 
+        """
         raise NotImplementedError()
-        yield
 
     def fetch(self, doc, mode="r"):
-        """Implement this generator method to associate data with a document.
-
-        :returns: object associated with doc
-        """
+        """Implement this generator method to associate data with a document."""
         raise errors.FetchError(f"Unable to fetch object for '{doc}'.")
 
     @classmethod
@@ -103,16 +85,23 @@ def _calculate_hash(cls, doc, dirpath, fn):
         return m.hexdigest()
 
     def crawl(self, depth=0):
-        """Crawl through the `root` directory.
+        """Crawl through the ``root`` directory.
 
         The crawler will inspect every file and directory up
-        until the specified `depth` and call the
+        until the specified ``depth`` and call the
         :meth:`docs_from_file` method.
 
-        :param depth: Crawl through the directory for the specified depth.
-            A value of 0 specifies no limit.
-        :type dept: int
-        :yields: (id, doc)-tuples"""
+        Parameters
+        ----------
+        depth : int
+            Maximum directory depth to crawl. A value of 0 specifies no limit.
+
+        Yields
+        ------
+        dict
+            Document.
+
+        """
         logger.info(f"Crawling '{self.root}' (depth={depth})...")
         for dirpath, dirnames, filenames in walkdepth(self.root, depth):
             for fn in filenames:
@@ -124,30 +113,36 @@ def crawl(self, depth=0):
         logger.info(f"Crawl of '{self.root}' done.")
 
     def process(self, doc, dirpath, fn):
-        """Implement this method for additional processing of generated docs.
+        """Implement this method for processing generated documents.
+
+        The default implementation will return the unmodified ``doc``.
 
-        The default implementation will return the unmodified `doc`.
+        Parameters
+        ----------
+        dirpath : str
+            The path of the file, relative to ``root``.
+        fn : str
+            The filename.
 
-        :param dirpath: The path of the file, relative to `root`.
-        :type dirpath: str
-        :param fn: The filename.
-        :type fn: str
-        :returns: A document, that means an instance of mapping.
-        :rtype: mapping"""
+        Returns
+        -------
+        dict
+            A document.
+
+        """
         return doc
 
 
-# this class is deprecated
-class RegexFileCrawler(BaseCrawler):
+class _RegexFileCrawler(_BaseCrawler):
     r"""Generate documents from filenames and associate each file with a data type.
 
-    The `RegexFileCrawler` uses regular expressions to generate
-    data from files. This is a particular easy method to retrieve meta data
+    The :py:class:`_RegexFileCrawler` uses regular expressions to generate
+    data from files. This is a particularly easy method to retrieve metadata
     associated with files. Inherit from this class to configure a crawler
-    for your data structre.
+    for your data structure.
 
     Let's assume we want to index text files, with a naming pattern, that
-    specifies a parameter `a` through the filename, e.g.:
+    specifies a parameter ``a`` through the filename, e.g.:
 
     .. code-block:: python
 
@@ -160,29 +155,27 @@ class RegexFileCrawler(BaseCrawler):
 
     .. code-block:: python
 
-        MyCrawler(RegexFileCrawler):
+        class MyCrawler(_RegexFileCrawler):
             pass
 
        MyCrawler.define('.*\/a_(?P<a>\d+)\.txt', 'TextFile')
+
    """
+
+    "Mapping of compiled regex objects and associated formats."
    definitions = {}  # type: ignore

    @classmethod
-    @deprecated(
-        deprecated_in="1.3",
-        removed_in="2.0",
-        current_version=__version__,
-        details="The indexing module is deprecated.",
-    )
    def define(cls, regex, format_=None):
        """Define a format for a particular regular expression.

-        :param regex: All files of the specified format
-            must match this regular expression.
-        :type regex: :class:`str`
-        :param format_: The format associated with all matching files.
-        :type format_: :class:`object`
+        Parameters
+        ----------
+        regex : str
+            A regular expression used to match files of the specified format.
+        format_ : object
+            The format associated with all matching files.
+
         """
         if isinstance(regex, str):
             regex = re.compile(regex)
@@ -191,43 +184,52 @@ def define(cls, regex, format_=None):
             cls.definitions = definitions
 
     @classmethod
-    @deprecated(
-        deprecated_in="1.3",
-        removed_in="2.0",
-        current_version=__version__,
-        details="The indexing module is deprecated.",
-    )
     def compute_file_id(cls, doc, file):
         """Compute the file id for a given doc and the associated file.
 
-        :param doc: The index document
-        :param file: The associated file
-        :returns: The file id.
+        The resulting id is assigned to ``doc["md5"]``.
+
+        Parameters
+        ----------
+        doc : dict
+            The index document.
+        file : file-like object
+            The associated file.
+
+        Returns
+        -------
+        str
+            The file id.
+
         """
-        file_id = doc["md5"] = md5(file)
+        file_id = doc["md5"] = _compute_file_md5(file)
         return file_id
 
-    @deprecated(
-        deprecated_in="1.3",
-        removed_in="2.0",
-        current_version=__version__,
-        details="The indexing module is deprecated.",
-    )
     def docs_from_file(self, dirpath, fn):
         """Generate documents from filenames.
 
         This method implements the abstract
-        :py:meth:~.BaseCrawler.docs_from_file` and yields index
+        :py:meth:`~._BaseCrawler.docs_from_file` and yields index
         documents associated with files.
 
-        .. note::
-            It is not recommended to reimplement this method to modify
-            documents generated from filenames.
-            See :meth:`~RegexFileCrawler.process` instead.
+        Notes
+        -----
+        It is not recommended to reimplement this method to modify
+        documents generated from filenames.
+        See :py:meth:`~._RegexFileCrawler.process` instead.
+
+        Parameters
+        ----------
+        dirpath : str
+            The path of the file relative to root.
+        fn : str
+            The filename of the file.
+
+        Yields
+        ------
+        dict
+            Index document.
 
-        :param dirpath: The path of the file relative to root.
-        :param fn: The filename of the file.
-        :yields: Index documents.
         """
         for regex, format_ in self.definitions.items():
             m = regex.match(os.path.join(dirpath, fn))
@@ -242,19 +244,21 @@ def docs_from_file(self, dirpath, fn):
                     doc["file_id"] = self.compute_file_id(doc, file)
                 yield doc
 
-    @deprecated(
-        deprecated_in="1.3",
-        removed_in="2.0",
-        current_version=__version__,
-        details="The indexing module is deprecated.",
-    )
     def fetch(self, doc, mode="r"):
-        """Fetch the data associated with `doc`.
+        """Fetch the data associated with ``doc``.
+
+        Parameters
+        ----------
+        doc : dict
+            An index document.
+        mode : str
+            Mode used to open file object.
+
+        Returns
+        -------
+        file-like object
+            The file associated with the index document.
 
-        :param doc: A index document.
-        :type doc: :class:`dict`
-        :returns: The file associated with the index document.
-        :rtype: A file-like object
         """
         fn = doc.get(KEY_FILENAME)
         if fn:
@@ -275,32 +279,32 @@ def fetch(self, doc, mode="r"):
                 f"Unable to match file path of doc '{doc}' to format definition."
             )
         else:
-            raise errors.FetchError(f"Insufficient meta data in doc '{doc}'.")
-
-    @deprecated(
-        deprecated_in="1.3",
-        removed_in="2.0",
-        current_version=__version__,
-        details="The indexing module is deprecated.",
-    )
+            raise errors.FetchError(f"Insufficient metadata in doc '{doc}'.")
+
     def process(self, doc, dirpath, fn):
         """Post-process documents generated from filenames.
 
-        Example:
-
+        Examples
+        --------
         .. code-block:: python
 
-            MyCrawler(signac.indexing.RegexFileCrawler):
+            class MyCrawler(signac.indexing._RegexFileCrawler):
                 def process(self, doc, dirpath, fn):
                     doc['long_name_for_a'] = doc['a']
                     return super(MyCrawler, self).process(doc, dirpath, fn)
 
-        :param dirpath: The path of the file, relative to `root`.
-        :type dirpath: str
-        :param fn: The filename.
-        :type fn: str
-        :returns: An index document, that means an instance of mapping.
-        :rtype: mapping
+        Parameters
+        ----------
+        dirpath : str
+            The path of the file, relative to ``root``.
+        fn : str
+            The filename.
+
+        Returns
+        -------
+        dict
+            An index document.
+
         """
         result = {}
         for key, value in doc.items():
@@ -319,12 +323,6 @@ def process(self, doc, dirpath, fn):
                 result[key] = float(value)
         return super().process(result, dirpath, fn)
 
-    @deprecated(
-        deprecated_in="1.3",
-        removed_in="2.0",
-        current_version=__version__,
-        details="The indexing module is deprecated.",
-    )
     def crawl(self, depth=0):
         if self.definitions:
             yield from super().crawl(depth=depth)
@@ -332,26 +330,6 @@ def crawl(self, depth=0):
         return
 
 
-@deprecated(
-    deprecated_in="1.3",
-    removed_in="2.0",
-    current_version=__version__,
-    details="The indexing module is deprecated.",
-)
-class JSONCrawler(BaseCrawler):
-    encoding = "utf-8"
-    fn_regex = r".*\.json"
-
-    def docs_from_json(self, doc):
-        yield doc
-
-    def docs_from_file(self, dirpath, fn):
-        if re.match(self.fn_regex, os.path.join(dirpath, fn)):
-            with open(os.path.join(dirpath, fn), "rb") as file:
-                doc = json.loads(file.read().decode(self.encoding))
-                yield from self.docs_from_json(doc)
-
-
 def _index_signac_project_workspace(
     root,
     include_job_document=True,
@@ -362,7 +340,7 @@ def _index_signac_project_workspace(
     encoding="utf-8",
     statepoint_dict=None,
 ):
-    "Yields standard index documents for a signac project workspace."
+    """Yield standard index documents for a signac project workspace."""
     logger.debug(f"Indexing workspace '{root}'...")
     m = re.compile(r"[a-f0-9]{32}")
     try:
@@ -400,18 +378,22 @@ def _index_signac_project_workspace(
     logger.debug(f"Indexed workspace '{root}', {i + 1} entries.")
 
 
-# this class is deprecated
-class SignacProjectCrawler(RegexFileCrawler):
+class _SignacProjectCrawler(_RegexFileCrawler):
     """Index a signac project workspace.
 
-    Without any file format definitions, this crawler
-    yields index documents for each job, including
-    the statepoint and the job document.
+    Without any file format definitions, this crawler yields index documents
+    for each job, including the state point and the job document.
 
-    See also: :py:class:`~.RegexFileCrawler`
+    See Also
+    --------
+    :py:class:`~._RegexFileCrawler`
 
-    :param root: The path to the project's root directory.
-    :type root: str"""
+    Parameters
+    ----------
+    root : str
+        The path to the project's root directory.
+
+    """
 
     encoding = "utf-8"
     statepoint_index = "statepoint"
@@ -419,12 +401,6 @@ class SignacProjectCrawler(RegexFileCrawler):
     fn_job_document = "signac_job_document.json"
     signac_id_alias = "_id"
 
-    @deprecated(
-        deprecated_in="1.3",
-        removed_in="2.0",
-        current_version=__version__,
-        details="The indexing module is deprecated.",
-    )
     def __init__(self, root):
         from .project import get_project
 
@@ -473,612 +449,3 @@ def crawl(self, depth=0):
                 yield self.process(doc, None, None)
         for doc in super().crawl(depth=depth):
             yield doc
-
-
-# this class is deprecated
-class MainCrawler(BaseCrawler):
-    r"""Compiles a main index from indexes defined in access modules.
- - An instance of this crawler will search the data space for access - modules, which by default are named ``signac_access.py``. Once such - a file is found, the crawler will import the module and try to execute - two special functions given that they are defined within the module's - global namespace: ``get_indexes()`` and ``get_crawlers()``. - - The ``get_indexes()`` is assumed to yield one or multiple index generator - functions, while the ``get_crawlers()`` function is assumed to yield - one or more crawler instances. - - This is an example for such an access module: - - .. code-block:: python - - import signac - - def get_indexes(root): - yield signac.index_files(root, r'.*\.txt') - - def get_crawlers(root): - yield MyCrawler(root) - - In case that the main crawler has tags, the ``get_indexes()`` function - will always be ignored while crawlers yielded from the ``get_crawlers()`` - function will only be executed in case that they match at least one - of the tags. - - In case that the access module is completely empty, it will be executed - as if it had the following directives: - - .. code-block:: python - - import signac - - def get_indexes(root): - yield signac.get_project(root).index() - - Tags for indexes yielded from the `get_indexes()` function can be specified - by assigning them directly to the function: - - .. code-block:: python - - def get_indexes(root): - yield signac.get_project(root).index() - - get_indexes.tags = {'foo'} - - - :param root: The path to the root directory to crawl through. - :type root: str - :param raise_on_error: Raise all exceptions encountered during - during crawling instead of ignoring them. - :type raise_on_error: bool - """ - - FN_ACCESS_MODULE = "signac_access.py" - "The filename of modules containing crawler definitions." - - @deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", - ) - def __init__(self, root, raise_on_error=False): - self.raise_on_error = raise_on_error - super().__init__(root=root) - - def _docs_from_module(self, dirpath, fn): - name = os.path.join(dirpath, fn) - module = importlib.machinery.SourceFileLoader(name, name).load_module() - - logger.info(f"Crawling from module '{module.__file__}'.") - - has_tags = self.tags is not None and len(set(self.tags)) - - def _check_tags(tags): - if tags is None or not len(set(tags)): - if has_tags: - logger.info("Skipping, index has no defined tags.") - return False - else: - return True - else: - if not has_tags: - logger.info("Skipping, index requires tags.") - return False - elif set(self.tags).intersection(set(tags)): - return True # at least one tag matches! - else: - logger.info("Skipping, tag mismatch.") - return False - - if not has_tags and _is_blank_module(module): - from .project import get_project - - for doc in get_project(root=dirpath).index(): - yield doc - - if hasattr(module, "get_indexes"): - if _check_tags(getattr(module.get_indexes, "tags", None)): - for index in module.get_indexes(dirpath): - for doc in index: - yield doc - - if hasattr(module, "get_crawlers"): - for crawler in module.get_crawlers(dirpath): - logger.info(f"Executing subcrawler:\n {crawler}") - if _check_tags(getattr(crawler, "tags", None)): - for doc in crawler.crawl(): - doc.setdefault(KEY_PROJECT, os.path.relpath(dirpath, self.root)) - yield doc - - def docs_from_file(self, dirpath, fn): - """Compile main index from file in case it is an access module. - - :param dirpath: The path of the file relative to root. 
- :param fn: The filename of the file. - :yields: Index documents. - """ - if fn == self.FN_ACCESS_MODULE: - try: - yield from self._docs_from_module(dirpath, fn) - except Exception: - logger.error( - "Error while indexing from module '{}'.".format( - os.path.join(dirpath, fn) - ) - ) - if self.raise_on_error: - raise - else: - logger.debug(f"Completed indexing from '{os.path.join(dirpath, fn)}'.") - - -# Deprecated API -class MasterCrawler(MainCrawler): - def __init__(self, *args, **kwargs): - warnings.warn( - "The MasterCrawler class has been replaced by the MainCrawler class. " - "Both classes are deprecated and will be removed in a future release.", - DeprecationWarning, - ) - super().__init__(*args, **kwargs) - - -@deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", -) -def fetch( - doc_or_id, mode="r", mirrors=None, num_tries=3, timeout=60, ignore_local=False -): - """Fetch the file associated with this document or file id. - - This function retrieves a file associated with the provided - index document or file id and behaves like the built-in - :py:func:`open` function, e.g.: - - .. code-block:: python - - for doc in index: - with signac.fetch(doc) as file: - do_something_with(file) - - :param doc_or_id: A file_id or a document with a file_id value. - :param mode: Mode to use for opening files. - :param mirrors: An optional set of mirrors to fetch the file from. - :param num_tries: The number of automatic retry attempts in case of - mirror connection errors. - :type num_tries: int - :param timeout: The time in seconds to wait before an - automatic retry attempt. - :type timeout: int - :returns: The file associated with the document or file id. - :rtype: A file-like object - """ - if doc_or_id is None: - raise ValueError("Argument 'doc_or_id' must not be None!") - file_id = doc_or_id if isinstance(doc_or_id, str) else doc_or_id.get("file_id") - if not ignore_local: - try: - fn = os.path.join(doc_or_id["root"], doc_or_id["filename"]) - return open(fn, mode=mode) - except KeyError: - raise errors.FetchError("Insufficient file meta data for fetch.", doc_or_id) - except OSError as error: - if error.errno == errno.ENOENT: - if file_id is None: - raise errors.FetchError(f"Failed to fetch '{doc_or_id}'.") - if mirrors is None: - raise errors.FetchError("No mirrors provided!") - else: - for i in range(num_tries): - for mirror in mirrors: - try: - return mirror.get(file_id, mode=mode) - except mirror.AutoRetry as error: - logger.warning(error) - sleep(timeout) - except mirror.FileNotFoundError as error: - logger.debug(error) - else: - raise errors.FetchError(f"Unable to fetch object for '{file_id}'.") - - -@deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", -) -def fetched(docs): - """Iterate over documents and yield associated files.""" - for doc in docs: - if "file_id" in doc: - yield doc, fetch(doc) - - -def _export_to_mirror(file, file_id, mirror): - "Export a file-like object with file_id to mirror." - with mirror.new_file(_id=file_id) as dst: - dst.write(file.read()) - - -@deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", -) -def export_to_mirror(doc, mirror, num_tries=3, timeout=60): - """Export a file associated with doc to mirror. - - :param doc: A document with a file_id entry. - :param mirror: A file-system object to export the file to. 
- :param num_tries: The number of automatic retry attempts in case of - mirror connection errors. - :type num_tries: int - :param timeout: The time in seconds to wait before an - automatic retry attempt. - :type timeout: int - :returns: The file id after successful export. - """ - if "file_id" not in doc: - raise errors.ExportError(f"Doc '{doc}' does not have a file_id entry.") - for i in range(num_tries): - try: - with fetch(doc, mode="rb") as file: - _export_to_mirror(file, doc["file_id"], mirror) - except mirror.FileExistsError: - logger.debug( - "File with id '{}' already exported, skipping.".format(doc["file_id"]) - ) - break - except mirror.AutoRetry as error: - logger.warning(f"Error during export: '{error}', retrying...") - sleep(timeout) - else: - logger.debug( - "Stored file with id '{}' in mirror '{}'.".format( - doc["file_id"], mirror - ) - ) - return doc["file_id"] - else: - raise errors.ExportError(doc) - - -@deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", -) -def export_one(doc, index, mirrors=None, num_tries=3, timeout=60): - """Export one document to index and an optionally associated file to mirrors. - - :param doc: A document with a file_id entry. - :param docs: The index collection to export to. - :param mirrors: An optional set of mirrors to export files to. - :param num_tries: The number of automatic retry attempts in case of - mirror connection errors. - :type num_tries: int - :param timeout: The time in seconds to wait before an - automatic retry attempt. - :type timeout: int - :returns: The id and file id after successful export. - """ - index.replace_one({"_id": doc["_id"]}, doc, upsert=True) - if mirrors and "file_id" in doc: - for mirror in mirrors: - export_to_mirror(doc, mirror, num_tries, timeout) - return doc["_id"], doc["file_id"] - else: - return doc["_id"], None - - -@deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", -) -def export(docs, index, mirrors=None, update=False, num_tries=3, timeout=60, **kwargs): - """Export docs to index and optionally associated files to mirrors. - - The behavior of this function is equivalent to: - - .. code-block:: python - - for doc in docs: - export_one(doc, index, mirrors, num_tries) - - If the `update` argument is set to True, the export algorithm will - automatically identify stale index documents, that means documents - that refer to files or state points that have been removed and are - no longer part of the data space. Any document which shares the - `root`, but not the `_id` field with any of the updated documents - is considered stale and removed. Using `update` in combination with - an empty docs sequence will raise `ExportError`, since it is not - possible to identify stale documents in that case. - - .. note:: - - This function will automatically delegate to specialized - implementations for special index types. For example, if - the index argument is a MongoDB document collection, the - index documents will be exported via :py:func:`~.export_pymongo`. - - :param docs: The index documents to export. - :param index: The collection to export the index to. - :param mirrors: An optional set of mirrors to export files to. - :param update: If True, remove stale index documents, that means - documents that refer to files or state points that no longer exist. 
- :type update: bool - :param num_tries: The number of automatic retry attempts in case of - mirror connection errors. - :type num_tries: int - :param timeout: The time in seconds to wait before an - automatic retry attempt. - :type timeout: int - :param kwargs: Optional keyword arguments to pass to - delegate implementations. - :raises ExportError: When using the update argument in combination with - an empty docs sequence. - """ - try: - import pymongo - except ImportError: - pass - else: - if isinstance(index, pymongo.collection.Collection): - logger.info("Using optimized export function export_pymongo().") - return export_pymongo( - docs=docs, - index=index, - mirrors=mirrors, - update=update, - num_tries=num_tries, - timeout=timeout, - **kwargs, - ) - ids = defaultdict(list) - for doc in docs: - _id, _ = export_one(doc, index, mirrors, num_tries, timeout, **kwargs) - if update: - root = doc.get("root") - if root is not None: - ids[root].append(_id) - if update: - if ids: - stale = set() - for root in ids: - docs_ = index.find({"root": root}) - all_ = {doc["_id"] for doc in docs_} - stale.update(all_.difference(ids[root])) - logger.info(f"Removing {len(stale)} stale documents.") - for _id in set(stale): - index.delete_one(dict(_id=_id)) - else: - raise errors.ExportError( - "The exported docs sequence is empty! Unable to update!" - ) - - -def _export_pymongo(docs, operations, index, mirrors, num_tries, timeout): - """Export docs via operations to index and files to mirrors.""" - import pymongo - - if mirrors is not None: - for mirror in mirrors: - for doc in docs: - if "file_id" in doc: - export_to_mirror(doc, mirror, num_tries, timeout) - for i in range(num_tries): - try: - index.bulk_write(operations) - break - except pymongo.errors.AutoReconnect as error: - logger.warning(error) - sleep(timeout) - else: - raise errors.ExportError() - - -@deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", -) -def export_pymongo( - docs, index, mirrors=None, update=False, num_tries=3, timeout=60, chunksize=100 -): - """Optimized :py:func:`~.export` function for pymongo index collections. - - The behavior of this function is rougly equivalent to: - - .. code-block:: python - - for doc in docs: - export_one(doc, index, mirrors, num_tries) - - .. note:: - - All index documents must be JSON-serializable to - be able to be exported to a MongoDB collection. - - :param docs: The index documents to export. - :param index: The database collection to export the index to. - :type index: :class:`pymongo.collection.Collection` - :param num_tries: The number of automatic retry attempts in case of - mirror connection errors. - :type num_tries: int - :param timeout: The time in seconds to wait before an - automatic retry attempt. - :type timeout: int - :param chunksize: The buffer size for export operations. 
- :type chunksize: int""" - import pymongo - - logger.info(f"Exporting to pymongo database collection index '{index}'.") - chunk = [] - operations = [] - ids = defaultdict(list) - for doc in docs: - f = {"_id": doc["_id"]} - if update: - root = doc.get("root") - if root is not None: - ids[root].append(doc["_id"]) - chunk.append(doc) - operations.append(pymongo.ReplaceOne(f, doc, upsert=True)) - if len(chunk) >= chunksize: - logger.debug("Pushing chunk.") - _export_pymongo(chunk, operations, index, mirrors, num_tries, timeout) - chunk[:] = [] - operations[:] = [] - if len(operations): - logger.debug("Pushing final chunk.") - _export_pymongo(chunk, operations, index, mirrors, num_tries, timeout) - if update: - if ids: - stale = set() - for root in ids: - docs_ = index.find({"root": root}) - all_ = {doc["_id"] for doc in docs_} - stale.update(all_.difference(ids[root])) - logger.info(f"Removing {len(stale)} stale documents.") - for _id in set(stale): - index.delete_one(dict(_id=_id)) - else: - raise errors.ExportError( - "The exported docs sequence is empty! Unable to update!" - ) - - -@deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", -) -def index_files(root=".", formats=None, depth=0): - r"""Generate a file index. - - This generator function yields file index documents, - where each index document corresponds to one file. - - To index all files in the current working directory, - simply execute: - - .. code-block:: python - - for doc in signac.index_files(): - print(doc) - - A file associated with a file index document can be - fetched via the :py:func:`fetch` function: - - .. code-block:: python - - for doc in signac.index_files(): - with signac.fetch(doc) as file: - print(file.read()) - - This is especially useful if the file index is part of - a collection (:py:class:`.Collection`) which can be searched - for specific entries. - - To limit the file index to files with a specific filename - formats, provide a regular expression as the formats argument. - To index all files that have file ending `.txt`, execute: - - .. code-block:: python - - for doc in signac.index_files(formats='.*\.txt'): - print(doc) - - We can specify specific formats by providing a dictionary as - ``formats`` argument, where the key is the filename pattern and - the value is an arbitrary formats string, e.g.: - - .. code-block:: python - - for doc in signac.index_files(formats= - {r'.*\.txt': 'TextFile', r'.*\.zip': 'ZipFile'}): - print(doc) - - :param root: The directory to index, defaults to the - current working directory. - :type root: str - :param formats: Limit the index to files that match the - given regular expression and optionally associate formats - with given patterns. - :param depth: Limit the search to the specified directory depth. - :type depth: int - :yields: The file index documents as dicts. - """ - if formats is None: - formats = {".*": "File"} - if isinstance(formats, str): - formats = {formats: "File"} - - class Crawler(RegexFileCrawler): - pass - - for regex, fmt in formats.items(): - Crawler.define(regex, fmt) - - yield from Crawler(root).crawl(depth=depth) - - -@deprecated( - deprecated_in="1.3", - removed_in="2.0", - current_version=__version__, - details="The indexing module is deprecated.", -) -def index(root=".", tags=None, depth=0, **kwargs): - r"""Generate a main index. 
- - A main index is compiled from other indexes by searching - for modules named ``signac_access.py`` and compiling all - indexes which are yielded from a function ``get_indexes(root)`` - defined within that module as well as the indexes generated by - crawlers yielded from a function ``get_crawlers(root)`` defined - within that module. - - This is a minimal example for a ``signac_access.py`` file: - - .. code-block:: python - - import signac - - def get_indexes(root): - yield signac.index_files(root, r'.*\.txt') - - Internally, this function constructs an instance of - :py:class:`.MainCrawler` and all extra key-word arguments - will be forwarded to the constructor of said main crawler. - - :param root: Look for access modules under this directory path. - :type root: str - :param tags: If tags are provided, do not execute subcrawlers - that don't match the same tags. - :param depth: Limit the search to the specified directory depth. - :param kwargs: These keyword-arguments are forwarded to the - internal MainCrawler instance. - :type depth: int - :yields: The main index documents as instances of dict. - """ - - class Crawler(MainCrawler): - pass - - if tags is not None: - Crawler.tags = tags - - yield from Crawler(root, **kwargs).crawl(depth=depth) diff --git a/signac/contrib/project.py b/signac/contrib/project.py index 801319f1f..a5dec3d7f 100644 --- a/signac/contrib/project.py +++ b/signac/contrib/project.py @@ -38,7 +38,7 @@ ) from .filterparse import _add_prefix, _root_keys, parse_filter from .hashing import calc_id -from .indexing import SignacProjectCrawler +from .indexing import _SignacProjectCrawler from .job import Job from .schema import ProjectSchema from .utility import _mkdir_p, _nested_dicts_to_dotted_keys, split_and_print_progress @@ -1910,7 +1910,7 @@ def _full_doc(doc): if isinstance(formats, str): formats = {formats: "File"} - class Crawler(SignacProjectCrawler): + class Crawler(_SignacProjectCrawler): pass for pattern, fmt in formats.items(): diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 26e214bad..c8019c5f0 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -1,54 +1,17 @@ # Copyright (c) 2017 The Regents of the University of Michigan # All rights reserved. # This software is licensed under the BSD 3-Clause License. 
-import io
 import json
 import os
 import re
 from tempfile import TemporaryDirectory
-from typing import Dict
 from unittest.mock import Mock
 
 import pytest
 
-import signac
 from signac import Collection
-from signac.common import errors
 from signac.contrib import indexing
-
-SIGNAC_ACCESS_MODULE_LEGACY = r"""import os
-import re
-
-from signac.contrib import RegexFileCrawler
-
-RE_TXT = r".*a_(?P<a>\d)\.txt"
-
-class Crawler(RegexFileCrawler):
-    tags = {'test1', 'test2'}
-
-Crawler.define(RE_TXT, 'TextFile')
-
-def get_crawlers(root):
-    yield Crawler(root)
-"""
-
-SIGNAC_ACCESS_MODULE = r"""import signac
-
-def get_indexes(root):
-    yield signac.index_files(root, r'.*a_(?P<a>\d)\.txt')
-
-get_indexes.tags = {'test1', 'test2'}
-"""
-
-SIGNAC_ACCESS_MODULE_GET_CRAWLERS = r"""import signac
-
-class Crawler(signac.RegexFileCrawler):
-    tags = {'test1', 'test2'}
-Crawler.define(r'.*_(?P<a>\d)\.txt')
-
-def get_crawlers(root):
-    yield Crawler(root)
-"""
+from signac.errors import FetchError
 
 
 class TestFormat:
@@ -59,59 +22,7 @@ def close(self):
         assert 0
 
 
-class _TestFS:
-    name = "inmemorytestgrid"
-    files: Dict[str, Dict] = {}
-
-    class FileExistsError(IOError):
-        pass
-
-    FileNotFoundError = KeyError
-
-    class _Writer(io.BytesIO):
-        def __init__(self, cache, file_id):
-            self.cache = cache
-            self.file_id = file_id
-
-        def close(self):
-            self.cache[self.file_id] = self.getvalue()
-            super(_TestFS._Writer, self).close()
-
-    def __init__(self, _id="_testFS"):
-        self._id = _id
-
-    def config(self):
-        return {"id": self._id}
-
-    @classmethod
-    def from_config(cls, config):
-        return _TestFS(_id=config["id"])
-
-    def new_file(self, mode="xb", **kwargs):
-        _id = kwargs["_id"]
-        cache = self.files.setdefault(self._id, dict())
-        if _id in cache:
-            raise self.FileExistsError(_id)
-        if mode == "xb":
-            return self._Writer(cache, _id)
-        else:
-            raise ValueError(mode)
-
-    def get(self, file_id, mode="r"):
-        cache = self.files[self._id]
-        buf = cache[file_id]
-        if mode == "r":
-            return io.StringIO(buf.decode())
-        elif mode == "rb":
-            return io.BytesIO(buf)
-        else:
-            raise ValueError(mode)
-
-
 class TestIndexingBase:
-
-    access_module = SIGNAC_ACCESS_MODULE
-
     @pytest.fixture(autouse=True)
     def setUp(self, request):
         self._tmp_dir = TemporaryDirectory(prefix="signac_")
@@ -129,18 +40,16 @@ def fn(name):
             json.dump(dict(a=0), file)
         with open(fn("a_1.json"), "w") as file:
             json.dump(dict(a=1), file)
-        with open(fn("signac_access.py"), "w") as module:
-            module.write(self.access_module)
 
     def get_index_collection(self):
         c = Collection()
         return Mock(spec=c, wraps=c)
 
     def test_base_crawler(self):
-        crawler = indexing.BaseCrawler(root=self._tmp_dir.name)
+        crawler = indexing._BaseCrawler(root=self._tmp_dir.name)
         assert len(list(crawler.crawl())) == 0
         doc = dict(a=0)
-        with pytest.raises(errors.FetchError):
+        with pytest.raises(FetchError):
             assert crawler.fetch(doc) is None
         assert doc == crawler.process(doc, None, None)
         with pytest.raises(NotImplementedError):
@@ -150,73 +59,67 @@ def test_base_crawler(self):
     def test_regex_file_crawler_pre_compiled(self):
         self.setup_project()
 
-        class Crawler(indexing.RegexFileCrawler):
+        class Crawler(indexing._RegexFileCrawler):
             pass
 
         regex = re.compile(r".*a_(?P<a>\d)\.txt")
-        with pytest.deprecated_call():
-            Crawler.define(regex, TestFormat)
+        Crawler.define(regex, TestFormat)
         crawler = Crawler(root=self._tmp_dir.name)
 
         no_find = True
-        with pytest.deprecated_call():
-            for doc in crawler.crawl():
-                no_find = False
-                ffn = os.path.join(doc["root"], doc["filename"])
-                m = regex.match(ffn)
-                assert m is not None
-                assert os.path.isfile(ffn)
-                with open(ffn) as file:
-                    doc2 = json.load(file)
-                assert doc2["a"] == doc["a"]
+        for doc in crawler.crawl():
+            no_find = False
+            ffn = os.path.join(doc["root"], doc["filename"])
+            m = regex.match(ffn)
+            assert m is not None
+            assert os.path.isfile(ffn)
+            with open(ffn) as file:
+                doc2 = json.load(file)
+            assert doc2["a"] == doc["a"]
         assert not no_find
 
     def test_regex_file_crawler(self):
         self.setup_project()
 
-        class Crawler(indexing.RegexFileCrawler):
+        class Crawler(indexing._RegexFileCrawler):
             pass
 
         # First test without pattern
         crawler = Crawler(root=self._tmp_dir.name)
-        with pytest.deprecated_call():
-            assert len(list(crawler.crawl())) == 0
+        assert len(list(crawler.crawl())) == 0
 
         # Now with pattern(s)
         pattern = r".*a_(?P<a>\d)\.txt"
         regex = re.compile(pattern)
-        with pytest.deprecated_call():
-            Crawler.define(pattern, TestFormat)
-            Crawler.define("negativematch", "negativeformat")
+        Crawler.define(pattern, TestFormat)
+        Crawler.define("negativematch", "negativeformat")
         crawler = Crawler(root=self._tmp_dir.name)
 
         no_find = True
-        with pytest.deprecated_call():
-            for doc in crawler.crawl():
-                no_find = False
-                ffn = os.path.join(doc["root"], doc["filename"])
-                m = regex.match(ffn)
-                assert m is not None
-                assert os.path.isfile(ffn)
-                with open(ffn) as file:
-                    doc2 = json.load(file)
-                assert doc2["a"] == doc["a"]
-            assert not no_find
-            with pytest.raises(errors.FetchError):
-                crawler.fetch(dict())
-            with pytest.raises(errors.FetchError):
-                crawler.fetch({"filename": "shouldnotmatch"})
+        for doc in crawler.crawl():
+            no_find = False
+            ffn = os.path.join(doc["root"], doc["filename"])
+            m = regex.match(ffn)
+            assert m is not None
+            assert os.path.isfile(ffn)
+            with open(ffn) as file:
+                doc2 = json.load(file)
+            assert doc2["a"] == doc["a"]
+        assert not no_find
+        with pytest.raises(FetchError):
+            crawler.fetch(dict())
+        with pytest.raises(FetchError):
            crawler.fetch({"filename": "shouldnotmatch"})
 
     def test_regex_file_crawler_inheritance(self):
         self.setup_project()
 
-        class CrawlerA(indexing.RegexFileCrawler):
+        class CrawlerA(indexing._RegexFileCrawler):
             pass
 
-        class CrawlerB(indexing.RegexFileCrawler):
+        class CrawlerB(indexing._RegexFileCrawler):
             pass
 
-        with pytest.deprecated_call():
-            CrawlerA.define("a", TestFormat)
-            CrawlerB.define("b", TestFormat)
+        CrawlerA.define("a", TestFormat)
+        CrawlerB.define("b", TestFormat)
 
         assert len(CrawlerA.definitions) == 1
         assert len(CrawlerB.definitions) == 1
@@ -226,222 +129,7 @@ class CrawlerC(CrawlerA):
         assert len(CrawlerA.definitions) == 1
         assert len(CrawlerC.definitions) == 1
         assert len(CrawlerB.definitions) == 1
-        with pytest.deprecated_call():
-            CrawlerC.define("c", TestFormat)
+        CrawlerC.define("c", TestFormat)
 
         assert len(CrawlerA.definitions) == 1
         assert len(CrawlerB.definitions) == 1
         assert len(CrawlerC.definitions) == 2
-
-    def test_index_files(self):
-        self.setup_project()
-
-        # First test without pattern
-        root = self._tmp_dir.name
-        with pytest.deprecated_call():
-            assert len(list(signac.index_files(root))) == 5
-
-        # Now with pattern(s)
-        pattern_positive = r".*a_(?P<a>\d)\.txt"
-        pattern_negative = "nomatch"
-
-        with pytest.deprecated_call():
-            assert len(list(signac.index_files(root, pattern_positive))) == 2
-            assert len(list(signac.index_files(root, pattern_negative))) == 0
-
-        no_find = True
-        for doc in signac.index_files(root, pattern_positive):
-            no_find = False
-            ffn = os.path.join(doc["root"], doc["filename"])
-            assert re.match(r".*a_(?P<a>\d)\.txt", ffn) is not None
-            assert os.path.isfile(ffn)
-            with open(ffn)
as file: - doc2 = json.load(file) - assert doc2["a"] == doc["a"] - assert not no_find - - def test_json_crawler(self): - self.setup_project() - with pytest.deprecated_call(): - crawler = indexing.JSONCrawler(root=self._tmp_dir.name) - # with pytest.deprecated_call(): - docs = list(sorted(crawler.crawl(), key=lambda d: d["a"])) - assert len(docs) == 2 - for i, doc in enumerate(docs): - assert doc["a"] == i - assert doc["format"] is None - ids = {doc["_id"] for doc in docs} - assert len(ids) == len(docs) - - def test_main_crawler(self): - self.setup_project() - crawler = indexing.MainCrawler(root=self._tmp_dir.name) - crawler.tags = {"test1"} - no_find = True - with pytest.deprecated_call(): - for doc in crawler.crawl(): - no_find = False - ffn = os.path.join(doc["root"], doc["filename"]) - assert os.path.isfile(ffn) - with open(ffn) as file: - doc2 = json.load(file) - assert doc2["a"] == doc["a"] - with signac.fetch(doc) as file: - pass - assert not no_find - - def test_index(self): - self.setup_project() - root = self._tmp_dir.name - with pytest.deprecated_call(): - assert len(list(signac.index(root=root))) == 0 - index = signac.index(root=self._tmp_dir.name, tags={"test1"}) - no_find = True - for doc in index: - no_find = False - ffn = os.path.join(doc["root"], doc["filename"]) - assert os.path.isfile(ffn) - with open(ffn) as file: - doc2 = json.load(file) - assert doc2["a"] == doc["a"] - with pytest.deprecated_call(): - with signac.fetch(doc) as file: - pass - assert not no_find - - def test_fetch(self): - with pytest.deprecated_call(): - with pytest.raises(ValueError): - signac.fetch(None) - with pytest.raises(errors.FetchError): - signac.fetch(dict()) - self.setup_project() - crawler = indexing.MainCrawler(root=self._tmp_dir.name) - crawler.tags = {"test1"} - with pytest.deprecated_call(): - docs = list(crawler.crawl()) - assert len(docs) == 2 - for doc in docs: - with pytest.deprecated_call(): - with signac.fetch(doc) as file: - pass - with pytest.deprecated_call(): - for doc, file in indexing.fetched(docs): - doc2 = json.load(file) - assert doc["a"] == doc2["a"] - file.close() - - def test_export_one(self): - self.setup_project() - crawler = indexing.MainCrawler(root=self._tmp_dir.name) - crawler.tags = {"test1"} - index = self.get_index_collection() - with pytest.deprecated_call(): - for doc in crawler.crawl(): - signac.export_one(doc, index) - assert index.replace_one.called - for doc in crawler.crawl(): - assert index.find_one({"_id": doc["_id"]}) is not None - - def test_export(self): - self.setup_project() - crawler = indexing.MainCrawler(root=self._tmp_dir.name) - crawler.tags = {"test1"} - index = self.get_index_collection() - with pytest.deprecated_call(): - signac.export(crawler.crawl(), index) - assert index.replace_one.called or index.bulk_write.called - for doc in crawler.crawl(): - assert index.find_one({"_id": doc["_id"]}) is not None - - def test_export_with_update(self): - self.setup_project() - with pytest.deprecated_call(): - index = list(signac.index(root=self._tmp_dir.name, tags={"test1"})) - collection = self.get_index_collection() - with pytest.deprecated_call(): - signac.export(index, collection, update=True) - assert collection.replace_one.called or collection.bulk_write.called - for doc in index: - assert collection.find_one({"_id": doc["_id"]}) is not None - collection.reset_mock() - assert len(index) == collection.find().count() - assert collection.find.called - with pytest.deprecated_call(): - signac.export(index, collection, update=True) - assert 
collection.replace_one.called or collection.bulk_write.called - for doc in index: - assert collection.find_one({"_id": doc["_id"]}) is not None - assert len(index) == collection.find().count() - collection.reset_mock() - for fn in ("a_0.txt", "a_1.txt"): - os.remove(os.path.join(self._tmp_dir.name, fn)) - N = len(index) - with pytest.deprecated_call(): - index = list(signac.index(root=self._tmp_dir.name, tags={"test1"})) - assert len(index) == (N - 1) - collection.reset_mock() - if index: - with pytest.deprecated_call(): - signac.export(index, collection, update=True) - assert collection.replace_one.called or collection.bulk_write.called - assert len(index) == collection.find().count() - else: - with pytest.raises(errors.ExportError): - with pytest.deprecated_call(): - signac.export(index, collection, update=True) - - def test_export_to_mirror(self): - self.setup_project() - crawler = indexing.MainCrawler(root=self._tmp_dir.name) - crawler.tags = {"test1"} - index = self.get_index_collection() - mirror = _TestFS() - with pytest.deprecated_call(): - for doc in crawler.crawl(): - assert "file_id" in doc - doc.pop("file_id") - with pytest.raises(errors.ExportError): - signac.export_to_mirror(doc, mirror) - break - for doc in crawler.crawl(): - assert "file_id" in doc - signac.export_one(doc, index) - - signac.export_to_mirror(doc, mirror) - assert index.replace_one.called - for doc in crawler.crawl(): - assert index.find_one({"_id": doc["_id"]}) is not None - with mirror.get(doc["file_id"]): - pass - - def test_main_crawler_tags(self): - self.setup_project() - crawler = indexing.MainCrawler(root=self._tmp_dir.name) - with pytest.deprecated_call(): - assert 0 == len(list(crawler.crawl())) - crawler.tags = None - assert 0 == len(list(crawler.crawl())) - crawler.tags = {} - assert 0 == len(list(crawler.crawl())) - crawler.tags = {"nomatch"} - assert 0 == len(list(crawler.crawl())) - crawler.tags = {"test1"} - assert 2 == len(list(crawler.crawl())) - crawler.tags = {"test2"} - assert 2 == len(list(crawler.crawl())) - crawler.tags = {"test1", "test2"} - assert 2 == len(list(crawler.crawl())) - crawler.tags = {"test1", "non-existent-key"} - assert 2 == len(list(crawler.crawl())) - crawler.tags = {"test2", "non-existent-key"} - assert 2 == len(list(crawler.crawl())) - crawler.tags = {"test1", "test2", "non-existent-key"} - assert 2 == len(list(crawler.crawl())) - - -class TestIndexingBaseGetCrawlers(TestIndexingBase): - access_module = SIGNAC_ACCESS_MODULE_GET_CRAWLERS - - -class TestIndexingBaseLegacy(TestIndexingBase): - access_module = SIGNAC_ACCESS_MODULE_LEGACY diff --git a/tests/test_project.py b/tests/test_project.py index 7dc9eb391..d93619705 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -666,7 +666,7 @@ def test_signac_project_crawler(self): index[doc["_id"]] = doc assert len(index) == len(job_ids) assert set(index.keys()) == set(job_ids) - crawler = signac.contrib.SignacProjectCrawler(self.project.root_directory()) + crawler = signac.contrib._SignacProjectCrawler(self.project.root_directory()) index2 = {} for doc in crawler.crawl(): index2[doc["_id"]] = doc @@ -683,7 +683,7 @@ def test_signac_project_crawler(self): index[doc["_id"]] = doc assert len(index) == 2 * len(job_ids) - class Crawler(signac.contrib.SignacProjectCrawler): + class Crawler(signac.contrib._SignacProjectCrawler): called = False def process(self_, doc, dirpath, fn): diff --git a/tests/test_shell.py b/tests/test_shell.py index 1f54cf30a..df60903c4 100644 --- a/tests/test_shell.py +++ 
b/tests/test_shell.py @@ -152,30 +152,6 @@ def test_statepoint(self): assert "{'a': 0}" in sp assert len(project) == 1 - # Index schema is changed - @pytest.mark.xfail() - def test_index(self): - self.call("python -m signac init my_project".split()) - self.call("python -m signac project --access".split()) - project = signac.Project() - project.open_job({"a": 0}).init() - assert len(project) == 1 - with pytest.deprecated_call(): - assert len(list(project.index())) == 1 - assert len(list(signac.index())) == 1 - doc = json.loads(self.call("python -m signac index".split())) - assert "statepoint" in doc - assert doc["statepoint"] == {"a": 0} - doc = json.loads(self.call("python -m signac project --index".split())) - assert "statepoint" in doc - assert doc["statepoint"] == {"a": 0} - project.open_job({"a": 0}).document["b"] = 0 - doc = json.loads(self.call("python -m signac index".split())) - assert "statepoint" in doc - assert doc["statepoint"] == {"a": 0} - assert "b" in doc - assert doc["b"] == 0 - def test_document(self): self.call("python -m signac init my_project".split()) project = signac.Project()
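
Migration note: with ``signac.index()``, ``signac.index_files()``, ``fetch()``, the ``export*()`` helpers, and the public crawler classes removed, per-project indexing remains available through ``Project.index()``, which this patch keeps (signac/contrib/project.py above still builds its index on the now-private ``_SignacProjectCrawler``). A minimal sketch of the replacement workflow, assuming ``Project.index()`` retains its current behavior:

.. code-block:: python

    import json

    import signac

    # Sketch only: stand-in for the removed top-level signac.index() and the
    # removed `signac index` CLI command, using the per-project index instead.
    project = signac.get_project()
    with open("index.json", "w") as file:
        for doc in project.index():  # one index document per job
            file.write(json.dumps(doc) + "\n")

The resulting documents can be loaded into a ``signac.Collection`` for searching, which covers the most common downstream use of the removed index and export helpers.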