glotzerlab · bdice · Nov 26, 2021 · Oct 10, 2021 · Oct 10, 2021 · Oct 10, 2021
diff --git a/.gitignore b/.gitignore
@@ -23,6 +23,7 @@ __pycache__
 pip-log.txt
 
 # Unit test / coverage reports
+.asv
 .noseids
 .coverage
 .tox

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -14,6 +14,7 @@ repos:
       - id: check-builtin-literals
       - id: check-executables-have-shebangs
       - id: check-json
+        exclude: 'asv.conf.json'
       - id: check-yaml
       - id: debug-statements
       - id: requirements-txt-fixer

diff --git a/asv.conf.json b/asv.conf.json
@@ -0,0 +1,160 @@
+{
+    // The version of the config file format.  Do not change, unless
+    // you know what you are doing.
+    "version": 1,
+
+    // The name of the project being benchmarked
+    "project": "signac",
+
+    // The project's homepage
+    "project_url": "https://signac.io/",
+
+    // The URL or local path of the source code repository for the
+    // project being benchmarked
+    "repo": ".",
+
+    // The Python project's subdirectory in your repo.  If missing or
+    // the empty string, the project is assumed to be located at the root
+    // of the repository.
+    // "repo_subdir": "",
+
+    // Customizable commands for building, installing, and
+    // uninstalling the project. See asv.conf.json documentation.
+    //
+    // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
+    // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
+    // "build_command": [
+    //     "python setup.py build",
+    //     "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
+    // ],
+
+    // List of branches to benchmark. If not provided, defaults to "master"
+    // (for git) or "default" (for mercurial).
+    // "branches": ["master"], // for git
+    // "branches": ["default"],    // for mercurial
+
+    // The DVCS being used.  If not set, it will be automatically
+    // determined from "repo" by looking at the protocol in the URL
+    // (if remote), or by looking for special directories, such as
+    // ".git" (if local).
+    "dvcs": "git",
+
+    // The tool to use to create environments.  May be "conda",
+    // "virtualenv" or other value depending on the plugins in use.
+    // If missing or the empty string, the tool will be automatically
+    // determined by looking for tools on the PATH environment
+    // variable.
+    "environment_type": "virtualenv",
+
+    // timeout in seconds for installing any dependencies in environment
+    // defaults to 10 min
+    //"install_timeout": 600,
+
+    // the base URL to show a commit for the project.
+    "show_commit_url": "https://github.com/glotzerlab/signac/commit/",
+
+    // The Pythons you'd like to test against.  If not provided, defaults
+    // to the current version of Python used to run `asv`.
+    // "pythons": ["3.9"],
+
+    // The list of conda channel names to be searched for benchmark
+    // dependency packages in the specified order
+    // "conda_channels": ["conda-forge"],
+
+    // The matrix of dependencies to test.  Each key is the name of a
+    // package (in PyPI) and the values are version numbers.  An empty
+    // list or empty string indicates to just test against the default
+    // (latest) version. null indicates that the package is not to be
+    // installed. If the package to be tested is only available from
+    // PyPI, and the 'environment_type' is conda, then you can preface
+    // the package name by 'pip+', and the package will be installed via
+    // pip (with all the conda available packages installed first,
+    // followed by the pip installed packages).
+    //
+    // "matrix": {
+    //     "numpy": ["1.6", "1.7"],
+    //     "six": ["", null],        // test with and without six installed
+    //     "pip+emcee": [""],   // emcee is only available for install with pip.
+    // },
+
+    // Combinations of libraries/python versions can be excluded/included
+    // from the set to test. Each entry is a dictionary containing additional
+    // key-value pairs to include/exclude.
+    //
+    // An exclude entry excludes entries where all values match. The
+    // values are regexps that should match the whole string.
+    //
+    // An include entry adds an environment. Only the packages listed
+    // are installed. The 'python' key is required. The exclude rules
+    // do not apply to includes.
+    //
+    // In addition to package names, the following keys are available:
+    //
+    // - python
+    //     Python version, as in the *pythons* variable above.
+    // - environment_type
+    //     Environment type, as above.
+    // - sys_platform
+    //     Platform, as in sys.platform. Possible values for the common
+    //     cases: 'linux', 'win32', 'cygwin', 'darwin'.
+    //
+    // "exclude": [
+    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+    //     {"environment_type": "conda", "six": null}, // don't run without six on conda
+    // ],
+    //
+    // "include": [
+    //     // additional env for python2.7
+    //     {"python": "2.7", "numpy": "1.8"},
+    //     // additional env if run on windows+conda
+    //     {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
+    // ],
+
+    // The directory (relative to the current directory) that benchmarks are
+    // stored in.  If not provided, defaults to "benchmarks"
+    "benchmark_dir": "benchmarks",
+
+    // The directory (relative to the current directory) to cache the Python
+    // environments in.  If not provided, defaults to "env"
+    "env_dir": ".asv/env",
+
+    // The directory (relative to the current directory) that raw benchmark
+    // results are stored in.  If not provided, defaults to "results".
+    "results_dir": ".asv/results",
+
+    // The directory (relative to the current directory) that the html tree
+    // should be written to.  If not provided, defaults to "html".
+    "html_dir": ".asv/html",
+
+    // The number of characters to retain in the commit hashes.
+    // "hash_length": 8,
+
+    // `asv` will cache results of the recent builds in each
+    // environment, making them faster to install next time.  This is
+    // the number of builds to keep, per environment.
+    // "build_cache_size": 2,
+
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching to benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions.  The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+    //
+    // "regressions_first_commits": {
+    //    "some_benchmark": "352cdf",  // Consider regressions only after this commit
+    //    "another_benchmark": null,   // Skip regression detection altogether
+    // },
+
+    // The thresholds for relative change in results, after which `asv
+    // publish` starts reporting regressions. Dictionary of the same
+    // form as in ``regressions_first_commits``, with values
+    // indicating the thresholds.  If multiple entries match, the
+    // maximum is taken. If no entry matches, the default is 5%.
+    //
+    // "regressions_thresholds": {
+    //    "some_benchmark": 0.01,     // Threshold of 1%
+    //    "another_benchmark": 0.5,   // Threshold of 50%
+    // },
+}
diff --git a/benchmark.py b/benchmark.py
@@ -18,6 +18,16 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
+"""Benchmarks for use in CI testing.
+
+This script defines benchmarks of common signac operations, used to assess the
+performance of the framework over time. Most developers will want to make use of
+the asv (airspeed velocity) tools for benchmarking, located in
+``benchmarks/benchmarks.py``. This script is used by CI tests to identify any
+significant performance regressions introduced by new features.
+"""
+
+
 import argparse
 import base64
 import json

diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
@@ -0,0 +1,147 @@
+# Copyright 2021 The Regents of the University of Michigan
+# All rights reserved.
+# This software is licensed under the BSD 3-Clause License.
+"""Benchmarks for use with asv (airspeed velocity).
+
+This script defines benchmarks of common signac operations, used to assess the
+performance of the framework over time. The asv tools allow for profiling,
+comparison, and visualization of benchmark results. This complements the file
+``benchmark.py`` in the root directory of the repository, which is primarily
+intended for CI tests.
+"""
+
+import random
+import string
+from itertools import islice
+from multiprocessing import Pool
+from tempfile import TemporaryDirectory
+
+from tqdm import tqdm
+
+import signac
+
+
+def _random_str(size):
+    return "".join(random.choice(string.ascii_lowercase) for _ in range(size))
+
+
+def _make_json_data(i, num_keys=1, data_size=0):
+    assert num_keys >= 1
+    assert data_size >= 0
+
+    data = {f"b_{j}": _random_str(data_size) for j in range(num_keys - 1)}
+    data["a"] = f"{i}{_random_str(max(0, data_size - len(str(i))))}"
+    return data
+
+
+def _make_job(project, num_keys, num_doc_keys, data_size, data_std, i):
+    size = max(0, int(random.gauss(data_size, data_std)))
+    job = project.open_job(_make_json_data(i, num_keys, size))
+    if num_doc_keys > 0:
+        size = max(0, int(random.gauss(data_size, data_std)))
+        job.document.update(_make_json_data(i, num_doc_keys, size))
+    else:
+        job.init()
+
+
+def generate_random_data(
+    project,
+    N,
+    num_keys=1,
+    num_doc_keys=0,
+    data_size_mean=0,
+    data_size_std=0,
+    parallel=True,
+):
+    assert len(project) == 0
+
+    if parallel:
+        with Pool() as pool:
+            p = [
+                (project, num_keys, num_doc_keys, data_size_mean, data_size_std, i)
+                for i in range(N)
+            ]
+            list(pool.starmap(_make_job, tqdm(p, desc="init random project data")))
+    else:
+        from functools import partial
+
+        make = partial(
+            _make_job, project, num_keys, num_doc_keys, data_size_mean, data_size_std
+        )
+        list(map(make, tqdm(range(N), desc="init random project data")))
+
+
+def setup_random_project(
+    N, num_keys=1, num_doc_keys=0, data_size_mean=0, data_size_std=0, seed=0, root=None
+):
+    random.seed(seed)
+    if not isinstance(N, int):
+        raise TypeError("N must be an integer!")
+
+    temp_dir = TemporaryDirectory()
+    project = signac.init_project(f"benchmark-N={N}", root=temp_dir.name)
 def TemporaryProject(name=None, cls=None, **kwargs): 
 def init_jobs(project, nested=False, listed=False, heterogeneous=False): 
 def TemporaryProject(name=None, cls=None, **kwargs): 
 def init_jobs(project, nested=False, listed=False, heterogeneous=False): 
+    generate_random_data(
+        project, N, num_keys, num_doc_keys, data_size_mean, data_size_std
+    )
+    return project, temp_dir
+
+
+# Benchmark parameter grid. The keys are used as ``param_names`` and the values
+# as ``params`` by ``_ProjectBenchBase``. The key order matters: ``setup``
+# unpacks the parameter values positionally in exactly this order.
+PARAMETERS = {
+    "N": [100, 1_000],
+    "num_statepoint_keys": [10],
+    "num_document_keys": [0],
+    "data_size_mean": [100],
+    "data_size_std": [0],
+}
+
+
+class _ProjectBenchBase:
+    param_names = PARAMETERS.keys()
+    params = PARAMETERS.values()
+
+    def setup(self, *params):
+        N, num_keys, num_doc_keys, data_size_mean, data_size_std = params
+        self.project, self.temp_dir = setup_random_project(
+            N,
+            num_keys=num_keys,
+            num_doc_keys=num_doc_keys,
+            data_size_mean=data_size_mean,
+            data_size_std=data_size_std,
+        )
+
+    def teardown(self, *params):
+        self.temp_dir.cleanup()
+
+
+class ProjectBench(_ProjectBenchBase):
+    """Benchmarks for counting and iterating over the jobs of a project.
+
+    Every method works on ``self.project`` and accepts the benchmark
+    parameters positionally without using them.
+    """
+
+    def time_determine_len(self, *params):
+        """Evaluate ``len()`` of the project once."""
+        len(self.project)
+
+    def time_iterate_single_pass(self, *params):
+        """Iterate over the whole project once, collecting it into a list."""
+        list(self.project)
+
+    def time_iterate(self, *params):
+        """Iterate over the whole project ten times in a row."""
+        for _ in range(10):
+            list(self.project)
+
+    def time_iterate_load_sp(self, *params):
+        """Iterate ten times, calling ``job.sp()`` on every job."""
+        for _ in range(10):
+            [job.sp() for job in self.project]
+
+
+class ProjectRandomJobBench(_ProjectBenchBase):
+    """Benchmarks that open or search for one randomly chosen job."""
+
+    def setup(self, *params):
+        """Build the project, pick a random job and derive the search filters."""
+        super().setup(*params)
+        self.random_job = random.choice(list(self.project))
+        self.random_job_sp = self.random_job.statepoint()
+        self.random_job_id = self.random_job.id
+        # "Lean" filter: only the first key/value pair of the state point.
+        self.lean_filter = {k: v for k, v in islice(self.random_job_sp.items(), 1)}
+
+    def time_select_by_id(self, *params):
+        """Open the chosen job through ``open_job(id=...)``."""
+        self.project.open_job(id=self.random_job_id)
+
+    def time_search_lean_filter(self, *params):
+        """Count the jobs matching the single-key filter."""
+        len(self.project.find_jobs(self.lean_filter))
+
+    def time_search_rich_filter(self, *params):
+        """Count the jobs matching the chosen job's full state point."""
+        len(self.project.find_jobs(self.random_job_sp))
diff --git a/changelog.txt b/changelog.txt
@@ -10,6 +10,11 @@ Version 1
 [1.8.0] -- 2021-xx-xx
 ---------------------
 
+Added
++++++
+
+ - Benchmarks can be run using the ``asv`` (airspeed velocity) tool (#629).
+
 Deprecated
 ++++++++++
 

diff --git a/doc/support.rst b/doc/support.rst
@@ -94,6 +94,26 @@ To run tests, execute:
 
     (signac-dev) signac $ python -m pytest tests/
 
+Benchmarking
+------------
+
+Benchmarks can be run using the `asv (airspeed velocity) <https://asv.readthedocs.io/>`__ tool.
+To install the tool, execute:
+
+.. code-block:: bash
+
+   (signac-dev) signac $ pip install asv
+
+The ``asv`` tool will install signac into an isolated virtual environment that is used for benchmarking.
+Below is a quick reference with some helpful commands:
+
+  * ``$ asv run master..mybranch`` benchmarks every commit from ``master`` to ``mybranch``.
+  * ``$ asv publish`` generates a static HTML site showing benchmark results.
+  * ``$ asv preview`` hosts a local preview of the generated HTML site.
+  * ``$ asv dev`` runs benchmarks that are in development.
+  * ``$ asv profile 'benchmarks.ProjectBench.time_iterate_load_sp(.*)' --gui=snakeviz`` will profile a specific test and visualize results with `snakeviz <https://jiffyclub.github.io/snakeviz/>`__.
+
+For more information on how to use asv, refer to `Using airspeed velocity <https://asv.readthedocs.io/en/stable/using.html>`__.
 
 Building documentation
 ----------------------
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,6 +10,11 @@ Version 1 @@
     [1.8.0] -- 2021-xx-xx
     ---------------------
+    Added
+    +++++
+     - Benchmarks can be run using the ``asv`` (airspeed velocity) tool (#629).
     Deprecated
     ++++++++++
@@ Expand Down @@