Add 'Tracker' track-generation tool #875

Closed. This pull request wants to merge 42 commits into master from tracker.

The file changes shown below cover 4 of the 42 commits.

Commits
a4be1ff  Add 'tracker' track-generation tool (Jan 17, 2020)
e0fb9ba  Add Tracker to lint list (Jan 17, 2020)
bcb7dff  Fix pylint 'import-self' warning in Tracker (Jan 17, 2020)
545dd46  Implement multi-index support, logging fix (Jan 17, 2020)
ef36af2  Prefix tracker with 'es' for disambiguation. (Jan 21, 2020)
3d32262  Addressing code review comments for corpus.py (Jan 21, 2020)
ff42833  Merge operations into challenges. (Jan 21, 2020)
e588aec  Change default trackname to first index argument. (Jan 21, 2020)
0dfdbfd  Use Rally client factory + options (Jan 21, 2020)
7d47ae9  Fix name + description of default challenge. (Jan 22, 2020)
1ef7f2a  Drop extraneous operations. (Jan 22, 2020)
0467224  Fix divide-by-zero with small corpus. (Jan 23, 2020)
50c7f74  Move index_test to tests, add corpus test. (Feb 4, 2020)
ed0ab42  Add Tracker to docs. (Feb 7, 2020)
31d9624  Merge branch 'master' into tracker (Feb 7, 2020)
f5cfc1c  Add standard license header to __init__ files. (Feb 12, 2020)
8f9e846  Change package name from 'tracker' to 'estracker'. (Feb 13, 2020)
0848be0  Code review comments (Feb 12, 2020)
796c68f  More code review (Feb 14, 2020)
46dc2c5  Move to corpora vs multi-document corpus. (Feb 14, 2020)
140b9f2  Add basic integration test for Tracker. (Feb 14, 2020)
e7cdae4  Fixes uncovered by testing. (Feb 14, 2020)
4e5ebbb  Filter 'store' in settings, parameterize replicas + shards (Feb 14, 2020)
47cadb2  Race the new track in integration-test. (Feb 14, 2020)
340ddca  Refactor corpus.py, add testmode output + better testing (Feb 14, 2020)
4be9f72  Define template_vars later to avoid update(). (Feb 19, 2020)
988cdf8  Just make entire challenge template raw for now. (Feb 14, 2020)
99938b2  Fixes to integration-test: (Feb 19, 2020)
02441f4  Fix comma usage in track template. (Feb 19, 2020)
5249fa9  Add estracker tests to pytest tests/ estracker/tests/ (Feb 19, 2020)
df0f247  Merge branch 'master' into tracker (Feb 20, 2020)
6bfdc31  Quick fix for dumping whole index in test-mode. (Feb 20, 2020)
16097ac  Fix tracker integration test, add some additional checks. (Feb 21, 2020)
3e4fac1  Refactoring + improvements in corpus + track (Feb 21, 2020)
4e23575  Improve output prettiness by moving comma() to previous line (Feb 21, 2020)
c557a1e  Drop warmup-time-period. (Feb 21, 2020)
244a888  Fix update_index_setting_parameters, add test. (Feb 28, 2020)
f962abf  Change indices to csv_to_list format, require track-name when more th… (Feb 28, 2020)
3c9ed60  As extraction failures are warnings, we should still fail if all extr… (Feb 28, 2020)
0072116  Use compressed filename for all paths. (Feb 28, 2020)
59e9bb3  Fix trackname -> track-name and add example client-options (Feb 28, 2020)
1089724  Disallow hidden + empty index names (Feb 28, 2020)
3 changes: 3 additions & 0 deletions .gitignore
@@ -115,3 +115,6 @@ recipes/ccr/ccr-target-hosts.json
*~
/.project
/.pydevproject

# Tracker tracks
tracks/
2 changes: 1 addition & 1 deletion Makefile
@@ -88,7 +88,7 @@ tox-env-clean:
rm -rf .tox

lint: check-venv
@find esrally benchmarks scripts tests -name "*.py" -exec $(VEPYLINT) -j0 -rn --load-plugins pylint_quotes --rcfile=$(CURDIR)/.pylintrc \{\} +
@find esrally benchmarks scripts tests tracker -name "*.py" -exec $(VEPYLINT) -j0 -rn --load-plugins pylint_quotes --rcfile=$(CURDIR)/.pylintrc \{\} +

docs: check-venv
@. $(VENV_ACTIVATE_FILE); cd docs && $(MAKE) html
3 changes: 2 additions & 1 deletion setup.py
@@ -118,7 +118,8 @@ def str_from_file(name):
entry_points={
"console_scripts": [
"esrally=esrally.rally:main",
"esrallyd=esrally.rallyd:main"
"esrallyd=esrally.rallyd:main",
"tracker=tracker.tracker:main"
],
},
classifiers=[
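The new console_scripts entry point wires a tracker executable to tracker.tracker:main. That module is not part of the hunks shown in this diff, so the snippet below is only a hypothetical sketch of the shape a console_scripts target takes (an importable, argument-less callable); the option names are assumptions, not the PR's actual CLI.

# Hypothetical sketch only; tracker/tracker.py itself is not shown in this diff.
# A console_scripts target just needs an importable callable, conventionally main().
import argparse


def main():
    parser = argparse.ArgumentParser(description="Generate a Rally track from existing indices")
    parser.add_argument("--hosts", default="localhost:9200", help="assumed option name")
    parser.add_argument("--indices", required=True, help="assumed option name")
    args = parser.parse_args()
    print("would extract {} from {}".format(args.indices, args.hosts))


if __name__ == "__main__":
    main()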
Empty file added tracker/__init__.py
Empty file.
81 changes: 81 additions & 0 deletions tracker/corpus.py
@@ -0,0 +1,81 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import bz2
import json
import logging
import os
import pathlib

from elasticsearch import helpers


def template_vars(index_name, out_path, comp_outpath, doc_count):
corpus_path = pathlib.Path(out_path)
compressed_corpus_path = pathlib.Path(comp_outpath)
return {
"index_name": index_name,
"base_url": corpus_path.parent.as_uri(),
"filename": corpus_path.name,
"path": corpus_path,
"doc_count": doc_count,
"uncompressed_bytes": os.stat(corpus_path.as_posix()).st_size,
"compressed_bytes": os.stat(compressed_corpus_path.as_posix()).st_size
}


def extract(client, outdir, index):
"""
Scroll an index with a match-all query, dumping document source to
outdir/<index>-documents.json
:param client: Elasticsearch client to scroll
:param outdir: Destination directory for corpus dump
:param index: Name of index to dump
:return: dict of properties describing the corpus for templates
"""
outpath = os.path.join(outdir, "{}-documents.json".format(index))

total_docs = client.count(index=index)["count"]
logging.info("%d total docs in index %s", total_docs, index)
freq = total_docs // 1000

compressor = bz2.BZ2Compressor()
comp_outpath = outpath + ".bz2"

with open(outpath, "wb") as outfile:
with open(comp_outpath, "wb") as comp_outfile:
logging.info("Now dumping corpus to %s...", outpath)

query = {"query": {"match_all": {}}}
for n, doc in enumerate(helpers.scan(client, query=query, index=index)):
docsrc = doc["_source"]
data = (json.dumps(docsrc, separators=(',', ':')) + "\n").encode("utf-8")

outfile.write(data)
comp_outfile.write(compressor.compress(data))

render_progress(n+1, total_docs, freq)

print() # progress prints didn't have a newline
comp_outfile.write(compressor.flush())
return template_vars(index, outpath, comp_outpath, total_docs)


def render_progress(cur, total, freq):
if cur % freq == 0 or total - cur < freq:
percent = (cur * 100) / total
print("\r{n}/{total_docs} ({percent:.1f}%)".format(n=cur, total_docs=total, percent=percent), end="")
72 changes: 72 additions & 0 deletions tracker/index.py
@@ -0,0 +1,72 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import json
import os


INDEX_SETTINGS_EPHEMERAL_KEYS = ["uuid",
"creation_date",
"version",
"provided_name"]


def filter_ephemeral_index_settings(settings):
"""
Some of the 'settings' reported by Elasticsearch for an index are
ephemeral values, not useful for re-creating the index.
:param settings: Index settings reported by index.get()
:return: settings with ephemeral keys removed
"""
return {k: v for k, v in settings.items() if k not in INDEX_SETTINGS_EPHEMERAL_KEYS}


def extract_index_mapping_and_settings(client, index):
"""
Calls index GET to retrieve mapping + settings, filtering settings
so they can be used to re-create this index
:param client: Elasticsearch client
:param index: name of index
:return: index creation dictionary
"""
response = client.indices.get(index)
details = response[index]

mappings = details["mappings"]
index_settings = filter_ephemeral_index_settings(details["settings"]["index"])
return {"mappings": mappings, "settings": {"index": index_settings}}


def extract(client, outdir, index):
Member commented:
I think the function name is not particularly helpful in understanding what exactly is extracted?

"""
Request index mappings and settings and write them to "<index>.json" for Rally
:param client: Elasticsearch client
:param outdir: destination directory
:param index: name of index
:return: dict describing the generated index file (name, path, filename)
"""
filename = index + ".json"
index_obj = extract_index_mapping_and_settings(client, index)
outpath = os.path.join(outdir, filename)
with open(outpath, "w") as outfile:
json.dump(index_obj, outfile, indent=4, sort_keys=True)
outfile.write('\n')
return {
"name": index,
"path": outpath,
"filename": filename,
}
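A similar hedged sketch for index.extract, which writes <index>.json next to the corpus dump and returns the name/path/filename entries the templates reference; again, the directory and index name are placeholders.

# Illustrative usage of index.extract; directory and index name are assumptions.
import os

import elasticsearch

from tracker import index as index_info

client = elasticsearch.Elasticsearch(hosts=["localhost:9200"])
os.makedirs("tracks/geonames", exist_ok=True)
index_vars = index_info.extract(client, "tracks/geonames", "geonames")
# index_vars == {"name": "geonames", "path": "tracks/geonames/geonames.json",
#                "filename": "geonames.json"}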
101 changes: 101 additions & 0 deletions tracker/index_test.py
@@ -0,0 +1,101 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from unittest import mock
from tracker.index import filter_ephemeral_index_settings, extract_index_mapping_and_settings


def test_index_setting_filter():
unfiltered_index_settings = {
"number_of_shards": "5",
"provided_name": "queries",
"creation_date": "1579230289084",
"requests": {
"cache": {
"enable": "false"
}
},
"number_of_replicas": "0",
"queries": {
"cache": {
"enabled": "false"
}
},
"uuid": "jdzVt-dDS1aRlqdZWK4pdA",
"version": {
"created": "7050099"
}
}
settings = filter_ephemeral_index_settings(unfiltered_index_settings)
assert settings.keys() == {"number_of_shards", "number_of_replicas", "requests", "queries"}


@mock.patch("elasticsearch.Elasticsearch")
def test_extract_index_create(client):
client.indices.get.return_value = {
"osmgeopoints": {
"aliases": {},
"mappings": {
"dynamic": "strict",
"properties": {
"location": {
"type": "geo_point"
}
}
},
"settings": {
"index": {
"number_of_shards": "5",
"provided_name": "osmgeopoints",
"creation_date": "1579210032233",
"requests": {
"cache": {
"enable": "false"
}
},
"number_of_replicas": "0",
"uuid": "vOOsPNfxTJyQekkIo9TjPA",
"version": {
"created": "7050099"
}
}
}
}
}
expected = {
"mappings": {
"dynamic": "strict",
"properties": {
"location": {
"type": "geo_point"
}
}
},
"settings": {
"index": {
"number_of_replicas": "0",
"number_of_shards": "5",
"requests": {
"cache": {
"enable": "false"
}
}
}
}
}
res = extract_index_mapping_and_settings(client, "osmgeopoints")
assert res == expected
38 changes: 38 additions & 0 deletions tracker/resources/logging.json
@@ -0,0 +1,38 @@
{
"version": 1,
"formatters": {
"normal": {
"format": "%(asctime)s,%(msecs)d PID:%(process)d %(name)s %(levelname)s %(message)s",
"datefmt": "%Y-%m-%d %H:%M:%S",
"()": "esrally.log.configure_utc_formatter"
}
},
"handlers": {
"default_log_handler": {
Member commented:
file_log_handler?

Contributor Author replied:
Addressed in: 796c68f

"class": "logging.handlers.WatchedFileHandler",
"filename": "tracker.log",
"encoding": "UTF-8",
"formatter": "normal"
},
"console_handler": {
"class": "logging.StreamHandler",
"level": "INFO"
}
},
"root": {
"handlers": [
"default_log_handler",
"console_handler"
],
"level": "INFO"
},
"loggers": {
"elasticsearch": {
"handlers": [
"default_log_handler"
],
"level": "WARNING",
"propagate": false
}
}
}
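The file above is in the standard dictConfig format; a small sketch, not code from this PR, of how such a file can be loaded with the standard library (it assumes esrally is importable so the configure_utc_formatter factory referenced above resolves):

# Sketch only: load resources/logging.json through the stdlib dictConfig machinery.
import json
import logging.config

with open("tracker/resources/logging.json") as f:
    logging.config.dictConfig(json.load(f))

logging.getLogger("tracker").info("logging configured")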
49 changes: 49 additions & 0 deletions tracker/templates/challenges.json.j2
@@ -0,0 +1,49 @@
{
"name": "append-no-conflicts",
"description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. After that a couple of queries are run.",
"default": true,
"schedule": [
{
"operation": "delete-index"
},
{
"operation": {
"operation-type": "create-index",
"settings": {% raw %}{{index_settings | default({}) | tojson}}{% endraw %}
}
},
{
"operation": "index-append",
"warmup-time-period": 120,
"clients": {% raw %}{{bulk_indexing_clients | default(8)}}{% endraw %}
},
{
"name": "refresh-after-index",
"operation": "refresh",
"clients": 1
},
{
"operation": "force-merge",
"clients": 1
},
{
"name": "refresh-after-force-merge",
"operation": "refresh",
"clients": 1
},
{
"operation": "index-stats",
"clients": 1,
"warmup-iterations": 500,
"iterations": 1000,
"target-throughput": 90
},
{
"operation": "node-stats",
"clients": 1,
"warmup-iterations": 100,
"iterations": 1000,
"target-throughput": 90
}
]
}
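Note that the index_settings and bulk_indexing_clients expressions are wrapped in {% raw %} blocks, so generation-time rendering leaves them intact and Rally can substitute them later, for example via --track-params. A one-line sketch, not from the PR, of what raw does:

# Sketch: {% raw %} makes Jinja2 emit the inner expression literally instead of evaluating it.
import jinja2

template = jinja2.Template("{% raw %}{{bulk_indexing_clients | default(8)}}{% endraw %}")
print(template.render())  # prints: {{bulk_indexing_clients | default(8)}}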
16 changes: 16 additions & 0 deletions tracker/templates/operations.json.j2
@@ -0,0 +1,16 @@
{
"name": "index-append",
"operation-type": "bulk",
"bulk-size": {{bulk_size | default(5000)}},
"ingest-percentage": {{ingest_percentage | default(100)}}
},
{
"name": "index-update",
"operation-type": "bulk",
"bulk-size": {{bulk_size | default(5000)}},
"ingest-percentage": {{ingest_percentage | default(100)}},
"conflicts": "{{conflicts | default('random')}}",
"on-conflict": "{{on_conflict | default('index')}}",
"conflict-probability": {{conflict_probability | default(25)}},
"recency": {{recency | default(0)}}
}
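Unlike the challenge template, the operations template uses live Jinja variables with defaults. A quick sketch, not part of the PR, of rendering it standalone to inspect those defaults:

# Sketch: render operations.json.j2 with no parameters to see the default values applied.
import jinja2

with open("tracker/templates/operations.json.j2") as f:
    rendered = jinja2.Template(f.read()).render()
print(rendered)  # bulk-size falls back to 5000, ingest-percentage to 100, conflicts to 'random'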