From 33f855e6389132ffc8dfa755b72a78bb2cc8eb35 Mon Sep 17 00:00:00 2001 From: Dimitrios Liappis Date: Mon, 6 Apr 2020 15:53:18 +0300 Subject: [PATCH] Update corpora compressed size after re-compression with pbzip2 (#109) Update compressed-bytes for all corpora after re-compressing them using `pbzip2 -9 -v -k -m10000`. Together with https://github.com/elastic/rally/pull/947 this allows for much faster decompression utilizing all available CPU cores. --- eventdata/track.json | 2 +- geopoint/track.json | 2 +- geopointshape/track.json | 2 +- geoshape/track.json | 6 +++--- http_logs/track.json | 28 ++++++++++++++-------------- metricbeat/track.json | 4 ++-- nested/track.json | 2 +- noaa/track.json | 2 +- nyc_taxis/track.json | 2 +- percolator/track.json | 2 +- pmc/track.json | 2 +- so/track.json | 2 +- 12 files changed, 28 insertions(+), 28 deletions(-) diff --git a/eventdata/track.json b/eventdata/track.json index 39bdc442..44a11c52 100644 --- a/eventdata/track.json +++ b/eventdata/track.json @@ -17,7 +17,7 @@ { "source-file": "eventdata.json.bz2", "document-count": 20000000, - "compressed-bytes": 791796014, + "compressed-bytes": 792768300, "uncompressed-bytes": 16437108429 } ] diff --git a/geopoint/track.json b/geopoint/track.json index ea556aad..f5c8ea5b 100644 --- a/geopoint/track.json +++ b/geopoint/track.json @@ -17,7 +17,7 @@ { "source-file": "documents.json.bz2", "document-count": 60844404, - "compressed-bytes": 505295401, + "compressed-bytes": 505542241, "uncompressed-bytes": 2448564579 } ] diff --git a/geopointshape/track.json b/geopointshape/track.json index dbbf84f8..3fe6e0e7 100644 --- a/geopointshape/track.json +++ b/geopointshape/track.json @@ -17,7 +17,7 @@ { "source-file": "documents.json.bz2", "document-count": 60844404, - "compressed-bytes": 493367095, + "compressed-bytes": 493689712, "uncompressed-bytes": 2780550484 } ] diff --git a/geoshape/track.json b/geoshape/track.json index d3b87ae6..f8c6585f 100644 --- a/geoshape/track.json +++ b/geoshape/track.json @@ -26,7 +26,7 @@ { "source-file": "linestrings.json.bz2", "document-count": 20532036, - "compressed-bytes": 3697293598, + "compressed-bytes": 3698508764, "uncompressed-bytes": 12592499821 } ] @@ -39,7 +39,7 @@ { "source-file": "multilinestrings.json.bz2", "document-count": 532036, - "compressed-bytes": 1816588880, + "compressed-bytes": 1817213095, "uncompressed-bytes": 5992834062 } ] @@ -52,7 +52,7 @@ { "source-file": "polygons.json.bz2", "document-count": 39459211, - "compressed-bytes": 8835370788, + "compressed-bytes": 8837117359, "uncompressed-bytes": 30178820325 } ] diff --git a/http_logs/track.json b/http_logs/track.json index 137e899e..d099ea8c 100644 --- a/http_logs/track.json +++ b/http_logs/track.json @@ -44,49 +44,49 @@ "target-index": "logs-181998", "source-file": "documents-181998.unparsed.json.bz2", "document-count": 2708746, - "compressed-bytes": 13064317, + "compressed-bytes": 13088137, "uncompressed-bytes": 303920342 }, { "target-index": "logs-191998", "source-file": "documents-191998.unparsed.json.bz2", "document-count": 9697882, - "compressed-bytes": 47211781, + "compressed-bytes": 47290776, "uncompressed-bytes": 1088378738 }, { "target-index": "logs-201998", "source-file": "documents-201998.unparsed.json.bz2", "document-count": 13053463, - "compressed-bytes": 63174979, + "compressed-bytes": 63278452, "uncompressed-bytes": 1456836090 }, { "target-index": "logs-211998", "source-file": "documents-211998.unparsed.json.bz2", "document-count": 17647279, - "compressed-bytes": 85607179, + "compressed-bytes": 85739523, "uncompressed-bytes": 1975990671 }, { "target-index": "logs-221998", "source-file": "documents-221998.unparsed.json.bz2", "document-count": 10716760, - "compressed-bytes": 53190976, + "compressed-bytes": 53264421, "uncompressed-bytes": 1202551382 }, { "target-index": "logs-231998", "source-file": "documents-231998.unparsed.json.bz2", "document-count": 11961342, - "compressed-bytes": 60705435, + "compressed-bytes": 60795929, "uncompressed-bytes": 1334381144 }, { "target-index": "logs-241998", "source-file": "documents-241998.unparsed.json.bz2", "document-count": 181463624, - "compressed-bytes": 897719968, + "compressed-bytes": 899190175, "uncompressed-bytes": 20563705716 } ] @@ -100,49 +100,49 @@ "target-index": "logs-181998", "source-file": "documents-181998.json.bz2", "document-count": 2708746, - "compressed-bytes": 13815456, + "compressed-bytes": 13843641, "uncompressed-bytes": 363512754 }, { "target-index": "logs-191998", "source-file": "documents-191998.json.bz2", "document-count": 9697882, - "compressed-bytes": 49439633, + "compressed-bytes": 49546887, "uncompressed-bytes": 1301732149 }, { "target-index": "logs-201998", "source-file": "documents-201998.json.bz2", "document-count": 13053463, - "compressed-bytes": 65623436, + "compressed-bytes": 65759419, "uncompressed-bytes": 1744012279 }, { "target-index": "logs-211998", "source-file": "documents-211998.json.bz2", "document-count": 17647279, - "compressed-bytes": 88258230, + "compressed-bytes": 88445049, "uncompressed-bytes": 2364230815 }, { "target-index": "logs-221998", "source-file": "documents-221998.json.bz2", "document-count": 10716760, - "compressed-bytes": 54160603, + "compressed-bytes": 54274027, "uncompressed-bytes": 1438320123 }, { "target-index": "logs-231998", "source-file": "documents-231998.json.bz2", "document-count": 11961342, - "compressed-bytes": 60927822, + "compressed-bytes": 61043842, "uncompressed-bytes": 1597530673 }, { "target-index": "logs-241998", "source-file": "documents-241998.json.bz2", "document-count": 181463624, - "compressed-bytes": 905378242, + "compressed-bytes": 907295259, "uncompressed-bytes": 24555905444 } ] diff --git a/metricbeat/track.json b/metricbeat/track.json index bef54c38..a880a4b5 100644 --- a/metricbeat/track.json +++ b/metricbeat/track.json @@ -16,8 +16,8 @@ { "source-file": "documents.json.bz2", "document-count": 1079600, - "compressed-bytes":91887122, - "uncompressed-bytes":1249705758 + "compressed-bytes": 91964149, + "uncompressed-bytes": 1249705758 } ] } diff --git a/nested/track.json b/nested/track.json index 939d1f36..b4d80579 100644 --- a/nested/track.json +++ b/nested/track.json @@ -17,7 +17,7 @@ { "source-file": "documents.json.bz2", "document-count": 11203029, - "compressed-bytes": 695293381, + "compressed-bytes": 695550727, "uncompressed-bytes": 3637747670 } ] diff --git a/noaa/track.json b/noaa/track.json index f623c163..3786f9aa 100644 --- a/noaa/track.json +++ b/noaa/track.json @@ -17,7 +17,7 @@ { "source-file": "documents.json.bz2", "document-count": 33659481, - "compressed-bytes": 993302204, + "compressed-bytes": 995480468, "uncompressed-bytes": 9684262698 } ] diff --git a/nyc_taxis/track.json b/nyc_taxis/track.json index 35e35385..a91877a0 100644 --- a/nyc_taxis/track.json +++ b/nyc_taxis/track.json @@ -18,7 +18,7 @@ "source-file": "documents.json.bz2", "#COMMENT": "ML benchmark rely on the fact that the document count stays constant.", "document-count": 165346692, - "compressed-bytes": 4812721501, + "compressed-bytes": 4820107188, "uncompressed-bytes": 79802445255 } ] diff --git a/percolator/track.json b/percolator/track.json index 404f8eaa..f8793099 100644 --- a/percolator/track.json +++ b/percolator/track.json @@ -17,7 +17,7 @@ { "source-file": "queries-2.json.bz2", "document-count": 2000000, - "compressed-bytes": 105192, + "compressed-bytes": 124009, "uncompressed-bytes": 110039748 } ] diff --git a/pmc/track.json b/pmc/track.json index a9137422..c66be8f0 100644 --- a/pmc/track.json +++ b/pmc/track.json @@ -17,7 +17,7 @@ { "source-file": "documents.json.bz2", "document-count": 574199, - "compressed-bytes": 5928712141, + "compressed-bytes": 5931724449, "uncompressed-bytes": 23256051757 } ] diff --git a/so/track.json b/so/track.json index f9e3c179..b53b81e5 100644 --- a/so/track.json +++ b/so/track.json @@ -17,7 +17,7 @@ { "source-file": "posts.json.bz2", "document-count": 36062278, - "compressed-bytes": 9599137228, + "compressed-bytes": 9600716233, "uncompressed-bytes": 35564808298 } ]