diff --git a/src/objects.py b/src/objects.py index 1947fb76..e1a6efb8 100644 --- a/src/objects.py +++ b/src/objects.py @@ -97,6 +97,13 @@ def append(self, new_element): # CPUs are cheap self.queries.sort(key=lambda q: q.query_hash) + def find_query_by_hash(self, query_hash): + for query in self.queries: + if query.query_hash == query_hash: + return query + + return None + class EPNode: def __init__(self): diff --git a/src/reports/abstract.py b/src/reports/abstract.py index a31689f0..342de757 100644 --- a/src/reports/abstract.py +++ b/src/reports/abstract.py @@ -37,6 +37,7 @@ def __init__(self): if not os.path.isdir(f"report/{self.start_date}"): os.mkdir(f"report/{self.start_date}") + os.mkdir(f"report/{self.start_date}/imgs") def get_report_name(self): return "" diff --git a/src/reports/adoc/comparison.py b/src/reports/adoc/comparison.py deleted file mode 100644 index 830b59b1..00000000 --- a/src/reports/adoc/comparison.py +++ /dev/null @@ -1,139 +0,0 @@ -from sql_formatter.core import format_sql - -from objects import CollectResult, Query -from reports.abstract import Report - - -class ComparisonReport(Report): - def __init__(self): - super().__init__() - - self.queries = {} - - @classmethod - def generate_report(cls, - loq_yb: CollectResult, - loq_pg: CollectResult): - report = ComparisonReport() - - report.define_version(loq_yb.db_version, loq_pg.db_version) - report.report_model(loq_yb.model_queries) - report.report_config(loq_yb.config, "YB") - report.report_config(loq_pg.config, "PG") - - for query in zip(loq_yb.queries, loq_pg.queries): - report.add_query(*query) - - report.build_report() - report.publish_report("cmp") - - def get_report_name(self): - return "Comparison" - - def define_version(self, first_version, second_version): - self.report += f"[VERSION]\n====\nYugabyte:\n{first_version}\n\nPostgres:\n{second_version}\n====\n\n" - - def add_query(self, first_query: Query, second_query: Query): - if first_query.tag not in self.queries: - self.queries[first_query.tag] = [[first_query, second_query], ] - else: - self.queries[first_query.tag].append([first_query, second_query]) - - def build_report(self): - # link to top - self.report += "\n[#top]\n== Summary\n" - - num_columns = 5 - self._start_table("1,1,1,1,4") - self.report += "|Yugabyte|Postgres|Ratio vs Postgres|Ratio vs Postgres x3|Query\n" - for tag, queries in self.queries.items(): - self.report += f"{num_columns}+m|{tag}.sql\n" - for query in queries: - ratio = "{:.2f}".format(query[0].execution_time_ms / query[1].execution_time_ms if query[1].execution_time_ms != 0 else 99999999) - ratio_x3 = query[0].execution_time_ms / (3 * query[1].execution_time_ms) if query[1].execution_time_ms != 0 else 99999999 - ratio_x3_str = "{:.2f}".format(query[0].execution_time_ms / (3 * query[1].execution_time_ms) if query[1].execution_time_ms != 0 else 99999999) - color = "[green]" if ratio_x3 <= 1.0 else "[red]" - self.report += f"|{query[0].execution_time_ms}\n" \ - f"|{query[1].execution_time_ms}\n" \ - f"a|*{ratio}*\n" \ - f"a|{color}#*{ratio_x3_str}*#\n" - self.report += f"a|[#{query[0].query_hash}_top]\n<<{query[0].query_hash}>>\n" - self._start_source(["sql"]) - self.report += format_sql(query[1].query.replace("|", "\|")) - self._end_source() - self.report += "\n" - self._end_table_row() - self._end_table() - - # different results links - for tag in self.queries.keys(): - self.report += f"\n<<{tag}>>\n" - - for tag, queries in self.queries.items(): - self.report += f"\n[#{tag}]\n== {tag} queries file\n\n" - for query in queries: - self.__report_query(query[0], query[1]) - - # noinspection InsecureHash - def __report_query(self, yb_query: Query, pg_query: Query): - self.reported_queries_counter += 1 - - self.report += f"\n[#{yb_query.query_hash}]\n" - self.report += f"=== Query {yb_query.query_hash}" - self.report += f"\n{yb_query.tag}\n" - self.report += "\n<>\n" - self.report += f"\n<<{yb_query.query_hash}_top,Show in summary>>\n" - self._add_double_newline() - - self._start_source(["sql"]) - self.report += format_sql(yb_query.query.replace("|", "\|")) - self._end_source() - - self._add_double_newline() - - self._start_table("3") - self.report += "|Metric|Yugabyte|Postgres\n" - self._start_table_row() - self.report += f"Cardinality|{yb_query.result_cardinality}|{pg_query.result_cardinality}" - self._end_table_row() - self._start_table_row() - self.report += f"Estimated cost|{yb_query.execution_plan.get_estimated_cost()}|{pg_query.execution_plan.get_estimated_cost()}" - self._end_table_row() - self._start_table_row() - self.report += f"Execution time|{yb_query.execution_time_ms}|{pg_query.execution_time_ms}" - self._end_table_row() - self._end_table() - - self._start_table() - self._start_table_row() - - self._start_collapsible("Yugabyte version plan") - self._start_source(["diff"]) - self.report += yb_query.execution_plan.full_str - self._end_source() - self._end_collapsible() - - self._start_collapsible("Postgres version plan") - self._start_source(["diff"]) - self.report += pg_query.execution_plan.full_str - self._end_source() - self._end_collapsible() - - self._start_source(["diff"]) - - diff = self._get_plan_diff( - pg_query.execution_plan.full_str, - yb_query.execution_plan.full_str, - ) - if not diff: - diff = yb_query.execution_plan.full_str - - self.report += diff - self._end_source() - self._end_table_row() - - self.report += "\n" - - self._end_table() - - self._add_double_newline() diff --git a/src/reports/adoc/regression.py b/src/reports/adoc/regression.py index 389a0451..8d0c95d0 100644 --- a/src/reports/adoc/regression.py +++ b/src/reports/adoc/regression.py @@ -1,6 +1,10 @@ from dataclasses import dataclass +from typing import Type +import numpy as np from sql_formatter.core import format_sql +from matplotlib import pyplot as plt +from matplotlib import rcParams from objects import CollectResult, Query from reports.abstract import Report @@ -35,8 +39,8 @@ def generate_report(cls, report.define_version_names(v1_name, v2_name) report.define_version(loq_v1.db_version, loq_v2.db_version) report.report_model(loq_v1.model_queries) - report.report_config(loq_v1.config, "YB") - report.report_config(loq_v2.config, "PG") + report.report_config(loq_v1.config, v1_name) + report.report_config(loq_v2.config, v2_name) for query in zip(loq_v1.queries, loq_v2.queries): if query[0].query_hash != query[1].query_hash: @@ -45,7 +49,9 @@ def generate_report(cls, report.add_query(*query) report.build_report() - report.publish_report("reg") + report.build_xls_report() + + report.publish_report("regression") report.publish_short_report() def get_report_name(self): @@ -61,6 +67,120 @@ def add_query(self, first_query: Query, second_query: Query): else: self.queries[first_query.tag].append([first_query, second_query]) + def create_query_plot(self, best_optimization, optimizations, query, postfix_name): + if not optimizations: + return "NO PLOT" + + rcParams['font.family'] = 'serif' + rcParams['font.size'] = 6 + plt.xlabel('Execution time [ms]') + plt.ylabel('Predicted cost') + + plt.plot([q.execution_time_ms for q in optimizations if q.execution_time_ms != 0], + [q.execution_plan.get_estimated_cost() for q in optimizations if + q.execution_time_ms != 0], 'k.', + [query.execution_time_ms], + [query.execution_plan.get_estimated_cost()], 'r^', + [best_optimization.execution_time_ms], + [best_optimization.execution_plan.get_estimated_cost()], 'go') + + file_name = f'imgs/query_{self.reported_queries_counter}_{postfix_name}.png' + plt.savefig(f"report/{self.start_date}/{file_name}", dpi=300) + plt.close() + + return file_name + + @staticmethod + def fix_last_newline_in_result(result, rows): + if result: + splitted_result = result.split("\n") + result = "\n".join(splitted_result[:-1]) + last_newline = splitted_result[-1] + rows[0] = f"{last_newline}{rows[0]}" + result += "\n" + return result + + def __report_heatmap(self, query: Type[Query]): + """ + Here is the deal. In v2 plans we can separate each plan tree node by splitting by `->` + When constructing heatmap need to add + or - to the beginning of string `\n`. + So there is 2 splitters - \n and -> and need to construct correct result. + + :param query: + :return: + """ + if not (execution_plan_heatmap := query.heatmap()): + return + + best_decision = max(row['weight'] for row in execution_plan_heatmap.values()) + last_rowid = max(execution_plan_heatmap.keys()) + result = "" + for row_id, row in execution_plan_heatmap.items(): + rows = row['str'].split("\n") + + if row['weight'] == best_decision: + result = self.fix_last_newline_in_result(result, rows) + result += "\n".join([f"+{line}" for line_id, line in enumerate(rows) if + line_id != (len(rows) - 1)]) + f"\n{rows[-1]}" + elif row['weight'] == 0: + result = self.fix_last_newline_in_result(result, rows) + result += "\n".join([f"-{line}" for line_id, line in enumerate(rows) if + line_id != (len(rows) - 1)]) + f"\n{rows[-1]}" + else: + result += f"{row['str']}" + + # skip adding extra -> to the end of list + if row_id != last_rowid: + result += "->" + + self._start_collapsible("Plan heatmap") + self._start_source(["diff"]) + self.report += result + self._end_source() + self._end_collapsible() + + @staticmethod + def generate_regression_and_standard_errors(x_data, y_data): + x = np.array(x_data) + y = np.array(y_data) + n = x.size + + a, b = np.polyfit(x, y, deg=1) + y_est = a * x + b + y_err = (y - y_est).std() * np.sqrt(1 / n + (x - x.mean()) ** 2 / np.sum((x - x.mean()) ** 2)) + + fig, ax = plt.subplots() + + plt.xlabel('Predicted cost') + plt.ylabel('Execution time [ms]') + + ax.plot(x, y_est, '-') + ax.fill_between(x, y_est - y_err, y_est + y_err, alpha=0.2) + ax.plot(x, y, 'k.') + + return fig + + def create_default_query_plots(self): + file_names = ['imgs/all_queries_defaults_yb_v1.png', + 'imgs/all_queries_defaults_yb_v2.png'] + + for i in range(2): + x_data = [] + y_data = [] + + for tag, queries in self.queries.items(): + for yb_pg_queries in queries: + query = yb_pg_queries[i] + if query.execution_time_ms: + x_data.append(query.execution_plan.get_estimated_cost()) + y_data.append(query.execution_time_ms) + + fig = self.generate_regression_and_standard_errors(x_data, y_data) + fig.savefig(f"report/{self.start_date}/{file_names[i]}", dpi=300) + plt.close() + + return file_names + def build_report(self): # link to top self.add_plan_comparison() @@ -69,37 +189,102 @@ def build_report(self): self.add_scanned_rows() self.add_peak_memory_collapsible() - self.report += "\n[#query_summary]\n== Query Summary\n" - num_columns = 4 - self._start_table("1,1,1,4") - self.report += f"|{self.v1_name}|{self.v2_name}|Ratio (Second/First)|Query\n" + self._start_table("2") + self.report += f"|{self.v1_name}|{self.v2_name}\n" + default_query_plots = self.create_default_query_plots() + self.report += f"a|image::{default_query_plots[0]}[{self.v1_name},align=\"center\"]\n" + self.report += f"a|image::{default_query_plots[1]}[{self.v2_name},align=\"center\"]\n" + self._end_table() + + self.report += "\n== QO score\n" + + yb_v1_bests = 0 + yb_v2_bests = 0 + qe_bests_geo = 1 + qo_yb_v1_bests = 1 + qa_yb_v2_bests = 1 + total = 0 + for queries in self.queries.values(): + for query in queries: + yb_v1_query = query[0] + yb_v2_query = query[1] + + yb_v1_best = yb_v1_query.get_best_optimization(self.config) + yb_v2_best = yb_v2_query.get_best_optimization(self.config) + + qe_bests_geo *= yb_v1_best.execution_time_ms / yb_v2_best.execution_time_ms + qo_yb_v1_bests *= (yb_v1_query.execution_time_ms if yb_v1_query.execution_time_ms > 0 else 1.0) / ( + yb_v1_best.execution_time_ms if yb_v1_best.execution_time_ms > 0 else 1) + qa_yb_v2_bests *= yb_v2_query.execution_time_ms / yb_v2_best.execution_time_ms if yb_v2_best.execution_time_ms != 0 else 9999999 + yb_v1_bests += 1 if yb_v1_query.compare_plans(yb_v1_best.execution_plan) else 0 + yb_v2_bests += 1 if yb_v2_query.compare_plans( + yb_v2_best.execution_plan) else 0 + + total += 1 + + self._start_table("4,1,1") + self.report += f"|Statistic|{self.v1_name}|{self.v2_name}\n" + self.report += f"|Best execution plan picked|{'{:.2f}'.format(float(yb_v1_bests) * 100 / total)}%|{'{:.2f}'.format(float(yb_v2_bests) * 100 / total)}%\n" + self.report += f"|Geomeric mean QE best\n2+m|{'{:.2f}'.format(qe_bests_geo ** (1 / total))}\n" + self.report += f"|Geomeric mean QO default vs best|{'{:.2f}'.format(qo_yb_v1_bests ** (1 / total))}|{'{:.2f}'.format(qa_yb_v2_bests ** (1 / total))}\n" + self._end_table() + + self.report += "\n[#top]\n== QE score\n" + + num_columns = 7 for tag, queries in self.queries.items(): + self._start_table("1,1,1,1,1,1,4") + self.report += f"|{self.v1_name}|{self.v1_name} Best|{self.v2_name}|{self.v2_name} Best" \ + f"|Ratio {self.v1_name} vs {self.v2_name}|Ratio Best {self.v1_name} vs {self.v2_name}|Query\n" self.report += f"{num_columns}+m|{tag}.sql\n" - for query_id, query in enumerate(queries): - same_plan = query[0].compare_plans(query[1].execution_plan) - color = "[green]" if same_plan else "[orange]" - ratio = "{:.2f}".format( - query[1].execution_time_ms / query[0].execution_time_ms - if query[0].execution_time_ms != 0 else 0) - - # insert anchor to the first query in file - self.report += "a|" - if query_id == 0: - self.report += f"[#{tag}]\n" - - # append all query stats - self.report += f"{query[0].execution_time_ms}\n" \ - f"|{query[1].execution_time_ms}\n" \ - f"a|{color}#*{ratio}*#\n" - self.report += f"a|[#{query[0].query_hash}_query]\n" \ - f"<>\n\n" \ - f"<<{query[0].query_hash}>>\n" + for query in queries: + yb_v1_query = query[0] + yb_v2_query = query[1] + + yb_v1_best = yb_v1_query.get_best_optimization(self.config) + yb_v2_best = yb_v2_query.get_best_optimization(self.config) + + yb_ = yb_v2_query.execution_time_ms != 0 + + default_v1_equality = "[green]" if yb_v1_query.compare_plans( + yb_v1_best.execution_plan) else "[red]" + default_v2_equality = "[green]" if yb_ and yb_v2_query.compare_plans( + yb_v2_best.execution_plan) else "[red]" + + best_yb_pg_equality = "(eq) " if yb_v1_best.compare_plans( + yb_v2_best.execution_plan) else "" + + ratio_x3 = yb_v1_query.execution_time_ms / ( + 3 * yb_v2_query.execution_time_ms) if yb_v2_query.execution_time_ms != 0 else 99999999 + ratio_x3_str = "{:.2f}".format( + yb_v1_query.execution_time_ms / yb_v2_query.execution_time_ms + if yb_v2_query.execution_time_ms != 0 else 99999999) + ratio_color = "[green]" if ratio_x3 <= 1.0 else "[red]" + + ratio_best = yb_v1_best.execution_time_ms / ( + 3 * yb_v2_best.execution_time_ms) \ + if yb_v1_best.execution_time_ms != 0 and yb_ else 99999999 + ratio_best_x3_str = "{:.2f}".format( + yb_v1_best.execution_time_ms / yb_v2_best.execution_time_ms + if yb_v1_best.execution_time_ms != 0 and yb_ else 99999999) + ratio_best_color = "[green]" if ratio_best <= 1.0 else "[red]" + + bitmap_flag = "[blue]" if yb_ and "bitmap" in yb_v2_query.execution_plan.full_str.lower() else "[black]" + + self.report += f"a|[black]#*{'{:.2f}'.format(yb_v1_query.execution_time_ms)}*#\n" \ + f"a|{default_v1_equality}#*{'{:.2f}'.format(yb_v1_best.execution_time_ms)}*#\n" \ + f"a|{bitmap_flag}#*{'{:.2f}'.format(yb_v2_query.execution_time_ms)}*#\n" \ + f"a|{default_v2_equality}#*{'{:.2f}'.format(yb_v2_best.execution_time_ms)}*#\n" \ + f"a|{ratio_color}#*{ratio_x3_str}*#\n" \ + f"a|{ratio_best_color}#*{best_yb_pg_equality}{ratio_best_x3_str}*#\n" + self.report += f"a|[#{yb_v1_query.query_hash}_top]\n<<{yb_v1_query.query_hash}>>\n" self._start_source(["sql"]) - self.report += format_sql(query[1].query.replace("|", "\|")) + self.report += format_sql(yb_v2_query.query.replace("|", "\|")) self._end_source() self.report += "\n" self._end_table_row() - self._end_table() + + self._end_table() for tag, queries in self.queries.items(): self.report += f"\n== {tag} queries file\n\n" @@ -186,56 +371,122 @@ def add_peak_memory_collapsible(self): self._end_collapsible() # noinspection InsecureHash - def __report_query(self, first_query: Query, second_query: Query): + def __report_query(self, v1_query: Type[Query], v2_query: Type[Query]): + v1_best = v1_query.get_best_optimization(self.config) + v2_best = v2_query.get_best_optimization(self.config) + self.reported_queries_counter += 1 - self.report += f"\n[#{first_query.query_hash}]\n" - self.report += f"=== Query {first_query.query_hash}" - self.report += f"\nTags: `{first_query.tag}`\n" - self.report += "\n<>\n" - self.report += "\n<>\n" - self.report += f"\n<<{first_query.query_hash}_query,Show in query summary>>\n" + self.report += f"\n[#{v1_query.query_hash}]\n" + self.report += f"=== Query {v1_query.query_hash}" + self.report += f"\n{v1_query.tag}\n" + self.report += "\n<>\n" + self.report += f"\n<<{v1_query.query_hash}_top,Show in summary>>\n" self._add_double_newline() self._start_source(["sql"]) - self.report += format_sql(first_query.query.replace("|", "\|")) + self.report += format_sql(v1_query.query.replace("|", "\|")) self._end_source() + self._start_table("2") + self.report += f"|{self.v1_name}|{self.v2_name}\n" + v1_query_plot = self.create_query_plot(v1_best, v1_query.optimizations, v1_query, "v1") + v2_query_plot = self.create_query_plot(v2_best, v2_query.optimizations, v2_query, "v2") + self.report += f"a|image::{v1_query_plot}[{self.v1_name},align=\"center\"]\n" + self.report += f"a|image::{v2_query_plot}[{self.v2_name},align=\"center\"]\n" + self._end_table() + + self._add_double_newline() + self._add_double_newline() + default_v1_equality = "(eq) " if v1_query.compare_plans( + v1_best.execution_plan) else "" + + self._start_table("5") + self.report += f"|Metric|{self.v1_name}|{self.v1_name} Best|{self.v2_name}|{self.v2_name} Best\n" + + default_v2_equality = "(eq) " if v2_query.compare_plans( + v2_best.execution_plan) else "" + best_yb_pg_equality = "(eq) " if v1_best.compare_plans( + v2_best.execution_plan) else "" + default_v1_v2_equality = "(eq) " if v1_query.compare_plans( + v2_query.execution_plan) else "" + + if 'order by' in v1_query.query: + self._start_table_row() + self.report += \ + f"!! Result hash|{v1_query.result_hash}|{v1_best.result_hash}|{v2_query.result_hash}|{v2_best.result_hash}" \ + if v2_query.result_hash != v1_query.result_hash else \ + f"Result hash|`{v1_query.result_hash}|{v1_best.result_hash}|{v2_query.result_hash}|{v2_best.result_hash}" + self._end_table_row() - self._start_table("3") - self.report += f"|Metric|{self.v1_name}|{self.v2_name}\n" self._start_table_row() - self.report += f"Cardinality|{first_query.result_cardinality}|{second_query.result_cardinality}" + self.report += f"Cardinality|{v1_query.result_cardinality}|{v1_best.result_cardinality}|{v2_query.result_cardinality}|{v2_best.result_cardinality}" self._end_table_row() self._start_table_row() - self.report += f"Optimizer cost|{first_query.execution_plan.get_estimated_cost()}|{second_query.execution_plan.get_estimated_cost()}" + self.report += f"Estimated cost|{v1_query.execution_plan.get_estimated_cost()}|{default_v1_equality}{v1_best.execution_plan.get_estimated_cost()}|{v2_query.execution_plan.get_estimated_cost()}|{default_v2_equality}{v2_best.execution_plan.get_estimated_cost()}" self._end_table_row() self._start_table_row() - self.report += f"Execution time|{first_query.execution_time_ms}|{second_query.execution_time_ms}" + self.report += f"Execution time|{'{:.2f}'.format(v1_query.execution_time_ms)}|{default_v1_equality}{'{:.2f}'.format(v1_best.execution_time_ms)}|{'{:.2f}'.format(v2_query.execution_time_ms)}|{default_v2_equality}{'{:.2f}'.format(v2_best.execution_time_ms)}" self._end_table_row() + self._end_table() self._start_table() self._start_table_row() - self._start_collapsible(f"{self.v1_name} version plan") + self._start_collapsible(f"{self.v1_name} default plan") + self._start_source(["diff"]) + self.report += v1_query.execution_plan.full_str + self._end_source() + self._end_collapsible() + + self._start_collapsible(f"{self.v1_name} best plan") + self._start_source(["diff"]) + self.report += v1_best.execution_plan.full_str + self._end_source() + self._end_collapsible() + + self._start_collapsible(f"{self.v2_name} default plan") + self._start_source(["diff"]) + self.report += v2_query.execution_plan.full_str + self._end_source() + self._end_collapsible() + + v2_best = v2_query.get_best_optimization(self.config) + self._start_collapsible(f"{default_v2_equality}{self.v2_name} best plan") self._start_source(["diff"]) - self.report += first_query.execution_plan.full_str + self.report += v2_best.execution_plan.full_str self._end_source() self._end_collapsible() - self._start_collapsible(f"{self.v2_name} version plan") + self._start_collapsible(f"{default_v1_v2_equality}{self.v1_name} default vs {self.v2_name} default") self._start_source(["diff"]) - self.report += second_query.execution_plan.full_str + + self.report += self._get_plan_diff( + v1_query.execution_plan.full_str, + v2_query.execution_plan.full_str, + ) self._end_source() self._end_collapsible() + self._start_collapsible(f"{best_yb_pg_equality}{self.v2_name} best vs {self.v1_name} best") self._start_source(["diff"]) + self.report += self._get_plan_diff( + v2_best.execution_plan.full_str, + v1_best.execution_plan.full_str, + ) + self._end_source() + self._end_collapsible() - diff = self._get_plan_diff(first_query.execution_plan.full_str, second_query.execution_plan.full_str) + self.report += f"{default_v1_equality}{self.v1_name} vs {self.v2_name}\n" + self._start_source(["diff"]) + diff = self._get_plan_diff( + v1_query.execution_plan.full_str, + v2_query.execution_plan.full_str + ) if not diff: - diff = first_query.execution_plan.full_str + diff = v1_query.execution_plan.full_str self.report += diff self._end_source() @@ -247,6 +498,51 @@ def __report_query(self, first_query: Query, second_query: Query): self._add_double_newline() + def build_xls_report(self): + import xlsxwriter + + workbook = xlsxwriter.Workbook(f'report/{self.start_date}/report_regression.xls') + worksheet = workbook.add_worksheet() + + head_format = workbook.add_format() + head_format.set_bold() + head_format.set_bg_color('#999999') + + eq_format = workbook.add_format() + eq_format.set_bold() + eq_format.set_bg_color('#d9ead3') + + eq_bad_format = workbook.add_format() + eq_bad_format.set_bold() + eq_bad_format.set_bg_color('#fff2cc') + + worksheet.write(0, 0, "First", head_format) + worksheet.write(0, 1, "Second", head_format) + worksheet.write(0, 2, "Ratio", head_format) + worksheet.write(0, 3, "Query", head_format) + worksheet.write(0, 4, "Query Hash", head_format) + + row = 1 + # Iterate over the data and write it out row by row. + for tag, queries in self.queries.items(): + for query in queries: + first_query: Query = query[0] + second_query: Query = query[1] + + ratio = second_query.execution_time_ms / ( + first_query.execution_time_ms) if first_query.execution_time_ms != 0 else 99999999 + ratio_color = eq_bad_format if ratio > 1.0 else eq_format + + worksheet.write(row, 0, '{:.2f}'.format(first_query.execution_time_ms)) + worksheet.write(row, 1, + f"{'{:.2f}'.format(second_query.execution_time_ms)}") + worksheet.write(row, 2, f'{ratio}', ratio_color) + worksheet.write(row, 3, f'{format_sql(first_query.query)}') + worksheet.write(row, 4, f'{first_query.query_hash}') + row += 1 + + workbook.close() + def define_version_names(self, v1_name, v2_name): self.v1_name = v1_name self.v2_name = v2_name @@ -258,4 +554,3 @@ def publish_short_report(self): short_summary.write(f"Changed RPC calls: {self.short_summary.diff_rpc_calls}\n") short_summary.write(f"Changed RPC wait times: {self.short_summary.diff_wait_times}\n") short_summary.write(f"Changed peak memory: {self.short_summary.diff_peak_memory}\n") - diff --git a/src/reports/adoc/score.py b/src/reports/adoc/score.py index a9e06a9a..c58d690f 100644 --- a/src/reports/adoc/score.py +++ b/src/reports/adoc/score.py @@ -1,5 +1,3 @@ -import os - import numpy as np from typing import Type @@ -7,6 +5,7 @@ from matplotlib import rcParams from sql_formatter.core import format_sql +from db.postgres import PostgresQuery from objects import CollectResult, Query from reports.abstract import Report from utils import allowed_diff @@ -16,8 +15,6 @@ class ScoreReport(Report): def __init__(self): super().__init__() - os.mkdir(f"report/{self.start_date}/imgs") - self.queries = {} self.overall_plots = { 'color': 'k.', @@ -35,9 +32,11 @@ def generate_report(cls, loq: CollectResult, pg_loq: CollectResult = None): report.report_config(pg_loq.config, "PG") for qid, query in enumerate(loq.queries): - report.add_query(query, pg_loq.queries[qid] if pg_loq else None) + report.add_query(query, pg_loq.find_query_by_hash(query.query_hash) if pg_loq else None) report.build_report() + report.build_xls_report() + report.publish_report("score") def get_report_name(self): @@ -65,13 +64,14 @@ def create_default_query_plots(self): for tag, queries in self.queries.items(): for yb_pg_queries in queries: query = yb_pg_queries[i] - if query.execution_time_ms: + if query and query.execution_time_ms: x_data.append(query.execution_plan.get_estimated_cost()) y_data.append(query.execution_time_ms) - fig = self.generate_regression_and_standard_errors(x_data, y_data) - fig.savefig(f"report/{self.start_date}/{file_names[i]}", dpi=300) - plt.close() + if x_data and y_data: + fig = self.generate_regression_and_standard_errors(x_data, y_data) + fig.savefig(f"report/{self.start_date}/{file_names[i]}", dpi=300) + plt.close() return file_names @@ -156,9 +156,8 @@ def build_report(self): pg_success = pg_query.execution_time_ms != 0 qe_bests_geo *= yb_best.execution_time_ms / pg_best.execution_time_ms if pg_success else 1 - qo_yb_bests_geo *= ( - yb_query.execution_time_ms if yb_query.execution_time_ms > 0 else 1.0) / ( - yb_best.execution_time_ms if yb_best.execution_time_ms > 0 else 1) + qo_yb_bests_geo *= (yb_query.execution_time_ms if yb_query.execution_time_ms > 0 else 1.0) / \ + (yb_best.execution_time_ms if yb_best.execution_time_ms > 0 else 1) qo_pg_bests_geo *= pg_query.execution_time_ms / pg_best.execution_time_ms if pg_best.execution_time_ms != 0 else 9999999 yb_bests += 1 if yb_query.compare_plans(yb_best.execution_plan) else 0 pg_bests += 1 if pg_success and pg_query.compare_plans( @@ -249,6 +248,126 @@ def __report_near_queries(self, query: Type[Query]): self.report += add_to_report self._end_collapsible() + def build_xls_report(self): + import xlsxwriter + + workbook = xlsxwriter.Workbook(f'report/{self.start_date}/report_score.xls') + worksheet = workbook.add_worksheet() + + head_format = workbook.add_format() + head_format.set_bold() + head_format.set_bg_color('#999999') + + eq_format = workbook.add_format() + eq_format.set_bold() + eq_format.set_bg_color('#d9ead3') + + eq_bad_format = workbook.add_format() + eq_bad_format.set_bold() + eq_bad_format.set_bg_color('#fff2cc') + + eq_good_format = workbook.add_format() + eq_good_format.set_bold() + eq_good_format.set_bg_color('#d9ead3') + + bm_format = workbook.add_format() + bm_format.set_bold() + bm_format.set_bg_color('#cfe2f3') + + pg_comparison_format = workbook.add_format() + pg_comparison_format.set_bold() + pg_comparison_format.set_bg_color('#fce5cd') + + # Start from the first cell. Rows and columns are zero indexed. + yb_bests = 0 + pg_bests = 0 + total = 0 + for queries in self.queries.values(): + for query in queries: + yb_query = query[0] + pg_query = query[1] + + yb_best = yb_query.get_best_optimization(self.config, ) + pg_best = pg_query.get_best_optimization(self.config, ) + + yb_bests += 1 if yb_query.compare_plans(yb_best.execution_plan) else 0 + pg_bests += 1 if pg_query.compare_plans(pg_best.execution_plan) else 0 + + total += 1 + + worksheet.write(0, 0, "YB", head_format) + worksheet.write(0, 1, "YB Best", head_format) + worksheet.write(0, 2, "PG", head_format) + worksheet.write(0, 3, "PG Best", head_format) + worksheet.write(0, 4, "Ratio YB vs PG", head_format) + worksheet.write(0, 5, "Best YB vs PG", head_format) + worksheet.write(0, 6, "Query", head_format) + worksheet.write(0, 7, "Query Hash", head_format) + + row = 1 + # Iterate over the data and write it out row by row. + for tag, queries in self.queries.items(): + for query in queries: + yb_query: PostgresQuery = query[0] + pg_query: PostgresQuery = query[1] + + yb_best = yb_query.get_best_optimization(self.config, ) + pg_best = pg_query.get_best_optimization(self.config, ) + + default_yb_equality = yb_query.compare_plans(yb_best.execution_plan) + default_pg_equality = pg_query.compare_plans(pg_best.execution_plan) + + default_yb_pg_equality = yb_query.compare_plans(pg_query.execution_plan) + best_yb_pg_equality = yb_best.compare_plans(pg_best.execution_plan) + + ratio_x3 = yb_query.execution_time_ms / ( + 3 * pg_query.execution_time_ms) if pg_query.execution_time_ms != 0 else 99999999 + ratio_x3_str = "{:.2f}".format( + yb_query.execution_time_ms / pg_query.execution_time_ms if pg_query.execution_time_ms != 0 else 99999999) + ratio_color = ratio_x3 > 1.0 + + ratio_best = yb_best.execution_time_ms / ( + 3 * pg_best.execution_time_ms) if yb_best.execution_time_ms != 0 else 99999999 + ratio_best_x3_str = "{:.2f}".format( + yb_best.execution_time_ms / pg_best.execution_time_ms if yb_best.execution_time_ms != 0 else 99999999) + ratio_best_color = ratio_best > 1.0 + + bitmap_flag = "bitmap" in pg_query.execution_plan.full_str.lower() + + best_pg_format = None + if ratio_best_color and best_yb_pg_equality: + best_pg_format = eq_bad_format + elif best_yb_pg_equality: + best_pg_format = eq_good_format + elif ratio_best_color: + best_pg_format = pg_comparison_format + + df_pf_format = None + if ratio_color and default_yb_pg_equality: + df_pf_format = eq_bad_format + elif default_yb_pg_equality: + df_pf_format = eq_good_format + elif ratio_color: + df_pf_format = pg_comparison_format + + worksheet.write(row, 0, '{:.2f}'.format(yb_query.execution_time_ms)) + worksheet.write(row, 1, + f"{'{:.2f}'.format(yb_best.execution_time_ms)}", + eq_format if default_yb_equality else None) + worksheet.write(row, 2, + f"{'{:.2f}'.format(pg_query.execution_time_ms)}", + bm_format if bitmap_flag else None) + worksheet.write(row, 3, + f"{'{:.2f}'.format(pg_best.execution_time_ms)}", + eq_format if default_pg_equality else None) + worksheet.write(row, 4, f"{ratio_x3_str}", df_pf_format) + worksheet.write(row, 5, f"{ratio_best_x3_str}", best_pg_format) + worksheet.write(row, 6, f'{format_sql(pg_query.query)}') + worksheet.write(row, 7, f'{pg_query.query_hash}') + row += 1 + + workbook.close() + def __report_heatmap(self, query: Type[Query]): """ Here is the deal. In PG plans we can separate each plan tree node by splitting by `->` diff --git a/src/reports/xls/regression.py b/src/reports/xls/regression.py deleted file mode 100644 index 314edd33..00000000 --- a/src/reports/xls/regression.py +++ /dev/null @@ -1,81 +0,0 @@ -from typing import Type - -from sql_formatter.core import format_sql - -from objects import CollectResult, Query -from reports.abstract import Report - - -class RegressionXlsReport(Report): - def __init__(self): - super().__init__() - - self.logger.info(f"Created report folder for this run at 'report/{self.start_date}'") - - self.queries = {} - - @classmethod - def generate_report(cls, first_loq: CollectResult, second_loq: CollectResult): - report = RegressionXlsReport() - - for qid, query in enumerate(first_loq.queries): - report.add_query(query, second_loq.queries[qid]) - - report.build_report() - - def get_report_name(self): - return "regression" - - def define_version(self, version): - pass - - def add_query(self, query: Type[Query], pg: Type[Query] | None): - if query.tag not in self.queries: - self.queries[query.tag] = [[query, pg], ] - else: - self.queries[query.tag].append([query, pg]) - - def build_report(self): - import xlsxwriter - - workbook = xlsxwriter.Workbook(f'report/{self.start_date}/report_regression.xls') - worksheet = workbook.add_worksheet() - - head_format = workbook.add_format() - head_format.set_bold() - head_format.set_bg_color('#999999') - - eq_format = workbook.add_format() - eq_format.set_bold() - eq_format.set_bg_color('#d9ead3') - - eq_bad_format = workbook.add_format() - eq_bad_format.set_bold() - eq_bad_format.set_bg_color('#fff2cc') - - worksheet.write(0, 0, "First", head_format) - worksheet.write(0, 1, "Second", head_format) - worksheet.write(0, 2, "Ratio", head_format) - worksheet.write(0, 3, "Query", head_format) - worksheet.write(0, 4, "Query Hash", head_format) - - row = 1 - # Iterate over the data and write it out row by row. - for tag, queries in self.queries.items(): - for query in queries: - first_query: Query = query[0] - second_query: Query = query[1] - - ratio = second_query.execution_time_ms / ( - first_query.execution_time_ms) if first_query.execution_time_ms != 0 else 99999999 - ratio_color = eq_bad_format if ratio > 1.0 else eq_format - - worksheet.write(row, 0, '{:.2f}'.format(first_query.execution_time_ms)) - worksheet.write(row, 1, - f"{'{:.2f}'.format(second_query.execution_time_ms)}") - worksheet.write(row, 2, f'{ratio}', ratio_color) - worksheet.write(row, 3, f'{format_sql(first_query.query)}') - worksheet.write(row, 4, f'{first_query.query_hash}') - row += 1 - - workbook.close() diff --git a/src/reports/xls/score.py b/src/reports/xls/score.py deleted file mode 100644 index cb53da74..00000000 --- a/src/reports/xls/score.py +++ /dev/null @@ -1,183 +0,0 @@ -from typing import Type - -from matplotlib import pyplot as plt -from sql_formatter.core import format_sql - -from objects import CollectResult, Query -from db.postgres import PostgresQuery -from reports.abstract import Report - - -class ScoreXlsReport(Report): - def __init__(self): - super().__init__() - - self.logger.info(f"Created report folder for this run at 'report/{self.start_date}'") - - self.queries = {} - - @classmethod - def generate_report(cls, loq: CollectResult, pg_loq: CollectResult = None): - report = ScoreXlsReport() - - for qid, query in enumerate(loq.queries): - report.add_query(query, pg_loq.queries[qid] if pg_loq else None) - - report.build_report() - - def get_report_name(self): - return "score" - - def define_version(self, version): - self.report += f"[VERSION]\n====\n{version}\n====\n\n" - - def calculate_score(self, query): - if query.execution_time_ms == 0: - return -1 - else: - return "{:.2f}".format( - query.get_best_optimization( - self.config).execution_time_ms / query.execution_time_ms) - - def create_plot(self, best_optimization, optimizations, query): - plt.xlabel('Execution time') - plt.ylabel('Optimizer cost') - - plt.plot([q.execution_time_ms for q in optimizations if q.execution_time_ms != 0], - [q.execution_plan.get_estimated_cost() for q in optimizations if q.execution_time_ms != 0], 'k.', - [query.execution_time_ms], - [query.execution_plan.get_estimated_cost()], 'r^', - [best_optimization.execution_time_ms], - [best_optimization.execution_plan.get_estimated_cost()], 'go') - - file_name = f'imgs/query_{self.reported_queries_counter}.png' - plt.savefig(f"report/{self.start_date}/{file_name}") - plt.close() - - return file_name - - def add_query(self, query: Type[Query], pg: Query | None): - if query.tag not in self.queries: - self.queries[query.tag] = [[query, pg], ] - else: - self.queries[query.tag].append([query, pg]) - - def build_report(self): - import xlsxwriter - - workbook = xlsxwriter.Workbook(f'report/{self.start_date}/report_score.xls') - worksheet = workbook.add_worksheet() - - head_format = workbook.add_format() - head_format.set_bold() - head_format.set_bg_color('#999999') - - eq_format = workbook.add_format() - eq_format.set_bold() - eq_format.set_bg_color('#d9ead3') - - eq_bad_format = workbook.add_format() - eq_bad_format.set_bold() - eq_bad_format.set_bg_color('#fff2cc') - - eq_good_format = workbook.add_format() - eq_good_format.set_bold() - eq_good_format.set_bg_color('#d9ead3') - - bm_format = workbook.add_format() - bm_format.set_bold() - bm_format.set_bg_color('#cfe2f3') - - pg_comparison_format = workbook.add_format() - pg_comparison_format.set_bold() - pg_comparison_format.set_bg_color('#fce5cd') - - # Start from the first cell. Rows and columns are zero indexed. - yb_bests = 0 - pg_bests = 0 - total = 0 - for queries in self.queries.values(): - for query in queries: - yb_query = query[0] - pg_query = query[1] - - yb_best = yb_query.get_best_optimization(self.config, ) - pg_best = pg_query.get_best_optimization(self.config, ) - - yb_bests += 1 if yb_query.compare_plans(yb_best.execution_plan) else 0 - pg_bests += 1 if pg_query.compare_plans(pg_best.execution_plan) else 0 - - total += 1 - - worksheet.write(0, 0, "YB", head_format) - worksheet.write(0, 1, "YB Best", head_format) - worksheet.write(0, 2, "PG", head_format) - worksheet.write(0, 3, "PG Best", head_format) - worksheet.write(0, 4, "Ratio YB vs PG", head_format) - worksheet.write(0, 5, "Best YB vs PG", head_format) - worksheet.write(0, 6, "Query", head_format) - worksheet.write(0, 7, "Query Hash", head_format) - - row = 1 - # Iterate over the data and write it out row by row. - for tag, queries in self.queries.items(): - for query in queries: - yb_query: PostgresQuery = query[0] - pg_query: PostgresQuery = query[1] - - yb_best = yb_query.get_best_optimization(self.config, ) - pg_best = pg_query.get_best_optimization(self.config, ) - - default_yb_equality = yb_query.compare_plans(yb_best.execution_plan) - default_pg_equality = pg_query.compare_plans(pg_best.execution_plan) - - default_yb_pg_equality = yb_query.compare_plans(pg_query.execution_plan) - best_yb_pg_equality = yb_best.compare_plans(pg_best.execution_plan) - - ratio_x3 = yb_query.execution_time_ms / ( - 3 * pg_query.execution_time_ms) if pg_query.execution_time_ms != 0 else 99999999 - ratio_x3_str = "{:.2f}".format( - yb_query.execution_time_ms / pg_query.execution_time_ms if pg_query.execution_time_ms != 0 else 99999999) - ratio_color = ratio_x3 > 1.0 - - ratio_best = yb_best.execution_time_ms / ( - 3 * pg_best.execution_time_ms) if yb_best.execution_time_ms != 0 else 99999999 - ratio_best_x3_str = "{:.2f}".format( - yb_best.execution_time_ms / pg_best.execution_time_ms if yb_best.execution_time_ms != 0 else 99999999) - ratio_best_color = ratio_best > 1.0 - - bitmap_flag = "bitmap" in pg_query.execution_plan.full_str.lower() - - best_pg_format = None - if ratio_best_color and best_yb_pg_equality: - best_pg_format = eq_bad_format - elif best_yb_pg_equality: - best_pg_format = eq_good_format - elif ratio_best_color: - best_pg_format = pg_comparison_format - - df_pf_format = None - if ratio_color and default_yb_pg_equality: - df_pf_format = eq_bad_format - elif default_yb_pg_equality: - df_pf_format = eq_good_format - elif ratio_color: - df_pf_format = pg_comparison_format - - worksheet.write(row, 0, '{:.2f}'.format(yb_query.execution_time_ms)) - worksheet.write(row, 1, - f"{'{:.2f}'.format(yb_best.execution_time_ms)}", - eq_format if default_yb_equality else None) - worksheet.write(row, 2, - f"{'{:.2f}'.format(pg_query.execution_time_ms)}", - bm_format if bitmap_flag else None) - worksheet.write(row, 3, - f"{'{:.2f}'.format(pg_best.execution_time_ms)}", - eq_format if default_pg_equality else None) - worksheet.write(row, 4, f"{ratio_x3_str}", df_pf_format) - worksheet.write(row, 5, f"{ratio_best_x3_str}", best_pg_format) - worksheet.write(row, 6, f'{format_sql(pg_query.query)}') - worksheet.write(row, 7, f'{pg_query.query_hash}') - row += 1 - - workbook.close() diff --git a/src/runner.py b/src/runner.py index 0ee91d4c..109aae5f 100644 --- a/src/runner.py +++ b/src/runner.py @@ -5,11 +5,8 @@ from config import Config, init_logger, ConnectionConfig, DDLStep from db.factory import create_database from db.postgres import DEFAULT_USERNAME, DEFAULT_PASSWORD, PostgresResultsLoader -from reports.adoc.comparison import ComparisonReport from reports.adoc.regression import RegressionReport from reports.adoc.score import ScoreReport -from reports.xls.score import ScoreXlsReport -from reports.xls.regression import RegressionXlsReport from reports.adoc.selectivity import SelectivityReport from reports.adoc.taqo import TaqoReport @@ -236,7 +233,8 @@ def parse_ddls(ddl_ops): enable_statistics=args.enable_statistics or get_bool_from_str( configuration.get("enable-statistics", False)), explain_clause=args.explain_clause or configuration.get("explain-clause", "EXPLAIN"), - session_props=configuration.get("session-props") + (args.session_props.split(",") if args.session_props else []), + session_props=configuration.get("session-props") + + (args.session_props.split(",") if args.session_props else []), basic_multiplier=int(args.basic_multiplier), skip_percentage_delta=configuration.get("skip-percentage-delta", 0.05), @@ -301,37 +299,12 @@ def parse_ddls(ddl_ops): args.pg_results) if args.pg_results else None ScoreReport.generate_report(yb_queries, pg_queries) - elif args.type == "score_xls": - yb_queries = loader.get_queries_from_previous_result(args.results) - pg_queries = loader.get_queries_from_previous_result( - args.pg_results) if args.pg_results else None - - ScoreXlsReport.generate_report(yb_queries, pg_queries) elif args.type == "regression": - report = RegressionReport() - v1_queries = loader.get_queries_from_previous_result(args.v1_results) v2_queries = loader.get_queries_from_previous_result(args.v2_results) - report.generate_report(args.v1_name, args.v2_name, v1_queries, v2_queries) - elif args.type == "regression_xls": - report = RegressionXlsReport() - - v1_queries = loader.get_queries_from_previous_result(args.v1_results) - v2_queries = loader.get_queries_from_previous_result(args.v2_results) - - report.generate_report(v1_queries, v2_queries) - elif args.type == "comparison": - report = ComparisonReport() - - yb_queries = loader.get_queries_from_previous_result(args.results) - pg_queries = loader.get_queries_from_previous_result( - args.pg_results) if args.pg_results else None - - report.generate_report(yb_queries, pg_queries) + RegressionReport.generate_report(args.v1_name, args.v2_name, v1_queries, v2_queries) elif args.type == "selectivity": - report = SelectivityReport() - default_queries = loader.get_queries_from_previous_result(args.default_results) default_analyze_queries = loader.get_queries_from_previous_result( args.default_analyze_results) @@ -341,7 +314,7 @@ def parse_ddls(ddl_ops): stats_analyze_queries = loader.get_queries_from_previous_result( args.stats_analyze_results) - report.generate_report(default_queries, default_analyze_queries, ta_queries, - ta_analyze_queries, stats_queries, stats_analyze_queries) + SelectivityReport.generate_report(default_queries, default_analyze_queries, ta_queries, + ta_analyze_queries, stats_queries, stats_analyze_queries) else: raise AttributeError(f"Unknown test type defined {config.test}")