From dd7af477acf112308f4b4bc5632275d05687c169 Mon Sep 17 00:00:00 2001 From: leahwicz <60146280+leahwicz@users.noreply.github.com> Date: Fri, 29 Oct 2021 17:06:09 -0400 Subject: [PATCH] Perf improvement to subgraph selection (#4155) Perf improvement to get_subset_graph Co-authored-by: Ian Knox --- CHANGELOG.md | 4 ++-- core/dbt/graph/graph.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff1fd2194a7..d75733dce0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,13 @@ ### Features - Allow nullable `error_after` in source freshness ([#3874](https://github.com/dbt-labs/dbt-core/issues/3874), [#3955](https://github.com/dbt-labs/dbt-core/pull/3955)) - +- Increase performance of graph subset selection ([#4135](https://github.com/dbt-labs/dbt-core/issues/4135),[#4155](https://github.com/dbt-labs/dbt-core/pull/4155)) ### Fixes - Changes unit tests using `assertRaisesRegexp` to `assertRaisesRegex` Contributors: - [@kadero](https://github.com/kadero) ([3955](https://github.com/dbt-labs/dbt-core/pull/3955)) -- [@frankcash](https://github.com/frankcash) ([4136](https://github.com/dbt-labs/dbt-core/pull/4136) +- [@frankcash](https://github.com/frankcash) ([4136](https://github.com/dbt-labs/dbt-core/pull/4136)) ## dbt-core 1.0.0b2 (October 25, 2021) diff --git a/core/dbt/graph/graph.py b/core/dbt/graph/graph.py index a2feba24f3c..ba752d6c58a 100644 --- a/core/dbt/graph/graph.py +++ b/core/dbt/graph/graph.py @@ -1,6 +1,7 @@ from typing import ( Set, Iterable, Iterator, Optional, NewType ) +from itertools import product import networkx as nx # type: ignore from dbt.exceptions import InternalException @@ -77,17 +78,26 @@ def select_successors(self, selected: Set[UniqueId]) -> Set[UniqueId]: successors.update(self.graph.successors(node)) return successors - def get_subset_graph(self, selected: Iterable[UniqueId]) -> 'Graph': + def get_subset_graph(self, selected: Iterable[UniqueId]) -> "Graph": """Create and return a new graph that is a shallow copy of the graph, but with only the nodes in include_nodes. Transitive edges across removed nodes are preserved as explicit new edges. """ - new_graph = nx.algorithms.transitive_closure(self.graph) + new_graph = self.graph.copy() include_nodes = set(selected) for node in self: if node not in include_nodes: + source_nodes = [x for x, _ in new_graph.in_edges(node)] + target_nodes = [x for _, x in new_graph.out_edges(node)] + + new_edges = product(source_nodes, target_nodes) + non_cyclic_new_edges = [ + (source, target) for source, target in new_edges if source != target + ] # removes cyclic refs + + new_graph.add_edges_from(non_cyclic_new_edges) new_graph.remove_node(node) for node in include_nodes: @@ -96,6 +106,7 @@ def get_subset_graph(self, selected: Iterable[UniqueId]) -> 'Graph': "Couldn't find model '{}' -- does it exist or is " "it disabled?".format(node) ) + return Graph(new_graph) def subgraph(self, nodes: Iterable[UniqueId]) -> 'Graph':