Exclude dfsas from used tables #2841

Merged
merged 4 commits on Oct 8, 2024
8 changes: 6 additions & 2 deletions src/databricks/labs/ucx/source_code/linters/from_table.py
@@ -12,6 +12,7 @@
     UsedTable,
     TableSqlCollector,
 )
+from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_ACCESS_PATTERNS
 from databricks.labs.ucx.source_code.sql.sql_parser import SqlExpression, SqlParser
 
 logger = logging.getLogger(__name__)
@@ -68,9 +69,12 @@ def lint_expression(self, expression: Expression):
 
     def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
         try:
-            yield from SqlParser.walk_expressions(
+            for info in SqlParser.walk_expressions(
                 source_code, lambda e: e.collect_table_infos("hive_metastore", self._session_state)
-            )
+            ):
+                if any(pattern.matches(info.table_name) for pattern in DIRECT_FS_ACCESS_PATTERNS):
+                    continue
+                yield info
         except SqlParseError as _:
             pass  # TODO establish a strategy
 
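For context, a minimal sketch of the filtering idea this hunk relies on, assuming DIRECT_FS_ACCESS_PATTERNS is roughly a list of objects exposing a matches() check against filesystem-style prefixes. The class, regexes, and names below are hypothetical stand-ins, not the project's actual patterns from linters/directfs:

import re

class FsPattern:
    """Hypothetical stand-in for one direct-filesystem-access pattern."""

    def __init__(self, prefix_regex: str):
        self._regex = re.compile(prefix_regex)

    def matches(self, table_name: str) -> bool:
        # A "table name" such as dbfs://mnt/foo2/bar2 is really a filesystem path.
        return self._regex.match(table_name) is not None

PATTERNS = [FsPattern(p) for p in (r"dbfs:/", r"s3a?://", r"abfss://", r"file:/")]

def drop_direct_fs_access(table_names):
    """Yield only names that look like catalog tables, skipping path-like names."""
    for name in table_names:
        if any(pattern.matches(name) for pattern in PATTERNS):
            continue  # same skip as in the diff above
        yield name

assert list(drop_direct_fs_access(["old.things", "dbfs://mnt/foo2/bar2"])) == ["old.things"]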

3 changes: 2 additions & 1 deletion src/databricks/labs/ucx/source_code/linters/pyspark.py
@@ -394,7 +394,8 @@ def collect_tables_from_tree(self, tree: Tree) -> Iterable[TableInfoNode]:
             if matcher is None:
                 continue
             assert isinstance(node, Call)
-            yield from matcher.collect_tables(self._from_table, self._index, self._session_state, node)
+            for used_table in matcher.collect_tables(self._from_table, self._index, self._session_state, node):
+                yield TableInfoNode(used_table, node)
 
 
 class _SparkSqlAnalyzer:

Contributor Author commented: fix hidden bug
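On that "hidden bug" note: collect_tables_from_tree is annotated to yield TableInfoNode, but the old yield from forwarded whatever matcher.collect_tables(...) produced, so consumers expecting the AST node alongside each table would have received bare records. A minimal sketch of that failure mode, using hypothetical simplified types rather than the project's real classes:

from dataclasses import dataclass
from typing import Iterable

@dataclass
class UsedTable:  # simplified stand-in
    name: str

@dataclass
class TableInfoNode:  # pairs a used table with the AST node it came from
    table: UsedTable
    node: object

def collect_buggy(tables: Iterable[UsedTable], node: object) -> Iterable[TableInfoNode]:
    yield from tables  # bug: yields bare UsedTable despite the annotation

def collect_fixed(tables: Iterable[UsedTable], node: object) -> Iterable[TableInfoNode]:
    for used_table in tables:
        yield TableInfoNode(used_table, node)  # wrap, as in the diff above

node = object()
first = next(iter(collect_fixed([UsedTable("old.things")], node)))
assert first.node is node  # the AST node is reachable only after the fix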
5 changes: 3 additions & 2 deletions tests/unit/source_code/linters/test_from_table.py
@@ -111,11 +111,12 @@ def test_raises_advice_when_parsing_unsupported_sql(migration_index):
     [
         ("SELECT * FROM hive_metastore.old.things", [("hive_metastore", "old", "things")]),
         ("SELECT * FROM old.things", [("hive_metastore", "old", "things")]),
-        ("SELECT * FROM brand.new.things", []),
         ("SELECT * FROM new.things", [("hive_metastore", "new", "things")]),
+        ("SELECT * FROM brand.new.things", []),
+        ("SELECT * FROM parquet.`dbfs://mnt/foo2/bar2`", []),
     ],
 )
-def test_collects_tables(query, expected, migration_index):
+def test_linter_collects_tables(query, expected, migration_index):
     session_state = CurrentSessionState(schema="old")
     ftf = FromTableSqlLinter(migration_index, session_state=session_state)
     tuples = list((info.catalog_name, info.schema_name, info.table_name) for info in ftf.collect_tables(query))
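The new parquet case matters because the SQL parser treats Spark's direct-read syntax as an ordinary table reference whose name is the filesystem path, so without the DFSA filter it would surface as a used table. A rough illustration with sqlglot, which ucx parses through; the dialect argument and the printed result here are assumptions, not verified project behavior:

import sqlglot
from sqlglot import expressions

# Spark's direct-read syntax parses as a table whose db is the file format
# and whose name is the filesystem path.
query = "SELECT * FROM parquet.`dbfs://mnt/foo2/bar2`"
table = sqlglot.parse_one(query, read="databricks").find(expressions.Table)
print(table.db, table.name)  # expected: parquet dbfs://mnt/foo2/bar2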
17 changes: 17 additions & 0 deletions tests/unit/source_code/linters/test_pyspark.py
@@ -638,3 +638,20 @@ def test_apply_table_name_matcher_with_existing_constant(migration_index):
     table_constant = node.value.args[0]
     assert isinstance(table_constant, Const)
     assert table_constant.value == 'brand.new.stuff'
+
+
+@pytest.mark.parametrize(
+    "source_code, expected",
+    [
+        ("spark.table('my_schema.my_table')", [('hive_metastore', 'my_schema', 'my_table')]),
+        ("spark.read.parquet('dbfs://mnt/foo2/bar2')", []),
+    ],
+)
+def test_spark_collect_tables_ignores_dfsas(source_code, expected, migration_index):
+    session_state = CurrentSessionState('old')
+    from_table = FromTableSqlLinter(migration_index, session_state)
+    linter = SparkTableNamePyLinter(from_table, migration_index, session_state)
+    used_tables = list(linter.collect_tables(source_code))
+    for used_table in used_tables:
+        actual = (used_table.catalog_name, used_table.schema_name, used_table.table_name)
+        assert actual in expected
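For intuition, the two parametrized cases differ in which Spark API is called: spark.table(...) names a catalog table, while spark.read.parquet(...) takes a filesystem path and should yield no UsedTable. A tiny stdlib-ast sketch of that structural distinction; the real linter walks an astroid tree, so this is only illustrative:

import ast

def called_method(source: str) -> str:
    """Return the attribute name of the outermost call, e.g. 'table' or 'parquet'."""
    call = ast.parse(source).body[0].value
    assert isinstance(call, ast.Call) and isinstance(call.func, ast.Attribute)
    return call.func.attr

assert called_method("spark.table('my_schema.my_table')") == "table"
assert called_method("spark.read.parquet('dbfs://mnt/foo2/bar2')") == "parquet"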