Exclude dfsas from used tables (#2841)
## Changes
Our current implementation treats direct filesystem access (DFSA) patterns as
tables, which is incorrect.
This PR excludes such patterns from the collected used tables.
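
For context, the exclusion boils down to a name filter against the DFSA patterns already defined in `directfs.py`. A minimal sketch, assuming only the `matches()` call used in `from_table.py` below (`is_direct_fs_access` is a hypothetical helper, not part of this commit):

```python
# Sketch only: mirrors the filter added in this commit. A collected "table"
# whose name is really a filesystem path (e.g. dbfs://mnt/foo2/bar2) should
# be skipped rather than reported as a UsedTable.
from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_ACCESS_PATTERNS


def is_direct_fs_access(table_name: str) -> bool:
    """Return True when the 'table name' matches a direct filesystem access pattern."""
    return any(pattern.matches(table_name) for pattern in DIRECT_FS_ACCESS_PATTERNS)
```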

### Linked issues
None

### Functionality
None

### Tests
- [x] added unit tests

---------

Co-authored-by: Eric Vergnaud <[email protected]>
ericvergnaud authored Oct 8, 2024
1 parent c76d04e commit 2531122
Showing 4 changed files with 28 additions and 5 deletions.
8 changes: 6 additions & 2 deletions src/databricks/labs/ucx/source_code/linters/from_table.py
@@ -12,6 +12,7 @@
     UsedTable,
     TableSqlCollector,
 )
+from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_ACCESS_PATTERNS
 from databricks.labs.ucx.source_code.sql.sql_parser import SqlExpression, SqlParser

 logger = logging.getLogger(__name__)
@@ -68,9 +69,12 @@ def lint_expression(self, expression: Expression):

     def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
         try:
-            yield from SqlParser.walk_expressions(
+            for info in SqlParser.walk_expressions(
                 source_code, lambda e: e.collect_table_infos("hive_metastore", self._session_state)
-            )
+            ):
+                if any(pattern.matches(info.table_name) for pattern in DIRECT_FS_ACCESS_PATTERNS):
+                    continue
+                yield info
         except SqlParseError as _:
             pass # TODO establish a strategy
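
For illustration, a minimal usage sketch of the new behaviour, modelled on the unit tests further down (the import path for `CurrentSessionState` and the `migration_index` fixture are assumptions taken from those tests):

```python
# Sketch only: exercises FromTableSqlLinter.collect_tables after this change.
from databricks.labs.ucx.source_code.base import CurrentSessionState  # import path assumed
from databricks.labs.ucx.source_code.linters.from_table import FromTableSqlLinter


def collect_names(linter: FromTableSqlLinter, query: str) -> list[str]:
    return [info.table_name for info in linter.collect_tables(query)]


def demo(migration_index):  # migration_index: the existing test fixture
    linter = FromTableSqlLinter(migration_index, session_state=CurrentSessionState(schema="old"))
    assert collect_names(linter, "SELECT * FROM old.things") == ["things"]
    # Direct filesystem reads are no longer reported as used tables.
    assert collect_names(linter, "SELECT * FROM parquet.`dbfs://mnt/foo2/bar2`") == []
```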

3 changes: 2 additions & 1 deletion src/databricks/labs/ucx/source_code/linters/pyspark.py
@@ -394,7 +394,8 @@ def collect_tables_from_tree(self, tree: Tree) -> Iterable[TableInfoNode]:
             if matcher is None:
                 continue
             assert isinstance(node, Call)
-            yield from matcher.collect_tables(self._from_table, self._index, self._session_state, node)
+            for used_table in matcher.collect_tables(self._from_table, self._index, self._session_state, node):
+                yield TableInfoNode(used_table, node)


 class _SparkSqlAnalyzer:
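
The change above now pairs every collected `UsedTable` with the AST call node it was found in. Purely for illustration, a hypothetical sketch of what that wrapper presumably looks like, inferred only from the `TableInfoNode(used_table, node)` call (field names and import paths are assumptions, not the repository's actual definition):

```python
# Hypothetical illustration only -- the real TableInfoNode is defined in the
# ucx code base; field names and import paths here are assumed.
from dataclasses import dataclass

from astroid import NodeNG  # astroid is the AST library behind Tree/Call/Const above

from databricks.labs.ucx.source_code.base import UsedTable  # import path assumed


@dataclass
class TableInfoNode:
    table: UsedTable  # the table reference collected by a matcher
    node: NodeNG      # the spark-call node the reference was collected from
```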
5 changes: 3 additions & 2 deletions tests/unit/source_code/linters/test_from_table.py
@@ -111,11 +111,12 @@ def test_raises_advice_when_parsing_unsupported_sql(migration_index):
     [
         ("SELECT * FROM hive_metastore.old.things", [("hive_metastore", "old", "things")]),
         ("SELECT * FROM old.things", [("hive_metastore", "old", "things")]),
-        ("SELECT * FROM brand.new.things", []),
         ("SELECT * FROM new.things", [("hive_metastore", "new", "things")]),
+        ("SELECT * FROM brand.new.things", []),
+        ("SELECT * FROM parquet.`dbfs://mnt/foo2/bar2`", []),
     ],
 )
-def test_collects_tables(query, expected, migration_index):
+def test_linter_collects_tables(query, expected, migration_index):
     session_state = CurrentSessionState(schema="old")
     ftf = FromTableSqlLinter(migration_index, session_state=session_state)
     tuples = list((info.catalog_name, info.schema_name, info.table_name) for info in ftf.collect_tables(query))
17 changes: 17 additions & 0 deletions tests/unit/source_code/linters/test_pyspark.py
@@ -638,3 +638,20 @@ def test_apply_table_name_matcher_with_existing_constant(migration_index):
     table_constant = node.value.args[0]
     assert isinstance(table_constant, Const)
     assert table_constant.value == 'brand.new.stuff'
+
+
+@pytest.mark.parametrize(
+    "source_code, expected",
+    [
+        ("spark.table('my_schema.my_table')", [('hive_metastore', 'my_schema', 'my_table')]),
+        ("spark.read.parquet('dbfs://mnt/foo2/bar2')", []),
+    ],
+)
+def test_spark_collect_tables_ignores_dfsas(source_code, expected, migration_index):
+    session_state = CurrentSessionState('old')
+    from_table = FromTableSqlLinter(migration_index, session_state)
+    linter = SparkTableNamePyLinter(from_table, migration_index, session_state)
+    used_tables = list(linter.collect_tables(source_code))
+    for used_table in used_tables:
+        actual = (used_table.catalog_name, used_table.schema_name, used_table.table_name)
+        assert actual in expected
