Exclude dfsas from used tables #2841

Merged
merged 4 commits on Oct 8, 2024
8 changes: 6 additions & 2 deletions src/databricks/labs/ucx/source_code/linters/from_table.py
@@ -12,6 +12,7 @@
     UsedTable,
     TableSqlCollector,
 )
+from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_ACCESS_PATTERNS
 from databricks.labs.ucx.source_code.sql.sql_parser import SqlExpression, SqlParser
 
 logger = logging.getLogger(__name__)
@@ -68,9 +69,12 @@ def lint_expression(self, expression: Expression):
 
     def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
         try:
-            yield from SqlParser.walk_expressions(
+            for info in SqlParser.walk_expressions(
                 source_code, lambda e: e.collect_table_infos("hive_metastore", self._session_state)
-            )
+            ):
+                if any(pattern.matches(info.table_name) for pattern in DIRECT_FS_ACCESS_PATTERNS):
+                    continue
+                yield info
         except SqlParseError as _:
             pass  # TODO establish a strategy
 
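For context, a minimal sketch of the filtering idea this hunk relies on, assuming DIRECT_FS_ACCESS_PATTERNS is roughly a list of objects exposing a matches() check against filesystem-style prefixes. The class, regexes, and names below are hypothetical stand-ins, not the project's actual patterns from linters/directfs:

import re

class FsPattern:
    """Hypothetical stand-in for one direct-filesystem-access pattern."""

    def __init__(self, prefix_regex: str):
        self._regex = re.compile(prefix_regex)

    def matches(self, table_name: str) -> bool:
        # A "table name" such as dbfs://mnt/foo2/bar2 is really a filesystem path.
        return self._regex.match(table_name) is not None

PATTERNS = [FsPattern(p) for p in (r"dbfs:/", r"s3a?://", r"abfss://", r"file:/")]

def drop_direct_fs_access(table_names):
    """Yield only names that look like catalog tables, skipping path-like names."""
    for name in table_names:
        if any(pattern.matches(name) for pattern in PATTERNS):
            continue  # same skip as in the diff above
        yield name

assert list(drop_direct_fs_access(["old.things", "dbfs://mnt/foo2/bar2"])) == ["old.things"]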

3 changes: 2 additions & 1 deletion src/databricks/labs/ucx/source_code/linters/pyspark.py
@@ -394,7 +394,8 @@ def collect_tables_from_tree(self, tree: Tree) -> Iterable[TableInfoNode]:
             if matcher is None:
                 continue
             assert isinstance(node, Call)
-            yield from matcher.collect_tables(self._from_table, self._index, self._session_state, node)
+            for used_table in matcher.collect_tables(self._from_table, self._index, self._session_state, node):
+                yield TableInfoNode(used_table, node)
 
 
 class _SparkSqlAnalyzer:

Contributor Author commented: fix hidden bug
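On that "hidden bug" note: collect_tables_from_tree is annotated to yield TableInfoNode, but the old yield from forwarded whatever matcher.collect_tables(...) produced, so consumers expecting the AST node alongside each table would have received bare records. A minimal sketch of that failure mode, using hypothetical simplified types rather than the project's real classes:

from dataclasses import dataclass
from typing import Iterable

@dataclass
class UsedTable:  # simplified stand-in
    name: str

@dataclass
class TableInfoNode:  # pairs a used table with the AST node it came from
    table: UsedTable
    node: object

def collect_buggy(tables: Iterable[UsedTable], node: object) -> Iterable[TableInfoNode]:
    yield from tables  # bug: yields bare UsedTable despite the annotation

def collect_fixed(tables: Iterable[UsedTable], node: object) -> Iterable[TableInfoNode]:
    for used_table in tables:
        yield TableInfoNode(used_table, node)  # wrap, as in the diff above

node = object()
first = next(iter(collect_fixed([UsedTable("old.things")], node)))
assert first.node is node  # the AST node is reachable only after the fix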
5 changes: 3 additions & 2 deletions tests/unit/source_code/linters/test_from_table.py
@@ -111,11 +111,12 @@ def test_raises_advice_when_parsing_unsupported_sql(migration_index):
     [
         ("SELECT * FROM hive_metastore.old.things", [("hive_metastore", "old", "things")]),
         ("SELECT * FROM old.things", [("hive_metastore", "old", "things")]),
-        ("SELECT * FROM brand.new.things", []),
         ("SELECT * FROM new.things", [("hive_metastore", "new", "things")]),
+        ("SELECT * FROM brand.new.things", []),
+        ("SELECT * FROM parquet.`dbfs://mnt/foo2/bar2`", []),
     ],
 )
-def test_collects_tables(query, expected, migration_index):
+def test_linter_collects_tables(query, expected, migration_index):
     session_state = CurrentSessionState(schema="old")
     ftf = FromTableSqlLinter(migration_index, session_state=session_state)
     tuples = list((info.catalog_name, info.schema_name, info.table_name) for info in ftf.collect_tables(query))
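The new parquet case matters because the SQL parser treats Spark's direct-read syntax as an ordinary table reference whose name is the filesystem path, so without the DFSA filter it would surface as a used table. A rough illustration with sqlglot, which ucx parses through; the dialect argument and the printed result here are assumptions, not verified project behavior:

import sqlglot
from sqlglot import expressions

# Spark's direct-read syntax parses as a table whose db is the file format
# and whose name is the filesystem path.
query = "SELECT * FROM parquet.`dbfs://mnt/foo2/bar2`"
table = sqlglot.parse_one(query, read="databricks").find(expressions.Table)
print(table.db, table.name)  # expected: parquet dbfs://mnt/foo2/bar2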
17 changes: 17 additions & 0 deletions tests/unit/source_code/linters/test_pyspark.py
@@ -638,3 +638,20 @@ def test_apply_table_name_matcher_with_existing_constant(migration_index):
     table_constant = node.value.args[0]
     assert isinstance(table_constant, Const)
     assert table_constant.value == 'brand.new.stuff'
+
+
+@pytest.mark.parametrize(
+    "source_code, expected",
+    [
+        ("spark.table('my_schema.my_table')", [('hive_metastore', 'my_schema', 'my_table')]),
+        ("spark.read.parquet('dbfs://mnt/foo2/bar2')", []),
+    ],
+)
+def test_spark_collect_tables_ignores_dfsas(source_code, expected, migration_index):
+    session_state = CurrentSessionState('old')
+    from_table = FromTableSqlLinter(migration_index, session_state)
+    linter = SparkTableNamePyLinter(from_table, migration_index, session_state)
+    used_tables = list(linter.collect_tables(source_code))
+    for used_table in used_tables:
+        actual = (used_table.catalog_name, used_table.schema_name, used_table.table_name)
+        assert actual in expected
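For intuition, the two parametrized cases differ in which Spark API is called: spark.table(...) names a catalog table, while spark.read.parquet(...) takes a filesystem path and should yield no UsedTable. A tiny stdlib-ast sketch of that structural distinction; the real linter walks an astroid tree, so this is only illustrative:

import ast

def called_method(source: str) -> str:
    """Return the attribute name of the outermost call, e.g. 'table' or 'parquet'."""
    call = ast.parse(source).body[0].value
    assert isinstance(call, ast.Call) and isinstance(call.func, ast.Attribute)
    return call.func.attr

assert called_method("spark.table('my_schema.my_table')") == "table"
assert called_method("spark.read.parquet('dbfs://mnt/foo2/bar2')") == "parquet"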