Merge pull request #1407 from fishtown-analytics/feature/wrap-models-…

…in-jinja-tags add a flag to wrap models/tests in jinja blocks
dbt-labs · Apr 18, 2019 · dd469ad · dd469ad
2 parents 4dd8056 + 4ffc5cb
commit dd469ad
Show file tree

Hide file tree

Showing 8 changed files with 330 additions and 32 deletions.
diff --git a/core/dbt/clients/_jinja_blocks.py b/core/dbt/clients/_jinja_blocks.py
@@ -16,11 +16,12 @@ def __init__(self, contents):
 
 
 class BlockTag(object):
-    def __init__(self, block_type_name, block_name, contents=None, **kw):
+    def __init__(self, block_type_name, block_name, contents=None,
+                 full_block=None, **kw):
         self.block_type_name = block_type_name
         self.block_name = block_name
         self.contents = contents
-        self.full_block = None
+        self.full_block = full_block
 
     def __str__(self):
         return 'BlockTag({!r}, {!r})'.format(self.block_type_name,
@@ -59,7 +60,7 @@ def end_pat(self):
     r'(?:\s+(?P<block_name>({})))?'.format(_NAME_PATTERN),
 )))
 
-TAG_CLOSE_PATTERN = regex(r'(?:\-\%\}\s*|\%\})')
+TAG_CLOSE_PATTERN = regex(r'(?:(?P<tag_close>(\-\%\}\s*|\%\})))')
 # if you do {% materialization foo, adapter="myadapter' %} and end up with
 # mismatched quotes this will still match, but jinja will fail somewhere
 # since the adapter= argument has to be an adapter name, and none have quotes
@@ -103,6 +104,13 @@ def end_pat(self):
 )
 
 
+NON_STRING_DO_BLOCK_MEMBER_PATTERN = regex(
+    # anything, followed by a quote, paren, or a tag end
+    r'''(.*?)'''
+    r'''((?P<quote>(['"]))|(?P<open>(\())|(?P<close>(\))))'''
+)
+
+
 class BlockIterator(object):
     def __init__(self, data):
         self.data = data
@@ -134,9 +142,7 @@ def _match(self, pattern):
     def expect_comment_end(self):
         """Expect a comment end and return the match object.
         """
-        match = self._match(COMMENT_END_PATTERN)
-        if match is None:
-            dbt.exceptions.raise_compiler_error('unexpected EOF, expected #}')
+        match = self._expect_match('#}', COMMENT_END_PATTERN)
         self.advance(match.end())
 
     def expect_raw_end(self):
@@ -195,7 +201,8 @@ def handle_block(self, match, block_start=None):
         while True:
             match = self._expect_match(
                 '"{}"'.format(found.end_block_type_name),
-                found.end_pat(), COMMENT_START_PATTERN, RAW_START_PATTERN
+                found.end_pat(), COMMENT_START_PATTERN, RAW_START_PATTERN,
+                regex('''(?P<quote>(['"]))''')
             )
             groups = match.groupdict()
             if groups.get('endblock') is not None:
@@ -207,6 +214,10 @@ def handle_block(self, match, block_start=None):
                 self.expect_comment_end()
             elif groups.get('raw_start') is not None:
                 self.expect_raw_end()
+            elif groups.get('quote') is not None:
+                self.rewind()
+                match = self._expect_match('any string', STRING_PATTERN)
+                self.advance(match.end())
             else:
                 raise dbt.exceptions.InternalException(
                     'unhandled regex in handle_block, no match: {}'
@@ -226,10 +237,49 @@ def handle_block(self, match, block_start=None):
     def handle_materialization(self, match):
         self._expect_match('materialization args',
                            MATERIALIZATION_ARGS_PATTERN)
-        self._expect_match('%}', TAG_CLOSE_PATTERN)
+        endtag = self._expect_match('%}', TAG_CLOSE_PATTERN)
+        self.advance(endtag.end())
         # handle the block we started with!
         self.blocks.append(self.handle_block(match))
 
+    def handle_do(self, match, expect_block):
+        if expect_block:
+            # we might be wrong to expect a block ({% do (...) %}, for example)
+            # so see if there's more data before the tag closes. if there
+            # isn't, we expect a block.
+            close_match = self._expect_match('%}', TAG_CLOSE_PATTERN)
+            unprocessed = self.data[match.end():close_match.start()].strip()
+            expect_block = not unprocessed
+
+        if expect_block:
+            # if we're here, expect_block is True and we must have set
+            # close_match
+            self.advance(close_match.end())
+            block = self.handle_block(match)
+        else:
+            # we have a do-statement like {% do thing() %}, so no {% enddo %}
+            # also, we don't want to advance to the end of the match, as it
+            # might be inside a string or something! So go back and figure out
+            self._process_rval_components()
+            block = BlockTag('do', None,
+                             full_block=self.data[match.start():self.pos])
+        self.blocks.append(block)
+
+    def handle_set(self, match):
+        equal_or_close = self._expect_match('%} or =',
+                                            TAG_CLOSE_PATTERN, regex(r'='))
+        self.advance(equal_or_close.end())
+        if equal_or_close.groupdict().get('tag_close') is None:
+            # it's an equals sign, must be like {% set x = 1 %}
+            self._process_rval_components()
+            # watch out, order matters here on python 2
+            block = BlockTag(full_block=self.data[match.start():self.pos],
+                             **match.groupdict())
+        else:
+            # it's a tag close, must be like {% set x %}...{% endset %}
+            block = self.handle_block(match)
+        self.blocks.append(block)
+
     def find_block(self):
         open_block = (
             r'(?:\s*\{\%\-|\{\%)\s*'
@@ -251,21 +301,39 @@ def find_block(self):
         # comments are easy
         if matchgroups.get('comment_start') is not None:
             start = match.start()
+            self.advance(match.end())
             self.expect_comment_end()
             self.blocks.append(BlockData(self.data[start:self.pos]))
             return True
 
-        if matchgroups.get('block_type_name') == 'raw':
+        block_type_name = matchgroups.get('block_type_name')
+
+        if block_type_name == 'raw':
             start = match.start()
             self.expect_raw_end()
             self.blocks.append(BlockData(self.data[start:self.pos]))
             return True
 
-        if matchgroups.get('block_type_name') == 'materialization':
+        if block_type_name == 'materialization':
             self.advance(match.end())
             self.handle_materialization(match)
             return True
 
+        if block_type_name == 'do':
+            # if there is a "block_name" in the match groups, we don't expect a
+            # block as the "block name" is actually part of the do-statement.
+            # we need to do this to handle the (weird and probably wrong!) case
+            # of a do-statement that is only a single identifier - techincally
+            # allowed in jinja. (for example, {% do thing %})
+            expect_block = matchgroups.get('block_name') is None
+            self.handle_do(match, expect_block=expect_block)
+            return True
+
+        if block_type_name == 'set':
+            self.advance(match.end())
+            self.handle_set(match)
+            return True
+
         # we're somewhere like this {% block_type_name block_type
         # we've either got arguments, a close of tag (%}), or bad input.
         # we've handled materializations already (they're weird!)
@@ -284,6 +352,42 @@ def find_block(self):
         self.blocks.append(self.handle_block(match))
         return True
 
+    def _process_rval_components(self):
+        """This is suspiciously similar to _process_macro_default_arg, probably
+        want to figure out how to merge the two.
+
+        Process the rval of an assignment statement or a do-block
+        """
+        while True:
+            match = self._expect_match(
+                'do block component',
+                # you could have a string, though that would be weird
+                STRING_PATTERN,
+                # a quote or an open/close parenthesis
+                NON_STRING_DO_BLOCK_MEMBER_PATTERN,
+                # a tag close
+                TAG_CLOSE_PATTERN
+            )
+            matchgroups = match.groupdict()
+            self.advance(match.end())
+            if matchgroups.get('string') is not None:
+                continue
+            elif matchgroups.get('quote') is not None:
+                self.rewind()
+                # now look for a string
+                match = self._expect_match('any string', STRING_PATTERN)
+                self.advance(match.end())
+            elif matchgroups.get('open'):
+                self._parenthesis_stack.append(True)
+            elif matchgroups.get('close'):
+                self._parenthesis_stack.pop()
+            elif matchgroups.get('tag_close'):
+                if self._parenthesis_stack:
+                    msg = ('Found "%}", expected ")"')
+                    dbt.exceptions.raise_compiler_error(msg)
+                return
+            # else whitespace
+
     def _process_macro_default_arg(self):
         """Handle the bit after an '=' in a macro default argument. This is
         probably the trickiest thing. The goal here is to accept all strings
@@ -297,7 +401,7 @@ def _process_macro_default_arg(self):
                 'macro argument',
                 # you could have a string
                 STRING_PATTERN,
-                # a quote, a comma, or a close parenthesis
+                # a quote, a comma, or a open/close parenthesis
                 NON_STRING_MACRO_ARGS_PATTERN,
                 # we want to "match", not "search"
                 method='match'
@@ -311,20 +415,17 @@ def _process_macro_default_arg(self):
                 # we got a bunch of data and then a string opening value.
                 # put the quote back on the menu
                 self.rewind()
-                # if we only got a single quote mark and didn't hit a string
-                # at all, this file has an unclosed quote. Fail accordingly.
-                if match.end() - match.start() == 1:
-                    msg = (
-                        'Unclosed quotation mark at position {}. Context:\n{}'
-                        .format(self.pos, self.data[self.pos-20:self.pos+20])
-                    )
-                    dbt.exceptions.raise_compiler_error(msg)
+                # now look for a string
+                match = self._expect_match('any string', STRING_PATTERN)
+                self.advance(match.end())
             elif matchgroups.get('comma') is not None:
                 # small hack: if we hit a comma and there is one parenthesis
                 # left, return to look for a new name. otherwise we're still
                 # looking for the parameter close.
                 if len(self._parenthesis_stack) == 1:
                     return
+            elif matchgroups.get('open'):
+                self._parenthesis_stack.append(True)
             elif matchgroups.get('close'):
                 self._parenthesis_stack.pop()
             else:

diff --git a/core/dbt/flags.py b/core/dbt/flags.py
@@ -3,13 +3,16 @@
 FULL_REFRESH = False
 USE_CACHE = True
 WARN_ERROR = False
+TEST_NEW_PARSER = False
 
 
 def reset():
-    global STRICT_MODE, NON_DESTRUCTIVE, FULL_REFRESH, USE_CACHE, WARN_ERROR
+    global STRICT_MODE, NON_DESTRUCTIVE, FULL_REFRESH, USE_CACHE, WARN_ERROR, \
+        TEST_NEW_PARSER
 
     STRICT_MODE = False
     NON_DESTRUCTIVE = False
     FULL_REFRESH = False
     USE_CACHE = True
     WARN_ERROR = False
+    TEST_NEW_PARSER = False
diff --git a/core/dbt/main.py b/core/dbt/main.py
@@ -221,6 +221,8 @@ def update_flags(parsed):
     elif arg_full_refresh:
         flags.FULL_REFRESH = True
 
+    flags.TEST_NEW_PARSER = getattr(parsed, 'test_new_parser', False)
+
 
 def _build_base_subparser():
     base_subparser = argparse.ArgumentParser(add_help=False)
@@ -637,6 +639,15 @@ def parse_args(args):
         help=argparse.SUPPRESS,
     )
 
+    # if set, extract all models and blocks with the jinja block extractor, and
+    # verify that we don't fail anywhere the actual jinja parser passes. The
+    # reverse (passing files that ends up failing jinja) is fine.
+    p.add_argument(
+        '--test-new-parser',
+        action='store_true',
+        help=argparse.SUPPRESS
+    )
+
     subs = p.add_subparsers(title="Available sub-commands")
 
     base_subparser = _build_base_subparser()

diff --git a/core/dbt/parser/base.py b/core/dbt/parser/base.py
@@ -265,3 +265,16 @@ def parse_node(self, node, node_path, package_project_config, tags=None,
         parsed_node.validate()
 
         return parsed_node
+
+    def check_block_parsing(self, name, path, contents):
+        """Check if we were able to extract toplevel blocks from the given
+        contents. Return True if extraction was successful (no exceptions),
+        False if it fails.
+        """
+        if not dbt.flags.TEST_NEW_PARSER:
+            return True
+        try:
+            dbt.clients.jinja.extract_toplevel_blocks(contents)
+        except Exception:
+            return False
+        return True
diff --git a/core/dbt/parser/base_sql.py b/core/dbt/parser/base_sql.py
@@ -9,6 +9,7 @@
 
 from dbt.contracts.graph.unparsed import UnparsedNode
 from dbt.parser.base import MacrosKnownParser
+from dbt.node_types import NodeType
 
 
 class BaseSqlParser(MacrosKnownParser):
@@ -74,7 +75,21 @@ def parse_sql_node(self, node_dict, tags=None):
                                   node.name)
 
         project = self.all_projects.get(package_name)
+
+        parse_ok = True
+        if node.resource_type == NodeType.Model:
+            parse_ok = self.check_block_parsing(
+                node.name, node.original_file_path, node.raw_sql
+            )
+
         node_parsed = self.parse_node(node, unique_id, project, tags=tags)
+        if not parse_ok:
+            # if we had a parse error in parse_node, we would not get here. So
+            # this means we rejected a good file :(
+            raise dbt.exceptions.InternalException(
+                'the block parser rejected a good node: {} was marked invalid '
+                'but is actually valid!'.format(node.original_file_path)
+            )
         return unique_id, node_parsed
 
     def parse_sql_nodes(self, nodes, tags=None):

diff --git a/core/dbt/parser/schemas.py b/core/dbt/parser/schemas.py
@@ -278,6 +278,7 @@ def build_test_node(self, test_target, package_name, test, root_dir, path,
         full_path = get_pseudo_test_path(full_name, test_path, 'schema_test')
         raw_sql = self._build_raw_sql(test_namespace, test_target, test_type,
                                       test_args)
+
         unparsed = UnparsedNode(
             name=full_name,
             resource_type=NodeType.Test,
@@ -295,13 +296,23 @@ def build_test_node(self, test_target, package_name, test, root_dir, path,
         node_path = self.get_path(NodeType.Test, unparsed.package_name,
                                   unparsed.name)
 
-        return self.parse_node(unparsed,
-                               node_path,
-                               source_package,
-                               tags=['schema'],
-                               fqn_extra=None,
-                               fqn=fqn_override,
-                               column_name=column_name)
+        result = self.parse_node(unparsed,
+                                 node_path,
+                                 source_package,
+                                 tags=['schema'],
+                                 fqn_extra=None,
+                                 fqn=fqn_override,
+                                 column_name=column_name)
+
+        parse_ok = self.check_block_parsing(full_name, test_path, raw_sql)
+        if not parse_ok:
+            # if we had a parse error in parse_node, we would not get here. So
+            # this means we rejected a good file :(
+            raise dbt.exceptions.InternalException(
+                'the block parser rejected a good node: {} was marked invalid '
+                'but is actually valid!'.format(test_path)
+            )
+        return result
 
 
 class SchemaModelParser(SchemaBaseTestParser):

diff --git a/test/integration/base.py b/test/integration/base.py
@@ -441,16 +441,23 @@ def project_config(self):
     def profile_config(self):
         return {}
 
-    def run_dbt(self, args=None, expect_pass=True, strict=True):
+    def run_dbt(self, args=None, expect_pass=True, strict=True, parser=True):
         if args is None:
             args = ["run"]
 
+        final_args = []
+
         if strict:
-            args = ["--strict"] + args
-        args.append('--log-cache-events')
-        logger.info("Invoking dbt with {}".format(args))
+            final_args.append('--strict')
+        if parser:
+            final_args.append('--test-new-parser')
+
+        final_args.extend(args)
+        final_args.append('--log-cache-events')
+
+        logger.info("Invoking dbt with {}".format(final_args))
 
-        res, success = dbt.handle_and_check(args)
+        res, success = dbt.handle_and_check(final_args)
         self.assertEqual(
             success, expect_pass,
             "dbt exit state did not match expected")