diff --git a/.github/workflows/pull_request_build.yml b/.github/workflows/pull_request_build.yml index d7d5d05..fad51e9 100644 --- a/.github/workflows/pull_request_build.yml +++ b/.github/workflows/pull_request_build.yml @@ -87,6 +87,7 @@ jobs: run: "dbt build --fail-fast --empty" - name: Generate Docs Combining Prod and branch catalog.json + if: ${{ steps.prod_manifest.outputs.manifest_found == 'true' && contains(github.event.pull_request.labels.*.name, 'full-refresh') != true }} run: "dbt-coves generate docs --merge-deferred --state logs" - name: Run governance checks diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index be781c8..ef6f3d0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,9 +1,8 @@ files: ^transform/models/ repos: - - repo: https://github.com/dbt-checkpoint/dbt-checkpoint - rev: v2.0.1 + rev: v2.0.5 hooks: - id: check-source-table-has-description @@ -12,12 +11,13 @@ repos: - id: check-script-ref-and-source - id: check-model-has-description - id: check-model-has-properties-file - - id: check-model-has-all-columns - + # - id: check-model-has-all-columns + # - id: check-database-casing-consistency + always_run: true - repo: https://github.com/sqlfluff/sqlfluff # this is the version of sqlfluff, needs to be updated when using a new sqlfluff version (pip show sqlfluff) - rev: 2.3.2 + rev: 3.1.1 hooks: - id: sqlfluff-lint language: python @@ -25,19 +25,16 @@ repos: # sqlfluff-templater-dbt should match the version of sqlfluff above in rev (pip show sqlfluff-templater-dbt) # dbt-snowflake needs to match the version in transform tab of Datacoves (pip show dbt-snowflake) additional_dependencies: - # ["sqlfluff-templater-dbt==2.3.2", "dbt-snowflake==1.6.8", dbt-core==1.6.9] [ - "sqlfluff-templater-dbt==2.3.2", - "dbt-redshift==1.6.7", - dbt-core==1.6.9, + "sqlfluff-templater-dbt==3.1.1", + "dbt-core==1.8.7", + "dbt-snowflake==1.8.4", ] - args: [--config, "transform/.sqlfluff"] - + args: [--config, transform/.sqlfluff] - 
repo: https://github.com/adrienverge/yamllint.git - rev: v1.17.0 + rev: v1.35.1 hooks: - id: yamllint args: [-c=.yamllint] exclude: ^transform/.dbt_coves/templates - diff --git a/transform/.yamllint b/.yamllint similarity index 100% rename from transform/.yamllint rename to .yamllint diff --git a/automate/dbt/get_artifacts.sh b/automate/dbt/get_artifacts.sh old mode 100644 new mode 100755 index 0c537bf..7182f43 --- a/automate/dbt/get_artifacts.sh +++ b/automate/dbt/get_artifacts.sh @@ -1,4 +1,3 @@ - #! /bin/bash # Cause script to exit on error @@ -9,7 +8,13 @@ cd $DATACOVES__DBT_HOME mkdir -p logs dbt run-operation get_last_artifacts + +# Check if manifest.json exists; count its lines if it does, otherwise set the count to 0 +if [ -e "logs/manifest.json" ]; then LINES_IN_MANIFEST="$(grep -c '^' logs/manifest.json)" +else + LINES_IN_MANIFEST="0" +fi if [ $LINES_IN_MANIFEST -eq 0 ] then diff --git a/automate/dbt/profiles.yml b/automate/dbt/profiles.yml index d91e94b..b4d437c 100644 --- a/automate/dbt/profiles.yml +++ b/automate/dbt/profiles.yml @@ -5,7 +5,7 @@ default: type: snowflake threads: 16 client_session_keep_alive: true - + account: "{{ env_var('DATACOVES__MAIN__ACCOUNT') }}" database: "{{ env_var('DATACOVES__MAIN__DATABASE') }}" schema: "{{ env_var('DATACOVES__MAIN__SCHEMA') }}" @@ -13,4 +13,3 @@ default: password: "{{ env_var('DATACOVES__MAIN__PASSWORD') }}" role: "{{ env_var('DATACOVES__MAIN__ROLE') }}" warehouse: "{{ env_var('DATACOVES__MAIN__WAREHOUSE') }}" - diff --git a/automate/dbt/remove_test_databases.sh b/automate/dbt/remove_test_databases.sh old mode 100644 new mode 100755 diff --git a/transform/dbt_project.yml b/transform/dbt_project.yml index 9ce6984..a558ad5 100644 --- a/transform/dbt_project.yml +++ b/transform/dbt_project.yml @@ -37,6 +37,10 @@ models: datacoves_starter_project: L1_staging: +materialized: view + loans: + +materialized: view + country_data: + +materialized: view L2_core: +materialized: view L3_marts: diff --git 
a/transform/macros/cicd/get_last_artifacts.sql b/transform/macros/cicd/get_last_artifacts.sql index 80331a2..0ee6104 100644 --- a/transform/macros/cicd/get_last_artifacts.sql +++ b/transform/macros/cicd/get_last_artifacts.sql @@ -1,30 +1,48 @@ {# Macro for returning dbt manifest from a snowflake stage. #} {# dbt run-operation get_last_artifacts -#} + #} {# Once this is completed, deferral and state modifiers are available using --state logs #} -{% macro get_last_artifacts(stage = 'RAW.DBT_ARTIFACTS.ARTIFACTS') %} - {# we will put the manifest.json in the log directory and use the with the --state param in dbt #} +{% macro get_last_artifacts() %} + {# Fallback variable used to run/debug macro in vscode #} + {% set stage_name = 'RAW.DBT_ARTIFACTS.ARTIFACTS' %} + + {# We will put the manifest.json in the log directory and use it with the --state param in dbt #} {% set logs_dir = env_var('DATACOVES__DBT_HOME') ~ "/logs/" %} {# List only the .json files in the root folder (excludes archive dir) #} {% set list_stage_query %} - LIST @{{ stage }} PATTERN = '^((?!(archive/)).)*.json$'; + LIST @{{ stage_name }} PATTERN = '^((?!(archive/)).)*.json$'; {% endset %} - {{ print("\nCurrent items in stage " ~ stage) }} + {{ print("\nCurrent items in stage " ~ stage_name) }} {% set results = run_query(list_stage_query) %} {{ results.exclude('md5').print_table(max_column_width=40) }} {{ print("\n" ~ "="*85) }} - {% set artifacts_destination = "file://" + logs_dir %} + {% if results and results.rows %} - {% set get_query %} - get @{{ stage }}/manifest.json {{ artifacts_destination }}; - get @{{ stage }}/catalog.json {{ artifacts_destination }}; - {% endset %} + {% set artifacts_destination = "file://" + logs_dir %} + + {# Download and print manifest.json #} + {% set get_manifest_query %} + get @{{ stage_name }}/manifest.json {{ artifacts_destination }}; + {% endset %} + {% set download_manifest_results = run_query(get_manifest_query) %} + {{ print("Manifest Downloaded") }} + {{ 
download_manifest_results.print_table(max_column_width=40) }} + + {# Download and print catalog.json #} + {% set get_catalog_query %} + get @{{ stage_name }}/catalog.json {{ artifacts_destination }}; + {% endset %} + {% set download_catalog_results = run_query(get_catalog_query) %} + {{ print("Catalog Downloaded") }} + {{ download_catalog_results.print_table(max_column_width=40) }} - {% set results = run_query(get_query) %} + {% else %} + {{ print("No artifacts found in stage " ~ stage_name ~ ". Skipping file download.") }} + {% endif %} {% endmacro %} diff --git a/transform/macros/cicd/grant_access_to_pr_database.sql b/transform/macros/cicd/grant_access_to_pr_database.sql index 2cd4223..97a08ef 100644 --- a/transform/macros/cicd/grant_access_to_pr_database.sql +++ b/transform/macros/cicd/grant_access_to_pr_database.sql @@ -5,7 +5,7 @@ #} {%- macro grant_access_to_pr_database() -%} - {% set db_role_name = 'z_db_balboa_tst' %} + {% set db_role_name = 'analyst' %} {% set db_name = target.database %} {% set apply_db_grants_sql %} @@ -13,20 +13,20 @@ {% endset %} {% do run_query(apply_db_grants_sql) %} - {% set schemas_list %} + select schema_name from {{ db_name }}.information_schema.schemata where schema_name not in ('INFORMATION_SCHEMA','PUBLIC','DBT_TEST__AUDIT') {% endset %} {% set schemas = run_query(schemas_list) %} {% for schema in schemas %} {% set apply_schema_grants_sql %} - grant usage on schema {{db_name}}.{{ schema[0] }} to z_schema_{{schema[0]}}; - grant select on all tables in schema {{db_name}}.{{ schema[0] }} to role z_tables_views_general; - grant select on all views in schema {{db_name}}.{{ schema[0] }} to role z_tables_views_general; + grant usage on schema {{db_name}}.{{ schema[0] }} to role {{db_role_name}}; + grant select on all tables in schema {{db_name}}.{{ schema[0] }} to role {{db_role_name}}; + grant select on all views in schema {{db_name}}.{{ schema[0] }} to role {{db_role_name}}; {% endset %} {% do 
run_query(apply_schema_grants_sql) %} diff --git a/transform/macros/create_database.sql b/transform/macros/create_database.sql index 9245dd2..795211a 100644 --- a/transform/macros/create_database.sql +++ b/transform/macros/create_database.sql @@ -12,7 +12,7 @@ identifier="tables") -%} {% if not database_exists %} {% set create_db_sql %} - use role transformer_dbt; + use role analyst; create database {{ target.database }}; grant ownership on database {{ target.database }} to role {{ target.role }}; use role {{ target.role }}; diff --git a/transform/models/L1_staging/country_data/stg_country_populations.yml b/transform/models/L1_staging/country_data/stg_country_populations.yml index 63d6f1c..3bf856f 100644 --- a/transform/models/L1_staging/country_data/stg_country_populations.yml +++ b/transform/models/L1_staging/country_data/stg_country_populations.yml @@ -1,15 +1,15 @@ version: 2 models: - - name: COUNTRY_POPULATIONS + - name: stg_country_populations description: 'Raw population information from Github Datasets repository' columns: - name: year description: The year for which the population value is recorded - data_tests: - - not_null - name: country_name description: The name of the country + data_tests: + - not_null - name: value description: The population value for a particular year and country - name: country_code diff --git a/transform/models/L1_staging/loans/_loans.yml b/transform/models/L1_staging/loans/_loans.yml new file mode 100644 index 0000000..26b9fd6 --- /dev/null +++ b/transform/models/L1_staging/loans/_loans.yml @@ -0,0 +1,8 @@ +version: 2 + +sources: + - name: MAYRAPENA1324 + database: RAW + tables: + - name: PERSONAL_LOANS + description: 'A personal loans source table' diff --git a/transform/models/L1_staging/loans/stg_mayrapena1324_personal_loans.sql b/transform/models/L1_staging/loans/stg_mayrapena1324_personal_loans.sql new file mode 100644 index 0000000..3e9d375 --- /dev/null +++ 
b/transform/models/L1_staging/loans/stg_mayrapena1324_personal_loans.sql @@ -0,0 +1,71 @@ +with raw_source as ( + + select * + from {{ source('MAYRAPENA1324', 'PERSONAL_LOANS') }} + +), + +final as ( + + select + "_AIRBYTE_RAW_ID"::varchar as airbyte_raw_id, + "_AIRBYTE_EXTRACTED_AT"::timestamp_tz as airbyte_extracted_at, + "_AIRBYTE_META"::variant as airbyte_meta, + "TOTAL_ACC"::float as total_acc, + "ANNUAL_INC"::float as annual_inc, + "EMP_LENGTH"::varchar as emp_length, + "DESC"::varchar as desc, + "TOTAL_PYMNT"::float as total_pymnt, + "LAST_PYMNT_D"::varchar as last_pymnt_d, + "ADDR_STATE"::varchar as addr_state, + "NEXT_PYMNT_D"::varchar as next_pymnt_d, + "EMP_TITLE"::varchar as emp_title, + "COLLECTION_RECOVERY_FEE"::float as collection_recovery_fee, + "MTHS_SINCE_LAST_MAJOR_DEROG"::float as mths_since_last_major_derog, + "INQ_LAST_6MTHS"::float as inq_last_6mths, + "SUB_GRADE"::varchar as sub_grade, + "FUNDED_AMNT_INV"::float as funded_amnt_inv, + "DELINQ_2YRS"::float as delinq_2yrs, + "LOAN_ID"::varchar as loan_id, + "FUNDED_AMNT"::float as funded_amnt, + "VERIFICATION_STATUS"::varchar as verification_status, + "DTI"::float as dti, + "TOTAL_REC_PRNCP"::float as total_rec_prncp, + "GRADE"::varchar as grade, + "HOME_OWNERSHIP"::varchar as home_ownership, + "ISSUE_D"::varchar as issue_d, + "MTHS_SINCE_LAST_DELINQ"::float as mths_since_last_delinq, + "OUT_PRNCP"::float as out_prncp, + "PUB_REC"::float as pub_rec, + "INT_RATE"::float as int_rate, + "ZIP_CODE"::varchar as zip_code, + "OPEN_ACC"::float as open_acc, + "TERM"::varchar as term, + "PYMNT_PLAN"::varchar as pymnt_plan, + "URL"::varchar as url, + "REVOL_BAL"::float as revol_bal, + "RECOVERIES"::float as recoveries, + "LAST_PYMNT_AMNT"::float as last_pymnt_amnt, + "LOAN_AMNT"::float as loan_amnt, + "PURPOSE"::varchar as purpose, + "INITIAL_LIST_STATUS"::varchar as initial_list_status, + "TOTAL_REC_INT"::float as total_rec_int, + "TOTAL_PYMNT_INV"::float as total_pymnt_inv, + 
"MTHS_SINCE_LAST_RECORD"::float as mths_since_last_record, + "LAST_CREDIT_PULL_D"::varchar as last_credit_pull_d, + "TOTAL_REC_LATE_FEE"::float as total_rec_late_fee, + "MEMBER_ID"::float as member_id, + "POLICY_CODE"::float as policy_code, + "TITLE"::varchar as title, + "LOAN_STATUS"::varchar as loan_status, + "INSTALLMENT"::float as installment, + "EARLIEST_CR_LINE"::varchar as earliest_cr_line, + "REVOL_UTIL"::varchar as revol_util, + "OUT_PRNCP_INV"::float as out_prncp_inv, + "COLLECTIONS_12_MTHS_EX_MED"::float as collections_12_mths_ex_med + + from raw_source + +) + +select * from final diff --git a/transform/models/L1_staging/loans/stg_mayrapena1324_personal_loans.yml b/transform/models/L1_staging/loans/stg_mayrapena1324_personal_loans.yml new file mode 100644 index 0000000..3e2ffe1 --- /dev/null +++ b/transform/models/L1_staging/loans/stg_mayrapena1324_personal_loans.yml @@ -0,0 +1,61 @@ +version: 2 + +models: + - name: stg_mayrapena1324_personal_loans + description: 'A staging model for personal loans' + columns: + - name: airbyte_raw_id + - name: airbyte_extracted_at + - name: airbyte_meta + - name: total_acc + - name: annual_inc + - name: emp_length + - name: desc + - name: total_pymnt + - name: last_pymnt_d + - name: addr_state + - name: next_pymnt_d + - name: emp_title + - name: collection_recovery_fee + - name: mths_since_last_major_derog + - name: inq_last_6mths + - name: sub_grade + - name: funded_amnt_inv + - name: delinq_2yrs + - name: loan_id + - name: funded_amnt + - name: verification_status + - name: dti + - name: total_rec_prncp + - name: grade + - name: home_ownership + - name: issue_d + - name: mths_since_last_delinq + - name: out_prncp + - name: pub_rec + - name: int_rate + - name: zip_code + - name: open_acc + - name: term + - name: pymnt_plan + - name: url + - name: revol_bal + - name: recoveries + - name: last_pymnt_amnt + - name: loan_amnt + - name: purpose + - name: initial_list_status + - name: total_rec_int + - name: total_pymnt_inv 
+ - name: mths_since_last_record + - name: last_credit_pull_d + - name: total_rec_late_fee + - name: member_id + - name: policy_code + - name: title + - name: loan_status + - name: installment + - name: earliest_cr_line + - name: revol_util + - name: out_prncp_inv + - name: collections_12_mths_ex_med diff --git a/transform/models/L2_core/mayrapena1324_avg_by_grade.sql b/transform/models/L2_core/mayrapena1324_avg_by_grade.sql new file mode 100644 index 0000000..61625c0 --- /dev/null +++ b/transform/models/L2_core/mayrapena1324_avg_by_grade.sql @@ -0,0 +1,20 @@ +with raw_source as ( + + select * from {{ ref('stg_mayrapena1324_personal_loans') }} + +), + +final as ( + + select + grade, + avg(loan_amnt) as avg_loan_amount, + count(*) as total_loans + from raw_source + where loan_status = 'Fully Paid' + group by grade + order by grade + +) + +select * from final diff --git a/transform/models/L2_core/mayrapena1324_avg_by_grade.yml b/transform/models/L2_core/mayrapena1324_avg_by_grade.yml new file mode 100644 index 0000000..dfc2b70 --- /dev/null +++ b/transform/models/L2_core/mayrapena1324_avg_by_grade.yml @@ -0,0 +1,9 @@ +version: 2 + +models: + - name: mayrapena1324_avg_by_grade + description: 'An average loan amount by grade' + columns: + - name: grade + - name: avg_loan_amount + - name: total_loans