-
Notifications
You must be signed in to change notification settings - Fork 136
overriding microbatch macro for capturing the compiled code of the microbatch models #1000
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
36e1956
bbd163d
b6aac49
cecb3a6
4271d1f
1c26b1a
7ae883b
677b181
a80518a
dd629fd
70ccd14
736b2a1
9c2d74f
20b2fae
35a35ce
b4407f0
51d3872
6f358de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,143 @@ | ||
| from contextlib import contextmanager | ||
|
|
||
| import pytest | ||
|
|
||
| from dbt_project import DbtProject | ||
|
|
||
|
|
||
| def _microbatch_source_model_sql() -> str: | ||
| return """ | ||
| {{ config(event_time='order_date') }} | ||
| {% set event_time_data_type = 'datetime2' if target.type == 'sqlserver' else 'timestamp' %} | ||
|
|
||
| select | ||
| 1 as order_id, | ||
| 1 as customer_id, | ||
| 42 as amount, | ||
| cast('2024-01-01 00:00:00' as {{ event_time_data_type }}) as order_date | ||
| union all | ||
| select | ||
| 2 as order_id, | ||
| 2 as customer_id, | ||
| 84 as amount, | ||
| cast('2025-01-01 00:00:00' as {{ event_time_data_type }}) as order_date | ||
| """ | ||
|
|
||
|
|
||
| def _microbatch_model_sql(source_model_name: str) -> str: | ||
| return """ | ||
| {% set model_config = { | ||
| "materialized": "incremental", | ||
| "incremental_strategy": "microbatch", | ||
| "event_time": "order_date", | ||
| "batch_size": "year", | ||
| "begin": "2024-01-01" | ||
| } %} | ||
| {% if target.type != "duckdb" %} | ||
| {% do model_config.update({"unique_key": "order_id"}) %} | ||
| {% endif %} | ||
| {{ config(**model_config) }} | ||
|
|
||
| select | ||
| order_id, | ||
| customer_id, | ||
| amount, | ||
| order_date | ||
| from {{ ref('__MICROBATCH_SOURCE_MODEL__') }} | ||
| """.replace("__MICROBATCH_SOURCE_MODEL__", source_model_name) | ||
|
|
||
|
|
||
| @contextmanager | ||
| def _with_microbatch_test_models(dbt_project: DbtProject, model_suffix: str): | ||
| source_model_name = f"mb_src_{model_suffix}" | ||
| target_model_name = f"mb_tgt_{model_suffix}" | ||
| source_model_path = dbt_project.tmp_models_dir_path.joinpath(f"{source_model_name}.sql") | ||
| target_model_path = dbt_project.tmp_models_dir_path.joinpath(f"{target_model_name}.sql") | ||
|
|
||
| source_model_path.write_text(_microbatch_source_model_sql()) | ||
| target_model_path.write_text(_microbatch_model_sql(source_model_name)) | ||
| relative_source_model_path = source_model_path.relative_to(dbt_project.project_dir_path) | ||
| relative_target_model_path = target_model_path.relative_to(dbt_project.project_dir_path) | ||
| try: | ||
| yield relative_source_model_path, relative_target_model_path, target_model_name | ||
| finally: | ||
| if source_model_path.exists(): | ||
| source_model_path.unlink() | ||
| if target_model_path.exists(): | ||
| target_model_path.unlink() | ||
|
|
||
|
|
||
| def _run_microbatch_model_and_get_latest_success_result( | ||
| dbt_project: DbtProject, model_suffix: str | ||
| ): | ||
| with _with_microbatch_test_models(dbt_project, model_suffix) as ( | ||
| source_model_path, | ||
| model_path, | ||
| target_model_name, | ||
| ): | ||
| dbt_project.dbt_runner.run( | ||
| select=f"{source_model_path} {model_path}" | ||
| ) | ||
|
|
||
| unique_id = f"model.elementary_tests.{target_model_name}" | ||
| run_results = dbt_project.read_table( | ||
| "dbt_run_results", | ||
| where=f"unique_id = '{unique_id}' and status = 'success'", | ||
| order_by="generated_at desc", | ||
| limit=1, | ||
| ) | ||
| return run_results | ||
|
|
||
|
|
||
| @contextmanager | ||
| def _with_microbatch_macro_file(dbt_project: DbtProject, macro_name: str): | ||
| macro_path = ( | ||
| dbt_project.project_dir_path / "macros" / "microbatch.sql" | ||
| ) | ||
| macro_sql = """ | ||
| {% macro __MACRO_NAME__(arg_dict) %} | ||
| {{ return(elementary.get_incremental_microbatch_sql(arg_dict)) }} | ||
| {% endmacro %} | ||
| """.replace("__MACRO_NAME__", macro_name) | ||
| if macro_path.exists(): | ||
| raise FileExistsError(f"Expected no macro file at {macro_path}") | ||
|
|
||
| macro_path.write_text(macro_sql) | ||
| try: | ||
| yield | ||
| finally: | ||
| if macro_path.exists(): | ||
| macro_path.unlink() | ||
|
|
||
|
|
||
| @pytest.mark.skip_targets(["spark", "vertica", "bigquery", "athena", "clickhouse", "dremio"]) | ||
| @pytest.mark.skip_for_dbt_fusion | ||
| @pytest.mark.parametrize( | ||
| "macro_name,expected_compiled_code,model_suffix", | ||
| [ | ||
| ("get_incremental_microbatch_sql", True, "with_override"), | ||
| ("get_incremental_microbatch_sql_not_used", False, "without_override"), | ||
| ], | ||
| ids=["with_override", "without_override"], | ||
| ) | ||
| def test_microbatch_run_results_compiled_code_behavior( | ||
| dbt_project: DbtProject, | ||
| macro_name: str, | ||
| expected_compiled_code: bool, | ||
| model_suffix: str, | ||
| ): | ||
| dbt_project.dbt_runner.vars["disable_run_results"] = False | ||
|
|
||
| with _with_microbatch_macro_file(dbt_project, macro_name): | ||
| run_results = _run_microbatch_model_and_get_latest_success_result( | ||
| dbt_project, model_suffix | ||
| ) | ||
| assert run_results, "Expected a successful run result row for microbatch model" | ||
| if expected_compiled_code: | ||
| assert run_results[0]["compiled_code"], ( | ||
| "Expected compiled_code to be populated when override macro is present" | ||
| ) | ||
| else: | ||
| assert not run_results[0]["compiled_code"], ( | ||
| "Expected compiled_code to stay empty when override macro is absent" | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| {#- | ||
| NOTE FOR PACKAGE CONSUMERS: | ||
| This package macro is not guaranteed to be picked up automatically by dbt's | ||
| incremental strategy resolution in all projects. | ||
| To apply this behavior, users should: | ||
| 1) Override `get_incremental_microbatch_sql` in their own project and delegate to | ||
| `elementary.get_incremental_microbatch_sql(arg_dict)`. | ||
| 2) Enable dbt behavior flag `require_batched_execution_for_custom_microbatch_strategy`. | ||
|
|
||
| This flow is currently not supported for adapters: | ||
| - spark | ||
| - bigquery | ||
| - athena | ||
| - clickhouse | ||
| - dremio | ||
| - vertica | ||
|
|
||
| This flow is currently not supported for dbt Fusion. | ||
| -#} | ||
| {% macro get_incremental_microbatch_sql(arg_dict) %} | ||
| {% if execute and model is defined %} | ||
| {% do elementary.capture_microbatch_compiled_code_for_model() %} | ||
| {% endif %} | ||
|
|
||
| {{ return(adapter.dispatch("get_incremental_microbatch_sql", "dbt")(arg_dict)) }} | ||
| {% endmacro %} | ||
|
|
||
|
|
||
| {% macro capture_microbatch_compiled_code_for_model() %} | ||
| {% set model_unique_id = ( | ||
| model.get("unique_id") if model is mapping else model.unique_id | ||
| ) | default(none, true) %} | ||
| {% set model_compiled_code = ( | ||
| model.get("compiled_code") if model is mapping else model.compiled_code | ||
| ) | default(none, true) %} | ||
| {% if model_unique_id is none %} | ||
| {{ return(none) }} | ||
| {% endif %} | ||
| {% if not model_compiled_code %} | ||
| {{ return(none) }} | ||
| {% endif %} | ||
|
|
||
| {% set compiled_code_by_unique_id = elementary.get_cache( | ||
| "microbatch_compiled_code_by_unique_id", {} | ||
| ) %} | ||
| {% if model_unique_id in compiled_code_by_unique_id %} | ||
| {{ return(none) }} | ||
| {% endif %} | ||
|
|
||
| {% do compiled_code_by_unique_id.update({model_unique_id: model_compiled_code}) %} | ||
| {% do elementary.set_cache( | ||
| "microbatch_compiled_code_by_unique_id", compiled_code_by_unique_id | ||
| ) %} | ||
| {% endmacro %} | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,11 @@ | ||
| {% macro get_compiled_code(node, as_column_value=false) %} | ||
| {% set compiled_code = adapter.dispatch("get_compiled_code", "elementary")(node) %} | ||
| {% set compiled_code = node.get("compiled_code") or node.get("compiled_sql") %} | ||
| {% if not compiled_code and node and node.get("unique_id") %} | ||
| {% set compiled_code = elementary.get_cache( | ||
| "microbatch_compiled_code_by_unique_id", {} | ||
| ).get(node.get("unique_id")) %} | ||
| {% endif %} | ||
|
Comment on lines
+3
to
+7
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🔴 Microbatch cache fallback bypasses Redshift escaping. On Redshift, [review comment truncated in this capture]. Prompt for agents. Was this helpful? React with 👍 or 👎 to provide feedback.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
| {% set compiled_code = adapter.dispatch("format_compiled_code", "elementary")(compiled_code) %} | ||
|
|
||
| {% set max_column_size = elementary.get_column_size() %} | ||
| {% if as_column_value and max_column_size and compiled_code and compiled_code | length > max_column_size %} | ||
|
|
@@ -9,12 +15,11 @@ | |
| {% do return(compiled_code) %} | ||
| {% endmacro %} | ||
|
|
||
| {% macro default__get_compiled_code(node) %} | ||
| {% do return(node.get("compiled_code") or node.get("compiled_sql")) %} | ||
| {% macro default__format_compiled_code(compiled_code) %} | ||
| {% do return(compiled_code) %} | ||
| {% endmacro %} | ||
|
|
||
| {% macro redshift__get_compiled_code(node) %} | ||
| {% set compiled_code = node.get("compiled_code") or node.get("compiled_sql") %} | ||
| {% macro redshift__format_compiled_code(compiled_code) %} | ||
| {% if not compiled_code %} {% do return(none) %} | ||
| {% else %} {% do return(compiled_code.replace("%", "%%")) %} | ||
| {% endif %} | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
This
set_cache could have a race if multiple dbt models run in parallel, and all of them update the initial dict. I think instead you should create the
microbatch_compiled_code_by_unique_id in the init_elementary_graph macro, and then I think you don't even need to do set_cache. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed