From 01e9655176608900439b7655c1722bcd0c7b4aea Mon Sep 17 00:00:00 2001 From: Bastien Boutonnet Date: Wed, 21 Apr 2021 18:34:55 +0200 Subject: [PATCH] Support kms in haversine_distance macro (#340) --- CHANGELOG.md | 1 + README.md | 29 ++++++------ .../data/geo/data_haversine_km.csv | 2 + .../data/geo/data_haversine_mi.csv | 2 + integration_tests/dbt_project.yml | 4 +- integration_tests/macros/tests.sql | 1 - integration_tests/models/geo/schema.yml | 13 ++++++ .../models/geo/test_haversine_distance_km.sql | 23 ++++++++++ .../models/geo/test_haversine_distance_mi.sql | 39 ++++++++++++++++ macros/geo/haversine_distance.sql | 44 ++++++++++++++++--- 10 files changed, 136 insertions(+), 22 deletions(-) create mode 100644 integration_tests/data/geo/data_haversine_km.csv create mode 100644 integration_tests/data/geo/data_haversine_mi.csv create mode 100644 integration_tests/models/geo/schema.yml create mode 100644 integration_tests/models/geo/test_haversine_distance_km.sql create mode 100644 integration_tests/models/geo/test_haversine_distance_mi.sql diff --git a/CHANGELOG.md b/CHANGELOG.md index a2deabbd..ec7dd910 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * Support a new argument, `zero_length_range_allowed` in the `mutually_exclusive_ranges` test ([#307](https://github.com/fishtown-analytics/dbt-utils/pull/307) [@zemekeng](https://github.com/zemekeneng)) * Add new schema test, `sequential_values` ([#318](https://github.com/fishtown-analytics/dbt-utils/pull/318), inspired by [@hundredwatt](https://github.com/hundredwatt)) * Support `quarter` in the `postgres__last_day` macro ([#333](https://github.com/fishtown-analytics/dbt-utils/pull/333/files), [@seunghanhong](https://github.com/seunghanhong)) +* Add new argument, `unit`, to `haversine_distance` [#340](https://github.com/fishtown-analytics/dbt-utils/pull/340) [@bastienboutonnet](https://github.com/bastienboutonnet) ## Fixes diff --git a/README.md b/README.md index cd615d3e..7bbe1dd3 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ Usage: --- ### Date/Time #### date_spine ([source](macros/datetime/date_spine.sql)) -This macro returns the sql required to build a date spine. The spine will include the `start_date` (if it is aligned to the `datepart`), but it will not include the `end_date`. +This macro returns the sql required to build a date spine. The spine will include the `start_date` (if it is aligned to the `datepart`), but it will not include the `end_date`. Usage: ``` @@ -111,9 +111,12 @@ Usage: #### haversine_distance ([source](macros/geo/haversine_distance.sql)) This macro calculates the [haversine distance](http://daynebatten.com/2015/09/latitude-longitude-distance-sql/) between a pair of x/y coordinates. -Usage: +Optionally takes a `unit` string parameter ('km' or 'mi') which defaults to miles (imperial system). + +**Usage:** + ``` -{{ dbt_utils.haversine_distance(lat1=,lon1=,lat2=,lon2=) }} +{{ dbt_utils.haversine_distance(lat1=,lon1=,lat2=,lon2=, unit='mi') }} ``` --- ### Schema Tests @@ -181,13 +184,13 @@ models: ``` -This macro can also be used at the column level. When this is done, the `expression` is evaluated against the column. +This macro can also be used at the column level. When this is done, the `expression` is evaluated against the column. ```yaml version: 2 -models: +models: - name: model_name - columns: + columns: - name: col_a tests: - dbt_utils.expression_is_true: @@ -197,7 +200,7 @@ models: - dbt_utils.expression_is_true: expression: '= 1' condition: col_a = 1 - + ``` @@ -361,7 +364,7 @@ models: upper_bound_column: ended_at partition_by: customer_id gaps: required - + # test that each customer can have subscriptions that start and end on the same date - name: subscriptions tests: @@ -512,9 +515,9 @@ An optional `quote_columns` parameter (`default=false`) can also be used if a co #### accepted_range ([source](macros/schema_tests/accepted_range.sql)) -This test checks that a column's values fall inside an expected range. Any combination of `min_value` and `max_value` is allowed, and the range can be inclusive or exclusive. Provide a `where` argument to filter to specific records only. +This test checks that a column's values fall inside an expected range. Any combination of `min_value` and `max_value` is allowed, and the range can be inclusive or exclusive. Provide a `where` argument to filter to specific records only. -In addition to comparisons to a scalar value, you can also compare to another column's values. Any data type that supports the `>` or `<` operators can be compared, so you could also run tests like checking that all order dates are in the past. +In addition to comparisons to a scalar value, you can also compare to another column's values. Any data type that supports the `>` or `<` operators can be compared, so you could also run tests like checking that all order dates are in the past. Usage: ```yaml @@ -528,19 +531,19 @@ models: - dbt_utils.accepted_range: min_value: 0 inclusive: false - + - name: account_created_at tests: - dbt_utils.accepted_range: max_value: "getdate()" #inclusive is true by default - + - name: num_returned_orders tests: - dbt_utils.accepted_range: min_value: 0 max_value: "num_orders" - + - name: num_web_sessions tests: - dbt_utils.accepted_range: diff --git a/integration_tests/data/geo/data_haversine_km.csv b/integration_tests/data/geo/data_haversine_km.csv new file mode 100644 index 00000000..636442f2 --- /dev/null +++ b/integration_tests/data/geo/data_haversine_km.csv @@ -0,0 +1,2 @@ +lat_1,lon_1,lat_2,lon_2,output +48.864716,2.349014,52.379189,4.899431,430 diff --git a/integration_tests/data/geo/data_haversine_mi.csv b/integration_tests/data/geo/data_haversine_mi.csv new file mode 100644 index 00000000..cc4757f7 --- /dev/null +++ b/integration_tests/data/geo/data_haversine_mi.csv @@ -0,0 +1,2 @@ +lat_1,lon_1,lat_2,lon_2,output +48.864716,2.349014,52.379189,4.899431,267 diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index 59f8ad0e..4b5b10eb 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -53,8 +53,8 @@ seeds: sql: data_events_20180103: +schema: events - + schema_tests: data_test_sequential_timestamps: +column_types: - my_timestamp: timestamp \ No newline at end of file + my_timestamp: timestamp diff --git a/integration_tests/macros/tests.sql b/integration_tests/macros/tests.sql index 9ca536e4..caeb5d6e 100644 --- a/integration_tests/macros/tests.sql +++ b/integration_tests/macros/tests.sql @@ -1,6 +1,5 @@ {% macro test_assert_equal(model, actual, expected) %} - select count(*) from {{ model }} where {{ actual }} != {{ expected }} {% endmacro %} diff --git a/integration_tests/models/geo/schema.yml b/integration_tests/models/geo/schema.yml new file mode 100644 index 00000000..50dd0241 --- /dev/null +++ b/integration_tests/models/geo/schema.yml @@ -0,0 +1,13 @@ +version: 2 + +models: + - name: test_haversine_distance_km + tests: + - assert_equal: + actual: actual + expected: expected + - name: test_haversine_distance_mi + tests: + - assert_equal: + actual: actual + expected: expected diff --git a/integration_tests/models/geo/test_haversine_distance_km.sql b/integration_tests/models/geo/test_haversine_distance_km.sql new file mode 100644 index 00000000..e8ca4818 --- /dev/null +++ b/integration_tests/models/geo/test_haversine_distance_km.sql @@ -0,0 +1,23 @@ +with data as ( + select * from {{ ref('data_haversine_km') }} +), +final as ( + select + output as expected, + cast( + {{ + dbt_utils.haversine_distance( + lat1='lat_1', + lon1='lon_1', + lat2='lat_2', + lon2='lon_2', + unit='km' + ) + }} as {{ dbt_utils.type_numeric() }} + ) as actual + from data +) +select + expected, + round(actual,0) as actual +from final diff --git a/integration_tests/models/geo/test_haversine_distance_mi.sql b/integration_tests/models/geo/test_haversine_distance_mi.sql new file mode 100644 index 00000000..53f8172c --- /dev/null +++ b/integration_tests/models/geo/test_haversine_distance_mi.sql @@ -0,0 +1,39 @@ +with data as ( + select * from {{ ref('data_haversine_mi') }} +), +final as ( + select + output as expected, + cast( + {{ + dbt_utils.haversine_distance( + lat1='lat_1', + lon1='lon_1', + lat2='lat_2', + lon2='lon_2', + unit='mi' + ) + }} as {{ dbt_utils.type_numeric() }} + ) as actual + from data + + union all + + select + output as expected, + cast( + {{ + dbt_utils.haversine_distance( + lat1='lat_1', + lon1='lon_1', + lat2='lat_2', + lon2='lon_2', + ) + }} as {{ dbt_utils.type_numeric() }} + ) as actual + from data +) +select + expected, + round(actual,0) as actual +from final diff --git a/macros/geo/haversine_distance.sql b/macros/geo/haversine_distance.sql index ed6c8281..38fb869e 100644 --- a/macros/geo/haversine_distance.sql +++ b/macros/geo/haversine_distance.sql @@ -3,17 +3,49 @@ This calculates the distance between two sets of latitude and longitude. The formula is from the following blog post: http://daynebatten.com/2015/09/latitude-longitude-distance-sql/ -The arguments should be float type. +The arguments should be float type. #} -{% macro haversine_distance(lat1,lon1,lat2,lon2) -%} - {{ return(adapter.dispatch('haversine_distance', packages = dbt_utils._get_utils_namespaces())(lat1,lon1,lat2,lon2)) }} +{% macro degrees_to_radians(degrees) -%} + acos(-1) * {{degrees}} / 180 +{%- endmacro %} + +{% macro haversine_distance(lat1, lon1, lat2, lon2, unit='mi') -%} + {{ return(adapter.dispatch('haversine_distance', packages = dbt_utils._get_utils_namespaces())(lat1,lon1,lat2,lon2,unit)) }} {% endmacro %} -{% macro default__haversine_distance(lat1,lon1,lat2,lon2) -%} +{% macro default__haversine_distance(lat1, lon1, lat2, lon2, unit='mi') -%} +{%- if unit == 'mi' %} + {% set conversion_rate = 1 %} +{% elif unit == 'km' %} + {% set conversion_rate = 1.60934 %} +{% else %} + {{ exceptions.raise_compiler_error("unit input must be one of 'mi' or 'km'. Got " ~ unit) }} +{% endif %} - 2 * 3961 * asin(sqrt((sin(radians(({{lat2}} - {{lat1}}) / 2))) ^ 2 + + 2 * 3961 * asin(sqrt(pow((sin(radians(({{ lat2 }} - {{ lat1 }}) / 2))), 2) + cos(radians({{lat1}})) * cos(radians({{lat2}})) * - (sin(radians(({{lon2}} - {{lon1}}) / 2))) ^ 2)) + pow((sin(radians(({{ lon2 }} - {{ lon1 }}) / 2))), 2))) * {{ conversion_rate }} + +{%- endmacro %} + + + +{% macro bigquery__haversine_distance(lat1, lon1, lat2, lon2, unit='mi') -%} +{% set radians_lat1 = dbt_utils.degrees_to_radians(lat1) %} +{% set radians_lat2 = dbt_utils.degrees_to_radians(lat2) %} +{% set radians_lon1 = dbt_utils.degrees_to_radians(lon1) %} +{% set radians_lon2 = dbt_utils.degrees_to_radians(lon2) %} +{%- if unit == 'mi' %} + {% set conversion_rate = 1 %} +{% elif unit == 'km' %} + {% set conversion_rate = 1.60934 %} +{% else %} + {{ exceptions.raise_compiler_error("unit input must be one of 'mi' or 'km'. Got " ~ unit) }} +{% endif %} + 2 * 3961 * asin(sqrt(pow(sin(({{ radians_lat2 }} - {{ radians_lat1 }}) / 2), 2) + + cos({{ radians_lat1 }}) * cos({{ radians_lat2 }}) * + pow(sin(({{ radians_lon2 }} - {{ radians_lon1 }}) / 2), 2))) * {{ conversion_rate }} {%- endmacro %} +