From f4c96baac9e99c8c2df8dcdc056dde14af173686 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Tue, 9 Jul 2024 21:30:33 -0400 Subject: [PATCH 1/7] adds iceberg support --- NEWS.md | 8 ++++++-- Project.toml | 2 +- src/TidierDB.jl | 49 ++++++++++++++++++++++++++++++++----------------- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/NEWS.md b/NEWS.md index f123f11..172dedd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,11 +1,15 @@ # TidierDB.jl updates -## v0.2.2 - 2024-07-07 +## v0.2.4 - 2024-07-10 +- Changes DuckDB compat to 1.0 +- Adds support for `iceberg` tables via DuckDB to allow iceberg paths in `db_table` when `iceberg = true` + +## v0.2.3 - 2024-07-07 - Adds direct path support for `db_table` when using DuckDB - Adds `connect` ability for AWS and Google Cloud to allow querying via S3 + DuckDB - Adds documentation for S3 + DuckDB with TidierDB -## v0.2.1 - 2024-06-27 +## v0.2.2 - 2024-06-27 - Adds support for Databricks SQL Rest API - Adds docs for Databricks use - Fixes float/int type conversion when Snowflake collects to dataframe diff --git a/Project.toml b/Project.toml index 1aa4ffc..aa130e5 100644 --- a/Project.toml +++ b/Project.toml @@ -29,7 +29,7 @@ Chain = "0.6" ClickHouse = "0.2" DataFrames = "1.5" Documenter = "0.27, 1" -DuckDB = "0.10" +DuckDB = "1.0" GoogleCloud = "0.11" HTTP = "1.1" JSON3 = "1.1" diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 34206ca..5bb04a4 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -176,9 +176,15 @@ end # DuckDB function get_table_metadata(conn::DuckDB.Connection, table_name::String) - query = """ - DESCRIBE SELECT * FROM '$(table_name)' LIMIT 0 - """ + query = if occursin("iceberg_scan", table_name) + """ + DESCRIBE SELECT * FROM $(table_name) LIMIT 0 + """ + else + """ + DESCRIBE SELECT * FROM '$(table_name)' LIMIT 0 + """ + end result = DuckDB.execute(conn, query) |> DataFrame result[!, :current_selxn] .= 1 table_name = if occursin(r"[:/]", table_name) @@ -255,28 +261,37 @@ function get_table_metadata(conn::ClickHouse.ClickHouseSock, table_name::String) return select(result, 1 => :name, 2 => :type, :current_selxn, :table_name) end -function db_table(db, table, athena_params::Any=nothing) +function db_table(db, table, athena_params::Any=nothing; delta::Bool=false) table_name = string(table) + + if delta + DuckDB.execute(db, "INSTALL iceberg;") + DuckDB.execute(db, "LOAD iceberg;") + formatted_table_name = "iceberg_scan('$table_name', allow_moved_paths = true)" + else + formatted_table_name = if current_sql_mode[] == :snowflake + "$(db.database).$(db.schema).$table_name" + elseif db isa DatabricksConnection + "$(db.database).$(db.schema).$table_name" + elseif occursin(r"[:/]", table_name) && !delta + "'$table_name'" + else + table_name + end + end + metadata = if current_sql_mode[] == :lite - get_table_metadata(db, table_name) + get_table_metadata(db, formatted_table_name) elseif current_sql_mode[] in [:postgres, :duckdb, :mysql, :mssql, :clickhouse, :gbq, :oracle] - get_table_metadata(db, table_name) + get_table_metadata(db, formatted_table_name) elseif current_sql_mode[] == :athena - get_table_metadata_athena(db, table_name, athena_params) + get_table_metadata_athena(db, formatted_table_name, athena_params) elseif current_sql_mode[] == :snowflake - get_table_metadata(db, table_name) + get_table_metadata(db, formatted_table_name) else error("Unsupported SQL mode: $(current_sql_mode[])") end - formatted_table_name = if current_sql_mode[] == :snowflake - "$(db.database).$(db.schema).$table_name" - elseif db isa DatabricksConnection - "$(db.database).$(db.schema).$table_name" - elseif occursin(r"[:/]", table_name) - "'$table_name'" - else - table_name - end + return SQLQuery(from=formatted_table_name, metadata=metadata, db=db, athena_params=athena_params) end From 475b8d6b2f7f15333f80f70bafea9682db988ecb Mon Sep 17 00:00:00 2001 From: drizk1 Date: Wed, 10 Jul 2024 06:01:12 -0400 Subject: [PATCH 2/7] fix iceberg arg name --- src/TidierDB.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 5bb04a4..2192162 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -261,10 +261,10 @@ function get_table_metadata(conn::ClickHouse.ClickHouseSock, table_name::String) return select(result, 1 => :name, 2 => :type, :current_selxn, :table_name) end -function db_table(db, table, athena_params::Any=nothing; delta::Bool=false) +function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false) table_name = string(table) - if delta + if iceberg DuckDB.execute(db, "INSTALL iceberg;") DuckDB.execute(db, "LOAD iceberg;") formatted_table_name = "iceberg_scan('$table_name', allow_moved_paths = true)" @@ -273,7 +273,7 @@ function db_table(db, table, athena_params::Any=nothing; delta::Bool=false) "$(db.database).$(db.schema).$table_name" elseif db isa DatabricksConnection "$(db.database).$(db.schema).$table_name" - elseif occursin(r"[:/]", table_name) && !delta + elseif occursin(r"[:/]", table_name) && iceberg "'$table_name'" else table_name From ddbe8191a90f798dc455970e676539658aaf6f0a Mon Sep 17 00:00:00 2001 From: drizk1 Date: Wed, 10 Jul 2024 06:19:31 -0400 Subject: [PATCH 3/7] fixes to db_table --- src/TidierDB.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 2192162..d9ede4f 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -176,15 +176,15 @@ end # DuckDB function get_table_metadata(conn::DuckDB.Connection, table_name::String) - query = if occursin("iceberg_scan", table_name) + query = #if occursin("iceberg_scan", table_name) """ DESCRIBE SELECT * FROM $(table_name) LIMIT 0 """ - else - """ - DESCRIBE SELECT * FROM '$(table_name)' LIMIT 0 - """ - end + # else + # """ + # DESCRIBE SELECT * FROM $(table_name) LIMIT 0 + # """ + # end result = DuckDB.execute(conn, query) |> DataFrame result[!, :current_selxn] .= 1 table_name = if occursin(r"[:/]", table_name) @@ -273,7 +273,7 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false) "$(db.database).$(db.schema).$table_name" elseif db isa DatabricksConnection "$(db.database).$(db.schema).$table_name" - elseif occursin(r"[:/]", table_name) && iceberg + elseif occursin(r"[:/]", table_name) && !iceberg "'$table_name'" else table_name From c8b1b2422f2d0c2896b36dd51dade19fa780c8fc Mon Sep 17 00:00:00 2001 From: drizk1 Date: Thu, 11 Jul 2024 19:17:39 -0400 Subject: [PATCH 4/7] formatting fix --- src/TidierDB.jl | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/TidierDB.jl b/src/TidierDB.jl index d9ede4f..b4e3acc 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -176,15 +176,10 @@ end # DuckDB function get_table_metadata(conn::DuckDB.Connection, table_name::String) - query = #if occursin("iceberg_scan", table_name) + query = """ DESCRIBE SELECT * FROM $(table_name) LIMIT 0 """ - # else - # """ - # DESCRIBE SELECT * FROM $(table_name) LIMIT 0 - # """ - # end result = DuckDB.execute(conn, query) |> DataFrame result[!, :current_selxn] .= 1 table_name = if occursin(r"[:/]", table_name) @@ -261,19 +256,26 @@ function get_table_metadata(conn::ClickHouse.ClickHouseSock, table_name::String) return select(result, 1 => :name, 2 => :type, :current_selxn, :table_name) end -function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false) +function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, delta::Bool=false) table_name = string(table) if iceberg DuckDB.execute(db, "INSTALL iceberg;") DuckDB.execute(db, "LOAD iceberg;") formatted_table_name = "iceberg_scan('$table_name', allow_moved_paths = true)" + println(formatted_table_name) + + elseif delta + # DuckDB.execute(db, "INSTALL delta;") + # DuckDB.execute(db, "LOAD delta;") + formatted_table_name = "delta_scan('$table_name')" + println(formatted_table_name) else formatted_table_name = if current_sql_mode[] == :snowflake "$(db.database).$(db.schema).$table_name" elseif db isa DatabricksConnection "$(db.database).$(db.schema).$table_name" - elseif occursin(r"[:/]", table_name) && !iceberg + elseif occursin(r"[:/]", table_name) && !(iceberg || delta) "'$table_name'" else table_name From df77380fca667713e8159cee09dfdfe7819b9ce3 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Thu, 11 Jul 2024 21:31:07 -0400 Subject: [PATCH 5/7] fixes db_table bug --- src/TidierDB.jl | 61 +++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/src/TidierDB.jl b/src/TidierDB.jl index b4e3acc..5715a6f 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -259,41 +259,48 @@ end function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, delta::Bool=false) table_name = string(table) - if iceberg - DuckDB.execute(db, "INSTALL iceberg;") - DuckDB.execute(db, "LOAD iceberg;") - formatted_table_name = "iceberg_scan('$table_name', allow_moved_paths = true)" - println(formatted_table_name) - - elseif delta - # DuckDB.execute(db, "INSTALL delta;") - # DuckDB.execute(db, "LOAD delta;") - formatted_table_name = "delta_scan('$table_name')" - println(formatted_table_name) - else - formatted_table_name = if current_sql_mode[] == :snowflake - "$(db.database).$(db.schema).$table_name" - elseif db isa DatabricksConnection - "$(db.database).$(db.schema).$table_name" - elseif occursin(r"[:/]", table_name) && !(iceberg || delta) - "'$table_name'" + if current_sql_mode[] == :lite + metadata = get_table_metadata(db, table_name) + elseif current_sql_mode[] == :postgres ||current_sql_mode[] == :duckdb || current_sql_mode[] == :mysql || current_sql_mode[] == :mssql || current_sql_mode[] == :clickhouse || current_sql_mode[] == :gbq ||current_sql_mode[] == :oracle + if iceberg + DuckDB.execute(db, "INSTALL iceberg;") + DuckDB.execute(db, "LOAD iceberg;") + table_name2 = "iceberg_scan('$table_name', allow_moved_paths = true)" + metadata = get_table_metadata(db, table_name2) + elseif delta + DuckDB.execute(db, "INSTALL delta;") + DuckDB.execute(db, "LOAD delta;") + table_name2 = "delta_scan('$table_name')" + # println(table_name2) + metadata = get_table_metadata(db, table_name2) + elseif occursin(r"[:/]", table_name) + table_name2 = "'$table_name'" + metadata = get_table_metadata(db, table_name2) else - table_name + metadata = get_table_metadata(db, table_name) end - end - - metadata = if current_sql_mode[] == :lite - get_table_metadata(db, formatted_table_name) - elseif current_sql_mode[] in [:postgres, :duckdb, :mysql, :mssql, :clickhouse, :gbq, :oracle] - get_table_metadata(db, formatted_table_name) elseif current_sql_mode[] == :athena - get_table_metadata_athena(db, formatted_table_name, athena_params) + metadata = get_table_metadata_athena(db, table_name, athena_params) elseif current_sql_mode[] == :snowflake - get_table_metadata(db, formatted_table_name) + metadata = get_table_metadata(db, table_name) else error("Unsupported SQL mode: $(current_sql_mode[])") end + formatted_table_name = if current_sql_mode[] == :snowflake + "$(db.database).$(db.schema).$table_name" + elseif db isa DatabricksConnection + "$(db.database).$(db.schema).$table_name" + elseif iceberg + "iceberg_scan('$table_name', allow_moved_paths = true)" + elseif delta + "delta_scan('$table_name')" + elseif occursin(r"[:/]", table_name) && !(iceberg || delta) + "'$table_name'" + else + table_name + end + return SQLQuery(from=formatted_table_name, metadata=metadata, db=db, athena_params=athena_params) end From b14422a8f3a3ffd43f57e0c01aabb4b01217add4 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Fri, 12 Jul 2024 13:06:41 -0400 Subject: [PATCH 6/7] bump version, add `db_table` docstring --- NEWS.md | 7 ++++--- Project.toml | 2 +- README.md | 2 ++ docs/src/index.md | 2 ++ src/TidierDB.jl | 3 +++ src/docstrings.jl | 42 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 54 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 172dedd..a26d737 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,8 +1,9 @@ # TidierDB.jl updates -## v0.2.4 - 2024-07-10 -- Changes DuckDB compat to 1.0 -- Adds support for `iceberg` tables via DuckDB to allow iceberg paths in `db_table` when `iceberg = true` +## v0.2.4 - 2024-07-12 +- Switches to DuckDB to 1.0 version +- Adds support for `iceberg` tables via DuckDB to read iceberg paths in `db_table` when `iceberg = true` +- Adds support for DuckDB's beta `delta_scan` to read delta paths in `db_table` when `delta = true` ## v0.2.3 - 2024-07-07 - Adds direct path support for `db_table` when using DuckDB diff --git a/Project.toml b/Project.toml index aa130e5..aad87c1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierDB" uuid = "86993f9b-bbba-4084-97c5-ee15961ad48b" authors = ["Daniel Rizk and contributors"] -version = "0.2.3" +version = "0.2.4" [deps] AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" diff --git a/README.md b/README.md index b3c9831..4af5f98 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,8 @@ Supported aggregate functions (as supported by the backend) with more to come - `@summarize` supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will work. - `copy_to` (for DuckDB, MySQL, SQLite) +With DuckDB, `db_table` supports direct paths for S3 bucket locations, iceberg tables, delta table, in addition to csv, parquet, etc. + DuckDB specifically enables copy_to to directly reading in `.parquet`, `.json`, `.csv`, and `.arrow` file, including https file paths. ```julia diff --git a/docs/src/index.md b/docs/src/index.md index 4bd5425..2aab814 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -66,6 +66,8 @@ Supported aggregate functions (as supported by the backend) with more to come - `@summarize` supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will work - `copy_to` (for DuckDB, MySQL, SQLite) +With DuckDB, `db_table` supports direct paths for S3 bucket locations, iceberg tables, delta table, in addition to csv, parquet, etc. + DuckDB specifically enables copy_to to directly reading in `.parquet`, `.json`, `.csv`, and `.arrow` file, including https file paths. ```julia diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 5715a6f..243b0dd 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -256,6 +256,9 @@ function get_table_metadata(conn::ClickHouse.ClickHouseSock, table_name::String) return select(result, 1 => :name, 2 => :type, :current_selxn, :table_name) end +""" +$docstring_db_table +""" function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, delta::Bool=false) table_name = string(table) diff --git a/src/docstrings.jl b/src/docstrings.jl index 1191594..15d522a 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -1041,4 +1041,46 @@ julia> @chain db_table(db, "df_mem") begin ─────┼─────────────────────────── 1 │ AA 1 0.1 ``` +""" + +const docstring_db_table = +""" + db_table(database, table_name, athena_params, delta = false, iceberg = false) + +`db_table` starts the underlying SQL query struct, adding the metadata and table. + +#arguments +`database`: The Database or connection object +`table_name`: tablename as a string. Table name can be a name of a table on the database or paths to the following types + -CSV + -Parquet + -Json + -Iceberg + -Delta + -S3 tables from AWS or Google Cloud +`delta`: must be true to read delta +`iceberg`: must be true to read iceberg + +# Example +```julia + +julia> df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], + groups = [i % 2 == 0 ? "aa" : "bb" for i in 1:10], + value = repeat(1:5, 2), + percent = 0.1:0.1:1.0); + +julia> db = connect(:duckdb); + +julia> copy_to(db, df, "df_mem"); + +julia> db_table(db, "df_mem") +TidierDB.SQLQuery("", "df_mem", "", "", "", "", "", "", false, false, 4×4 DataFrame + Row │ name type current_selxn table_name + │ String? String? Int64 String +─────┼───────────────────────────────────────────── + 1 │ id VARCHAR 1 df_mem + 2 │ groups VARCHAR 1 df_mem + 3 │ value BIGINT 1 df_mem + 4 │ percent DOUBLE 1 df_mem, false, DuckDB.Connection(":memory:"), TidierDB.CTE[], 0, nothing) +``` """ \ No newline at end of file From 80ac9af5eac2838d2ab2ec30a04ff3cbb7bdf593 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Fri, 12 Jul 2024 13:16:59 -0400 Subject: [PATCH 7/7] updates to duckdb connxn type --- src/TBD_macros.jl | 2 +- src/TidierDB.jl | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/TBD_macros.jl b/src/TBD_macros.jl index 1f1d790..7099da0 100644 --- a/src/TBD_macros.jl +++ b/src/TBD_macros.jl @@ -664,7 +664,7 @@ macro collect(sqlquery) # Determine the type of db and execute the query accordingly if db isa DatabricksConnection df_result = execute_databricks(db, final_query) - elseif db isa SQLite.DB || db isa LibPQ.Connection || db isa DuckDB.Connection || db isa MySQL.Connection || db isa ODBC.Connection + elseif db isa SQLite.DB || db isa LibPQ.Connection || db isa DuckDB.DB || db isa MySQL.Connection || db isa ODBC.Connection result = DBInterface.execute(db, final_query) df_result = DataFrame(result) elseif current_sql_mode[] == :clickhouse diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 243b0dd..b7c9e11 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -175,7 +175,7 @@ end # DuckDB -function get_table_metadata(conn::DuckDB.Connection, table_name::String) +function get_table_metadata(conn::DuckDB.DB, table_name::String) query = """ DESCRIBE SELECT * FROM $(table_name) LIMIT 0 @@ -266,8 +266,8 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, de metadata = get_table_metadata(db, table_name) elseif current_sql_mode[] == :postgres ||current_sql_mode[] == :duckdb || current_sql_mode[] == :mysql || current_sql_mode[] == :mssql || current_sql_mode[] == :clickhouse || current_sql_mode[] == :gbq ||current_sql_mode[] == :oracle if iceberg - DuckDB.execute(db, "INSTALL iceberg;") - DuckDB.execute(db, "LOAD iceberg;") + DBInterface.execute(db, "INSTALL iceberg;") + DBInterface.execute(db, "LOAD iceberg;") table_name2 = "iceberg_scan('$table_name', allow_moved_paths = true)" metadata = get_table_metadata(db, table_name2) elseif delta @@ -397,9 +397,8 @@ function connect(backend::Symbol; kwargs...) set_sql_mode(:lite) return SQLite.DB(db_path) elseif backend == :DuckDB || backend == :duckdb - mem = DuckDB.open(":memory:") set_sql_mode(:duckdb) - db = DuckDB.connect(mem) + db = DBInterface.connect(DuckDB.DB, ":memory:") DBInterface.execute(db, "SET autoinstall_known_extensions=1;") DBInterface.execute(db, "SET autoload_known_extensions=1;")