Enhancement/enable serverless (#303)
* Experiment with Dataproc Serverless
* add serverless as another submission method
* add changelog and run tests against target core branch
* fix syntax
* fix schema overwrite
* use 0.21 version of connector

Co-authored-by: Jeremy Cohen <jeremy@dbtlabs.com>
1 parent: 53f8b90 · commit: 84c20fc
Showing 7 changed files with 187 additions and 110 deletions.
New changelog entry (7 lines added):

```yaml
kind: Features
body: Add support for Dataproc Serverless
time: 2022-09-09T12:29:24.993388-07:00
custom:
  Author: ChenyuLInx
  Issue: "248"
  PR: "303"
```
New Python helper module (152 lines added):

```python
from typing import Dict, Union

from dbt.adapters.base import PythonJobHelper
from dbt.adapters.bigquery import BigQueryConnectionManager, BigQueryCredentials
from google.api_core import retry
from google.api_core.client_options import ClientOptions

try:
    # library only needed for python models
    from google.cloud import storage, dataproc_v1  # type: ignore
except ImportError:
    _has_dataproc_lib = False
else:
    _has_dataproc_lib = True
```
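The `try`/`except ImportError` guard keeps the Google Cloud Storage and Dataproc client libraries optional, so users who never run Python models do not need them installed. Judging by the error message in the class below, the supported install path is an extra along the lines of `pip install "dbt-bigquery[dataproc]"` (an assumption; the packaging change is not part of this diff).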
```python
class BaseDataProcHelper(PythonJobHelper):
    def __init__(self, parsed_model: Dict, credential: BigQueryCredentials) -> None:
        """Validate the required config and set up GCS and Dataproc clients.

        Args:
            parsed_model: the parsed Python model, including its config dict
            credential: the BigQuery credentials from the active profile
        """
        if not _has_dataproc_lib:
            raise RuntimeError(
                "You need to install [dataproc] extras to run python model in dbt-bigquery"
            )
        # validate that all additional config required for python models is set
        schema = parsed_model["schema"]
        identifier = parsed_model["alias"]
        self.parsed_model = parsed_model
        python_required_configs = [
            "dataproc_region",
            "gcs_bucket",
        ]
        for required_config in python_required_configs:
            if not getattr(credential, required_config):
                raise ValueError(
                    f"Need to supply {required_config} in profile to submit python job"
                )
        self.model_file_name = f"{schema}/{identifier}.py"
        self.credential = credential
        self.GoogleCredentials = BigQueryConnectionManager.get_credentials(credential)
        self.storage_client = storage.Client(
            project=self.credential.database, credentials=self.GoogleCredentials
        )
        self.gcs_location = "gs://{}/{}".format(self.credential.gcs_bucket, self.model_file_name)

        # set retry policy; default to timing out after 24 hours
        self.timeout = self.parsed_model["config"].get(
            "timeout", self.credential.job_execution_timeout_seconds or 60 * 60 * 24
        )
        self.retry = retry.Retry(maximum=10.0, deadline=self.timeout)
        self.client_options = ClientOptions(
            api_endpoint="{}-dataproc.googleapis.com:443".format(self.credential.dataproc_region)
        )
        self.job_client = self._get_job_client()

    def _upload_to_gcs(self, filename: str, compiled_code: str) -> None:
        # write the compiled model code into the configured GCS bucket
        bucket = self.storage_client.get_bucket(self.credential.gcs_bucket)
        blob = bucket.blob(filename)
        blob.upload_from_string(compiled_code)

    def submit(self, compiled_code: str) -> dataproc_v1.types.jobs.Job:
        # upload python file to GCS
        self._upload_to_gcs(self.model_file_name, compiled_code)
        # submit dataproc job
        return self._submit_dataproc_job()

    def _get_job_client(
        self,
    ) -> Union[dataproc_v1.JobControllerClient, dataproc_v1.BatchControllerClient]:
        raise NotImplementedError("_get_job_client not implemented")

    def _submit_dataproc_job(self) -> dataproc_v1.types.jobs.Job:
        raise NotImplementedError("_submit_dataproc_job not implemented")
```
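`BaseDataProcHelper` is a template: each subclass supplies the job client and the actual submission call, while `submit` handles the shared upload-then-run flow. A minimal sketch of how a caller might drive it (the driver itself is hypothetical; presumably dbt-core constructs the helper that matches the model's submission method and passes in the compiled source):

```python
# Hypothetical driver, not part of this diff: parsed_model, credentials, and
# compiled_code would be supplied by dbt-core at runtime.
helper = ClusterDataprocHelper(parsed_model, credentials)  # or ServerlessDataProcHelper
result = helper.submit(compiled_code)  # uploads to GCS, then blocks until Dataproc finishes
```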
```python
class ClusterDataprocHelper(BaseDataProcHelper):
    def _get_job_client(self) -> dataproc_v1.JobControllerClient:
        if not self._get_cluster_name():
            raise ValueError(
                "Need to supply dataproc_cluster_name in profile or config to submit python job with cluster submission method"
            )
        return dataproc_v1.JobControllerClient(  # type: ignore
            client_options=self.client_options, credentials=self.GoogleCredentials
        )

    def _get_cluster_name(self) -> str:
        # the model config takes precedence over the profile setting
        return self.parsed_model["config"].get(
            "dataproc_cluster_name", self.credential.dataproc_cluster_name
        )

    def _submit_dataproc_job(self) -> dataproc_v1.types.jobs.Job:
        job = {
            "placement": {"cluster_name": self._get_cluster_name()},
            "pyspark_job": {
                "main_python_file_uri": self.gcs_location,
            },
        }
        operation = self.job_client.submit_job_as_operation(  # type: ignore
            request={
                "project_id": self.credential.database,
                "region": self.credential.dataproc_region,
                "job": job,
            }
        )
        response = operation.result(retry=self.retry)
        return response
```
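On the model side, the cluster path reads `dataproc_cluster_name` from the model config before falling back to the profile. A sketch of a dbt Python model opting into it (the `submission_method` config key and the cluster name here are assumptions based on this commit's description, not shown in this diff):

```python
# models/my_cluster_model.py: hypothetical dbt Python model
def model(dbt, session):
    dbt.config(
        submission_method="cluster",         # assumed config key for choosing the helper
        dataproc_cluster_name="my-cluster",  # hypothetical cluster; overrides the profile value
    )
    # `session` is the Spark session on the Dataproc cluster
    return session.createDataFrame([(1, "a")], ["id", "value"])
```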
```python
class ServerlessDataProcHelper(BaseDataProcHelper):
    def _get_job_client(self) -> dataproc_v1.BatchControllerClient:
        return dataproc_v1.BatchControllerClient(
            client_options=self.client_options, credentials=self.GoogleCredentials
        )

    def _submit_dataproc_job(self) -> dataproc_v1.types.jobs.Job:
        # create the Dataproc Serverless job config
        batch = dataproc_v1.Batch()
        batch.pyspark_batch.main_python_file_uri = self.gcs_location
        # how to keep this up to date?
        # we should probably also open this up to be configurable
        batch.pyspark_batch.jar_file_uris = [
            "gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.21.1.jar"
        ]
        # should we make all of these spark/dataproc properties configurable?
        # https://cloud.google.com/dataproc-serverless/docs/concepts/properties
        # https://cloud.google.com/dataproc-serverless/docs/reference/rest/v1/projects.locations.batches#runtimeconfig
        batch.runtime_config.properties = {
            "spark.executor.instances": "2",
        }
        parent = f"projects/{self.credential.database}/locations/{self.credential.dataproc_region}"
        request = dataproc_v1.CreateBatchRequest(
            parent=parent,
            batch=batch,
        )
        # make the request
        operation = self.job_client.create_batch(request=request)  # type: ignore
        # this takes quite a while, waiting on GCP response to resolve
        response = operation.result(retry=self.retry)
        return response
        # there might be useful results here that we can parse and return
        # Dataproc job output is saved to the Cloud Storage bucket
        # allocated to the job. Use regex to obtain the bucket and blob info.
        # matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)
        # output = (
        #     self.storage_client
        #     .get_bucket(matches.group(1))
        #     .blob(f"{matches.group(2)}.000000000")
        #     .download_as_string()
        # )
```
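The serverless counterpart needs no cluster at all, only the `gcs_bucket` and `dataproc_region` validated in the base class. Again a hedged sketch (the `submission_method` value mirrors the new helper's name; the key itself is not shown in this diff):

```python
# models/my_serverless_model.py: hypothetical dbt Python model using the
# new Dataproc Serverless submission method added by this commit
def model(dbt, session):
    dbt.config(submission_method="serverless")  # assumed config key/value
    df = dbt.ref("upstream_model")              # upstream table as a Spark DataFrame
    return df.where("id is not null")
```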