From 063384136903ada725df68345b9f3ee89bff55fb Mon Sep 17 00:00:00 2001 From: Dan Hansen Date: Fri, 21 Jul 2023 09:04:17 -0700 Subject: [PATCH] [GCP] [BigQuery] Handle `totalBytesProcessed` `NoneType` (#27474) * [GCP] [BigQuery] Handle totalBytesProcessed NoneType * Update CHANGES.md * lint / whitespace --------- Co-authored-by: Yi Hu --- CHANGES.md | 2 ++ sdks/python/apache_beam/io/gcp/bigquery.py | 26 ++++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index dff482b6b7465..ec1c112ff4df4 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -79,6 +79,8 @@ ## Bugfixes +* Fixed DirectRunner bug in Python SDK where GroupByKey gets empty PCollection and fails when pipeline option `direct_num_workers!=1`. ([#27373](https://github.com/apache/beam/pull/27373)) +* Fixed BigQuery I/O bug when estimating size on queries that utilize row-level security ([#27474](https://github.com/apache/beam/pull/27474)) * Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). ## Known Issues diff --git a/sdks/python/apache_beam/io/gcp/bigquery.py b/sdks/python/apache_beam/io/gcp/bigquery.py index 3fc7bfc3b0208..5c1ca4a7d6e75 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery.py +++ b/sdks/python/apache_beam/io/gcp/bigquery.py @@ -751,8 +751,17 @@ def estimate_size(self): kms_key=self.kms_key, job_labels=self._get_bq_metadata().add_additional_bq_job_labels( self.bigquery_job_labels)) - size = int(job.statistics.totalBytesProcessed) - return size + + if job.statistics.totalBytesProcessed is None: + # Some queries may not have access to `totalBytesProcessed` as a + # result of row-level security. + # > BigQuery hides sensitive statistics on all queries against + # > tables with row-level security. 
+ # See cloud.google.com/bigquery/docs/managing-row-level-security + # and cloud.google.com/bigquery/docs/best-practices-row-level-security + return None + + return int(job.statistics.totalBytesProcessed) else: # Size estimation is best effort. We return None as we have # no access to the query that we're running. @@ -1104,8 +1113,17 @@ def estimate_size(self): kms_key=self.kms_key, job_labels=self._get_bq_metadata().add_additional_bq_job_labels( self.bigquery_job_labels)) - size = int(job.statistics.totalBytesProcessed) - return size + + if job.statistics.totalBytesProcessed is None: + # Some queries may not have access to `totalBytesProcessed` as a + # result of row-level security. + # > BigQuery hides sensitive statistics on all queries against + # > tables with row-level security. + # See cloud.google.com/bigquery/docs/managing-row-level-security + # and cloud.google.com/bigquery/docs/best-practices-row-level-security + return None + + return int(job.statistics.totalBytesProcessed) else: # Size estimation is best effort. We return None as we have # no access to the query that we're running.