feat(ingest): add async batch mode to the rest sink (#10733)

datahub-project · Jun 25, 2024 · 724907b
1 parent 0dc0bc5
commit 724907b
Show file tree

Hide file tree

Showing 9 changed files with 766 additions and 378 deletions.
diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py
@@ -214,15 +214,15 @@ def _post_generic(self, url: str, payload_dict: Dict) -> Dict:
     def _make_rest_sink_config(self) -> "DatahubRestSinkConfig":
         from datahub.ingestion.sink.datahub_rest import (
             DatahubRestSinkConfig,
-            SyncOrAsync,
+            RestSinkMode,
         )
 
         # This is a bit convoluted - this DataHubGraph class is a subclass of DatahubRestEmitter,
         # but initializing the rest sink creates another rest emitter.
         # TODO: We should refactor out the multithreading functionality of the sink
         # into a separate class that can be used by both the sink and the graph client
         # e.g. a DatahubBulkRestEmitter that both the sink and the graph client use.
-        return DatahubRestSinkConfig(**self.config.dict(), mode=SyncOrAsync.ASYNC)
+        return DatahubRestSinkConfig(**self.config.dict(), mode=RestSinkMode.ASYNC)
 
     @contextlib.contextmanager
     def make_rest_sink(
@@ -253,14 +253,10 @@ def emit_all(
     ) -> None:
         """Emit all items in the iterable using multiple threads."""
 
+        # The context manager also ensures that we raise an error if a failure occurs.
         with self.make_rest_sink(run_id=run_id) as sink:
             for item in items:
                 sink.emit_async(item)
-        if sink.report.failures:
-            raise OperationalError(
-                f"Failed to emit {len(sink.report.failures)} records",
-                info=sink.report.as_obj(),
-            )
 
     def get_aspect(
         self,

diff --git a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py
@@ -7,7 +7,7 @@
 import threading
 import uuid
 from enum import auto
-from typing import Optional, Union
+from typing import List, Optional, Tuple, Union
 
 from datahub.cli.cli_utils import set_env_variables_override_config
 from datahub.configuration.common import (
@@ -16,6 +16,7 @@
     OperationalError,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.emitter.rest_emitter import DataHubRestEmitter
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
@@ -30,7 +31,10 @@
     MetadataChangeEvent,
     MetadataChangeProposal,
 )
-from datahub.utilities.advanced_thread_executor import PartitionExecutor
+from datahub.utilities.partition_executor import (
+    BatchPartitionExecutor,
+    PartitionExecutor,
+)
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.server_config_util import set_gms_config
 
@@ -41,18 +45,26 @@
 )
 
 
-class SyncOrAsync(ConfigEnum):
+class RestSinkMode(ConfigEnum):
     SYNC = auto()
     ASYNC = auto()
 
+    # Uses the new ingestProposalBatch endpoint. Significantly more efficient than the other modes,
+    # but requires a server version that supports it.
+    # https://github.com/datahub-project/datahub/pull/10706
+    ASYNC_BATCH = auto()
+
 
 class DatahubRestSinkConfig(DatahubClientConfig):
-    mode: SyncOrAsync = SyncOrAsync.ASYNC
+    mode: RestSinkMode = RestSinkMode.ASYNC
 
-    # These only apply in async mode.
+    # These only apply in async modes.
     max_threads: int = DEFAULT_REST_SINK_MAX_THREADS
     max_pending_requests: int = 2000
 
+    # Only applies in async batch mode.
+    max_per_batch: int = 100
+
 
 @dataclasses.dataclass
 class DataHubRestSinkReport(SinkReport):
@@ -111,10 +123,20 @@ def __post_init__(self) -> None:
         set_env_variables_override_config(self.config.server, self.config.token)
         logger.debug("Setting gms config")
         set_gms_config(gms_config)
-        self.executor = PartitionExecutor(
-            max_workers=self.config.max_threads,
-            max_pending=self.config.max_pending_requests,
-        )
+
+        self.executor: Union[PartitionExecutor, BatchPartitionExecutor]
+        if self.config.mode == RestSinkMode.ASYNC_BATCH:
+            self.executor = BatchPartitionExecutor(
+                max_workers=self.config.max_threads,
+                max_pending=self.config.max_pending_requests,
+                process_batch=self._emit_batch_wrapper,
+                max_per_batch=self.config.max_per_batch,
+            )
+        else:
+            self.executor = PartitionExecutor(
+                max_workers=self.config.max_threads,
+                max_pending=self.config.max_pending_requests,
+            )
 
     @classmethod
     def _make_emitter(cls, config: DatahubRestSinkConfig) -> DataHubRestEmitter:
@@ -189,6 +211,7 @@ def _write_done_callback(
                     self.report.report_warning({"warning": e.message, "info": e.info})
                 write_callback.on_failure(record_envelope, e, e.info)
             else:
+                logger.exception(f"Failure: {e}", exc_info=e)
                 self.report.report_failure({"e": e})
                 write_callback.on_failure(record_envelope, Exception(e), {})
 
@@ -203,6 +226,30 @@ def _emit_wrapper(
         # TODO: Add timing metrics
         self.emitter.emit(record)
 
+    def _emit_batch_wrapper(
+        self,
+        records: List[
+            Tuple[
+                Union[
+                    MetadataChangeEvent,
+                    MetadataChangeProposal,
+                    MetadataChangeProposalWrapper,
+                ],
+            ]
+        ],
+    ) -> None:
+        events: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]] = []
+        for record in records:
+            event = record[0]
+            if isinstance(event, MetadataChangeEvent):
+                # Unpack MCEs into MCPs.
+                mcps = mcps_from_mce(event)
+                events.extend(mcps)
+            else:
+                events.append(event)
+
+        self.emitter.emit_mcps(events)
+
     def write_record_async(
         self,
         record_envelope: RecordEnvelope[
@@ -218,7 +265,8 @@ def write_record_async(
         # should only have a high value if the sink is actually a bottleneck.
         with self.report.main_thread_blocking_timer:
             record = record_envelope.record
-            if self.config.mode == SyncOrAsync.ASYNC:
+            if self.config.mode == RestSinkMode.ASYNC:
+                assert isinstance(self.executor, PartitionExecutor)
                 partition_key = _get_partition_key(record_envelope)
                 self.executor.submit(
                     partition_key,
@@ -229,6 +277,17 @@ def write_record_async(
                     ),
                 )
                 self.report.pending_requests += 1
+            elif self.config.mode == RestSinkMode.ASYNC_BATCH:
+                assert isinstance(self.executor, BatchPartitionExecutor)
+                partition_key = _get_partition_key(record_envelope)
+                self.executor.submit(
+                    partition_key,
+                    record,
+                    done_callback=functools.partial(
+                        self._write_done_callback, record_envelope, write_callback
+                    ),
+                )
+                self.report.pending_requests += 1
             else:
                 # execute synchronously
                 try:
@@ -249,7 +308,8 @@ def emit_async(
         )
 
     def close(self):
-        self.executor.shutdown()
+        with self.report.main_thread_blocking_timer:
+            self.executor.shutdown()
 
     def __repr__(self) -> str:
         return self.emitter.__repr__()

diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py
@@ -102,7 +102,7 @@
     OwnershipTypeClass,
     SubTypesClass,
 )
-from datahub.utilities.advanced_thread_executor import BackpressureAwareExecutor
+from datahub.utilities.backpressure_aware_executor import BackpressureAwareExecutor
 
 logger = logging.getLogger(__name__)