Skip to content
This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

Commit

Permalink
Log PVF retries (#6504)
Browse files Browse the repository at this point in the history
  • Loading branch information
mrcnski committed Jan 4, 2023
1 parent b4d8c76 commit 52e8606
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 5 deletions.
18 changes: 14 additions & 4 deletions node/core/candidate-validation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -604,13 +604,16 @@ async fn validate_candidate_exhaustive(

#[async_trait]
trait ValidationBackend {
/// Tries executing a PVF a single time (no retries).
async fn validate_candidate(
&mut self,
pvf: Pvf,
timeout: Duration,
encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError>;

/// Tries executing a PVF. Will retry once if an error is encountered that may have been
/// transient.
async fn validate_candidate_with_retry(
&mut self,
raw_validation_code: Vec<u8>,
Expand All @@ -620,7 +623,7 @@ trait ValidationBackend {
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
let pvf = Pvf::from_code(raw_validation_code);

let validation_result =
let mut validation_result =
self.validate_candidate(pvf.clone(), timeout, params.encode()).await;

// If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the
Expand All @@ -630,12 +633,19 @@ trait ValidationBackend {
{
// Wait a brief delay before retrying.
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;

gum::debug!(
target: LOG_TARGET,
?pvf,
"Re-trying failed candidate validation due to AmbiguousWorkerDeath."
);

// Encode the params again when re-trying. We expect the retry case to be relatively
// rare, and we want to avoid unconditionally cloning data.
self.validate_candidate(pvf, timeout, params.encode()).await
} else {
validation_result
validation_result = self.validate_candidate(pvf, timeout, params.encode()).await;
}

validation_result
}

async fn precheck_pvf(&mut self, pvf: Pvf) -> Result<Duration, PrepareError>;
Expand Down
22 changes: 21 additions & 1 deletion node/core/pvf/src/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,16 @@ async fn handle_execute_pvf(
},
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
gum::debug!(
target: LOG_TARGET,
?pvf,
?artifact_id,
?last_time_failed,
%num_failures,
%error,
"handle_execute_pvf: Re-trying failed PVF preparation."
);

// If we are allowed to retry the failed prepare job, change the state to
// Preparing and re-queue this job.
*state = ArtifactState::Preparing {
Expand Down Expand Up @@ -585,6 +595,16 @@ async fn handle_heads_up(
},
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
gum::debug!(
target: LOG_TARGET,
?active_pvf,
?artifact_id,
?last_time_failed,
%num_failures,
%error,
"handle_heads_up: Re-trying failed PVF preparation."
);

// If we are allowed to retry the failed prepare job, change the state to
// Preparing and re-queue this job.
*state = ArtifactState::Preparing {
Expand Down Expand Up @@ -1393,7 +1413,7 @@ mod tests {
}

// Test that multiple execution requests don't trigger preparation retries if the first one
// failed due to reproducible error (e.g. Prevalidation).
// failed due to a reproducible error (e.g. Prevalidation).
#[async_std::test]
async fn test_execute_prepare_no_retry() {
let mut test = Builder::default().build();
Expand Down

0 comments on commit 52e8606

Please sign in to comment.