diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index d39bd93..1df464c 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -6,7 +6,7 @@
 import threading
 import time
 from collections import Counter, defaultdict
-from concurrent.futures import ProcessPoolExecutor, as_completed
+from concurrent.futures import ProcessPoolExecutor, as_completed, wait, FIRST_COMPLETED
 from datetime import datetime
 from typing import Any, Dict, List, Tuple
 from warnings import warn
@@ -204,14 +204,13 @@ def evaluate(flags):
             assert len(completion_id) == len(problems), "Missing problems in samples"
 
             def stucking_checker():
-                while remainings:
-                    last_size = len(remainings)
-                    time.sleep(240)
-                    if last_size != len(remainings) or len(remainings) == 0:
-                        continue
-                    # Potential stucking
-                    warn("No samples had finished testing in the last 240s")
-                    warn(f"{len(remainings)} samples to be tested: {remainings}")
+                """Warn whenever 240s elapse without any pending future finishing."""
+                pending = futures
+                while pending:
+                    finished, pending = wait(pending, timeout=240, return_when=FIRST_COMPLETED)
+                    if not finished:
+                        warn("No samples have finished testing in the last 240s")
+                        warn(f"{len(remainings)} samples to be tested: {remainings}")
 
             threading.Thread(target=stucking_checker).start()