feat: update working sheet in batches of 25 specs (#170)
* feat: update working sheet in batches of 25 specs

Retry in the face of errors talking to Sheets.

* fix: python 3.8
sparkiegeek committed Feb 1, 2024
1 parent 01623e8 commit 6f0436f
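
In outline, the commit pairs two pieces: chunking the generated rows into fixed-size batches, and retrying each batch append via tenacity. A minimal standalone sketch of that pattern follows; the flaky `append_rows` target is a hypothetical stand-in for the real Sheets call, not part of this commit:

    import random
    import tenacity

    try:
        from itertools import batched  # only available on Python 3.12+
    except ImportError:
        from itertools import islice

        def batched(iterable, n):
            # Backport: yield n-sized tuples; the last may be shorter.
            it = iter(iterable)
            while batch := tuple(islice(it, n)):  # walrus needs Python 3.8+
                yield batch

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(3),
        wait=tenacity.wait_incrementing(start=0.5, increment=0.8),
    )
    def append_rows(rows):
        # Hypothetical stand-in for the Sheets append; fails transiently.
        if random.random() < 0.3:
            raise ConnectionError("transient Sheets error")
        print(f"appended {len(rows)} rows")

    for chunk in batched(range(60), 25):
        append_rows(chunk)  # three chunks: 25, 25 and 10 rows

The "fix: python 3.8" follow-up fits here too: itertools.batched was only added in Python 3.12 (hence the ImportError fallback), and typing.List/typing.Dict are needed in place of the built-in generics like list[dict] that require Python 3.9+.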
Showing 2 changed files with 77 additions and 51 deletions.
requirements.txt: 3 changes (2 additions & 1 deletion)

@@ -7,4 +7,5 @@ python-dateutil==2.8.2
 lxml==4.9.2
 cachetools==5.3.0
 black==23.1.0
-flake8==6.0.0
+flake8==6.0.0
+tenacity==8.2.3
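
The new tenacity pin supplies the retry decorator used in webapp/update.py below. With stop_after_attempt(3) and wait_incrementing(start=0.5, increment=0.8), a failing call sleeps 0.5s before the second attempt and 1.3s before the third; if that also fails, tenacity raises RetryError by default (assuming tenacity 8.x semantics).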
webapp/update.py: 125 changes (75 additions & 50 deletions)

@@ -1,3 +1,7 @@
+from typing import List, Dict
+
+import tenacity
+
 from webapp.google import Drive, Sheets
 from webapp.spec import Spec
 from webapp.settings import (
@@ -7,6 +11,64 @@
     TMP_SHEET_TITLE,
 )
 
+try:
+    from itertools import batched
+except ImportError:
+    from itertools import islice
+
+    def batched(iterable, n):
+        # batched('ABCDEFG', 3) --> ABC DEF G
+        if n < 1:
+            raise ValueError("n must be at least one")
+        it = iter(iterable)
+        while batch := tuple(islice(it, n)):
+            yield batch
+
+
+def _generate_spec_rows_for_folders(drive: Drive, folders: List[Dict]):
+    for folder in folders:
+        query_doc_files = (
+            f"mimeType = 'application/vnd.google-apps.document' "
+            f"and '{folder['id']}' in parents"
+        )
+        files = drive.get_files(
+            query=query_doc_files,
+            fields=(
+                "id",
+                "name",
+                "createdTime",
+                "modifiedTime",
+                "webViewLink",
+            ),
+        )
+        for file_ in files:
+            try:
+                comments = drive.get_comments(
+                    file_id=file_["id"], fields=("resolved",)
+                )
+                open_comments = [c for c in comments if not c["resolved"]]
+                parsed_doc = Spec(google_drive=drive, document_id=file_["id"])
+            except Exception as e:
+                print(f"Unable to parse document: {file_['name']}", e)
+                continue
+
+            row = [
+                folder["name"],
+                file_["name"],
+                file_["id"],
+                file_["webViewLink"],
+                parsed_doc.metadata["index"],
+                parsed_doc.metadata["title"],
+                parsed_doc.metadata["status"],
+                ", ".join(parsed_doc.metadata["authors"]),
+                parsed_doc.metadata["type"],
+                file_["createdTime"],
+                file_["modifiedTime"],
+                len(comments),
+                len(open_comments),
+            ]
+            yield row
+
 
 def update_sheet() -> None:
     """
@@ -19,10 +81,17 @@ def update_sheet() -> None:
     specs_sheet = sheets.get_sheet_by_title(SPECS_SHEET_TITLE)
     tmp_sheet = sheets.ensure_sheet_by_title(TMP_SHEET_TITLE)
 
-    sheets.clear(sheet_id=tmp_sheet["properties"]["sheetId"])
+    @tenacity.retry(
+        stop=tenacity.stop_after_attempt(3),
+        wait=tenacity.wait_incrementing(start=0.5, increment=0.8),
+    )
+    def _append_rows(rows):
+        """Helper to retry extending the TMP_SHEET."""
+        return sheets.insert_rows(rows, range=TMP_SHEET_TITLE)
 
+    sheets.clear(sheet_id=tmp_sheet["properties"]["sheetId"])
     # Add headers
-    sheets.insert_rows(
+    _append_rows(
         rows=[
             [
                 "Folder name",
@@ -39,8 +108,7 @@ def update_sheet() -> None:
                 "Number of comments",
                 "Number of open comments",
             ]
-        ],
-        range=TMP_SHEET_TITLE,
+        ]
     )
 
     query_subfolders = (
@@ -49,52 +117,9 @@
     )
     folders = drive.get_files(query=query_subfolders, fields=("id", "name"))
 
-    for folder in folders:
-        query_doc_files = (
-            f"mimeType = 'application/vnd.google-apps.document' "
-            f"and '{folder['id']}' in parents"
-        )
-        files = drive.get_files(
-            query=query_doc_files,
-            fields=(
-                "id",
-                "name",
-                "createdTime",
-                "modifiedTime",
-                "webViewLink",
-            ),
-        )
-        for file in files:
-            try:
-                comments = drive.get_comments(
-                    file_id=file["id"], fields=("resolved",)
-                )
-                open_comments = [c for c in comments if not c["resolved"]]
-
-                parsed_doc = Spec(google_drive=drive, document_id=file["id"])
-            except Exception as e:
-                print(f"Unable to parse document: {file['name']}", e)
-                continue
-
-            row = [
-                folder["name"],
-                file["name"],
-                file["id"],
-                file["webViewLink"],
-                parsed_doc.metadata.get("index"),
-                parsed_doc.metadata.get("title"),
-                parsed_doc.metadata.get("status"),
-                ", ".join(parsed_doc.metadata.get("authors")),
-                parsed_doc.metadata.get("type"),
-                file["createdTime"],
-                file["modifiedTime"],
-                len(comments),
-                len(open_comments),
-            ]
-            sheets.insert_rows(
-                rows=[row],
-                range=TMP_SHEET_TITLE,
-            )
+    # Insert rows in batches of 25, which is a magic number with no science behind it.
+    for rows in batched(_generate_spec_rows_for_folders(drive, folders), 25):
+        _append_rows(rows=rows)
 
     # Rename temporary file as the main one once it contains all the specs
     sheets.update_sheet_name(
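
A quick check of the backported batched helper's behaviour, matching its docstring comment (hypothetical REPL session):

    >>> list(batched("ABCDEFG", 3))
    [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]
    >>> [len(b) for b in batched(range(60), 25)]
    [25, 25, 10]

Since retries now wrap a whole batch, a failed _append_rows call resubmits all 25 rows; if the underlying Sheets append had partially succeeded before erroring, duplicate rows could in principle appear, a trade-off of batch-level retry.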
