feat: update working sheet in batches of 25 specs (#170)
* feat: update working sheet in batches of 25 specs

Retry in the face of errors talking to Sheets.

* fix: python 3.8
sparkiegeek committed Feb 1, 2024
1 parent 01623e8 commit 6f0436f
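
In outline, the commit pairs two pieces: chunking the generated rows into fixed-size batches, and retrying each batch append via tenacity. A minimal standalone sketch of that pattern follows; the flaky `append_rows` target is a hypothetical stand-in for the real Sheets call, not part of this commit:

    import random
    import tenacity

    try:
        from itertools import batched  # only available on Python 3.12+
    except ImportError:
        from itertools import islice

        def batched(iterable, n):
            # Backport: yield n-sized tuples; the last may be shorter.
            it = iter(iterable)
            while batch := tuple(islice(it, n)):  # walrus needs Python 3.8+
                yield batch

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(3),
        wait=tenacity.wait_incrementing(start=0.5, increment=0.8),
    )
    def append_rows(rows):
        # Hypothetical stand-in for the Sheets append; fails transiently.
        if random.random() < 0.3:
            raise ConnectionError("transient Sheets error")
        print(f"appended {len(rows)} rows")

    for chunk in batched(range(60), 25):
        append_rows(chunk)  # three chunks: 25, 25 and 10 rows

The "fix: python 3.8" follow-up fits here too: itertools.batched was only added in Python 3.12 (hence the ImportError fallback), and typing.List/typing.Dict are needed in place of the built-in generics like list[dict] that require Python 3.9+.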
Showing 2 changed files with 77 additions and 51 deletions.
requirements.txt: 3 changes (2 additions & 1 deletion)

@@ -7,4 +7,5 @@ python-dateutil==2.8.2
 lxml==4.9.2
 cachetools==5.3.0
 black==23.1.0
-flake8==6.0.0
+flake8==6.0.0
+tenacity==8.2.3
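
The new tenacity pin supplies the retry decorator used in webapp/update.py below. With stop_after_attempt(3) and wait_incrementing(start=0.5, increment=0.8), a failing call sleeps 0.5s before the second attempt and 1.3s before the third; if that also fails, tenacity raises RetryError by default (assuming tenacity 8.x semantics).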
webapp/update.py: 125 changes (75 additions & 50 deletions)

@@ -1,3 +1,7 @@
+from typing import List, Dict
+
+import tenacity
+
 from webapp.google import Drive, Sheets
 from webapp.spec import Spec
 from webapp.settings import (
@@ -7,6 +11,64 @@
     TMP_SHEET_TITLE,
 )
 
+try:
+    from itertools import batched
+except ImportError:
+    from itertools import islice
+
+    def batched(iterable, n):
+        # batched('ABCDEFG', 3) --> ABC DEF G
+        if n < 1:
+            raise ValueError("n must be at least one")
+        it = iter(iterable)
+        while batch := tuple(islice(it, n)):
+            yield batch
+
+
+def _generate_spec_rows_for_folders(drive: Drive, folders: List[Dict]):
+    for folder in folders:
+        query_doc_files = (
+            f"mimeType = 'application/vnd.google-apps.document' "
+            f"and '{folder['id']}' in parents"
+        )
+        files = drive.get_files(
+            query=query_doc_files,
+            fields=(
+                "id",
+                "name",
+                "createdTime",
+                "modifiedTime",
+                "webViewLink",
+            ),
+        )
+        for file_ in files:
+            try:
+                comments = drive.get_comments(
+                    file_id=file_["id"], fields=("resolved",)
+                )
+                open_comments = [c for c in comments if not c["resolved"]]
+                parsed_doc = Spec(google_drive=drive, document_id=file_["id"])
+            except Exception as e:
+                print(f"Unable to parse document: {file_['name']}", e)
+                continue
+
+            row = [
+                folder["name"],
+                file_["name"],
+                file_["id"],
+                file_["webViewLink"],
+                parsed_doc.metadata["index"],
+                parsed_doc.metadata["title"],
+                parsed_doc.metadata["status"],
+                ", ".join(parsed_doc.metadata["authors"]),
+                parsed_doc.metadata["type"],
+                file_["createdTime"],
+                file_["modifiedTime"],
+                len(comments),
+                len(open_comments),
+            ]
+            yield row
+
 
 def update_sheet() -> None:
     """
@@ -19,10 +81,17 @@ def update_sheet() -> None:
     specs_sheet = sheets.get_sheet_by_title(SPECS_SHEET_TITLE)
     tmp_sheet = sheets.ensure_sheet_by_title(TMP_SHEET_TITLE)
 
-    sheets.clear(sheet_id=tmp_sheet["properties"]["sheetId"])
+    @tenacity.retry(
+        stop=tenacity.stop_after_attempt(3),
+        wait=tenacity.wait_incrementing(start=0.5, increment=0.8),
+    )
+    def _append_rows(rows):
+        """Helper to retry extending the TMP_SHEET."""
+        return sheets.insert_rows(rows, range=TMP_SHEET_TITLE)
 
+    sheets.clear(sheet_id=tmp_sheet["properties"]["sheetId"])
     # Add headers
-    sheets.insert_rows(
+    _append_rows(
         rows=[
             [
                 "Folder name",
@@ -39,8 +108,7 @@ def update_sheet() -> None:
                 "Number of comments",
                 "Number of open comments",
             ]
-        ],
-        range=TMP_SHEET_TITLE,
+        ]
     )
 
     query_subfolders = (
@@ -49,52 +117,9 @@
     )
     folders = drive.get_files(query=query_subfolders, fields=("id", "name"))
 
-    for folder in folders:
-        query_doc_files = (
-            f"mimeType = 'application/vnd.google-apps.document' "
-            f"and '{folder['id']}' in parents"
-        )
-        files = drive.get_files(
-            query=query_doc_files,
-            fields=(
-                "id",
-                "name",
-                "createdTime",
-                "modifiedTime",
-                "webViewLink",
-            ),
-        )
-        for file in files:
-            try:
-                comments = drive.get_comments(
-                    file_id=file["id"], fields=("resolved",)
-                )
-                open_comments = [c for c in comments if not c["resolved"]]
-
-                parsed_doc = Spec(google_drive=drive, document_id=file["id"])
-            except Exception as e:
-                print(f"Unable to parse document: {file['name']}", e)
-                continue
-
-            row = [
-                folder["name"],
-                file["name"],
-                file["id"],
-                file["webViewLink"],
-                parsed_doc.metadata.get("index"),
-                parsed_doc.metadata.get("title"),
-                parsed_doc.metadata.get("status"),
-                ", ".join(parsed_doc.metadata.get("authors")),
-                parsed_doc.metadata.get("type"),
-                file["createdTime"],
-                file["modifiedTime"],
-                len(comments),
-                len(open_comments),
-            ]
-            sheets.insert_rows(
-                rows=[row],
-                range=TMP_SHEET_TITLE,
-            )
+    # Insert rows in batches of 25, which is a magic number with no science behind it.
+    for rows in batched(_generate_spec_rows_for_folders(drive, folders), 25):
+        _append_rows(rows=rows)
 
     # Rename temporary file as the main one once it contains all the specs
     sheets.update_sheet_name(
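
A quick check of the backported batched helper's behaviour, matching its docstring comment (hypothetical REPL session):

    >>> list(batched("ABCDEFG", 3))
    [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]
    >>> [len(b) for b in batched(range(60), 25)]
    [25, 25, 10]

Since retries now wrap a whole batch, a failed _append_rows call resubmits all 25 rows; if the underlying Sheets append had partially succeeded before erroring, duplicate rows could in principle appear, a trade-off of batch-level retry.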
