From 7a1ae3fa432b5cdd4eb690ea1fddca56d31c34e8 Mon Sep 17 00:00:00 2001 From: Birger Schacht Date: Mon, 26 Aug 2024 08:57:21 +0200 Subject: [PATCH] fix(serializer): make reference file detection more failure resistant --- apis_ontology/serializers.py | 79 +++++++++++++++++------------------- apis_ontology/views.py | 34 ++++------------ 2 files changed, 46 insertions(+), 67 deletions(-) diff --git a/apis_ontology/serializers.py b/apis_ontology/serializers.py index 6774582..24eb8a0 100644 --- a/apis_ontology/serializers.py +++ b/apis_ontology/serializers.py @@ -9,7 +9,6 @@ from drf_spectacular.utils import extend_schema_field from drf_spectacular.types import OpenApiTypes from functools import cache -import roman DATEPATTERN = re.compile(r"(?P\d\d\d\d)-(?P\d\d)-(?P\d\d)") FOLIOPATTERN = re.compile(r"^(?P<cleanfolio>\d{1,3}[r|v]).*$") @@ -19,51 +18,47 @@ @cache def iiif_titles(): + full_dict = {} titles = requests.get("https://iiif.acdh-dev.oeaw.ac.at/images/sicprod/", headers={"Accept": "application/json"}) - return titles.json() + for title in titles.json(): + full_dict[title] = requests.get(f"https://iiif.acdh-dev.oeaw.ac.at/images/sicprod/{title}", headers={"Accept": "application/json"}).json() + return full_dict + + +def normalize_title(title: str) -> str: + return title.replace(" ", "_").replace("(", "").replace(")", "") + + +NUMBER = re.compile(r"(?P<number>\d+)") def get_folio(obj): - folio = obj.folio - # some references use pages + title = normalize_title(obj.bibtexjson["title"]) if page := obj.pages_start: - if page % 2 == 0: - # 7 (H) uses pages, but they files are named using recto/verso - if "7 (H)" in obj.bibtexjson["title"]: - return f"{page:03d}v-{page+1:03d}r" - else: - return f"{page:03d}-{page+1:03d}" - else: - if "7 (H)" in obj.bibtexjson["title"]: - return f"{page-1:03d}v-{page:03d}r" - else: - return f"{page-1:03d}-{page:03d}" + page = f"{page:03d}" if obj.folio: - if match := ROMANPATTERN.match(obj.folio): - romanfirst = match["romanfirst"] - try: - number = 
roman.fromRoman(romanfirst) - except roman.InvalidRomanNumeralError: - return f"Invalid roman numeral: {obj.folio}" - if match["rectoverso"] == "r": - number -= 1 - return f"{roman.toRoman(number)}v-{roman.toRoman(number+1)}r" - if match := FOLIOPATTERN.match(obj.folio): - cleanfolio = match["cleanfolio"] - nr = int(cleanfolio[:-1]) - if cleanfolio.endswith("r"): - folio = f"{nr-1:03d}v-{nr:03d}r" - if cleanfolio.endswith("v"): - folio = f"{nr:03d}v-{nr+1:03d}r" - return folio - if match := PAGEPATTERN.match(obj.folio): - page = int(match["page"]) - if page % 2 == 0: - folio = f"{page:03d}-{page+1:03d}" - else: - folio = f"{page-1:03d}-{page:03d}" - return folio - return folio + page = obj.folio + if "-" in obj.folio: + page = obj.folio.split("-")[0] + if "–" in obj.folio: + page = obj.folio.split("–")[0] + if page: + if match := NUMBER.match(page): + page = match["number"] + if page.endswith("v") or page.endswith("r"): + page = page[:-1] + try: + page = int(page) + page = f"{page:03d}" + except Exception: + pass + if page: + matches = [scanfile for scanfile in iiif_titles()[title] if page in scanfile] + if matches: + return matches[0] + print(obj.folio) + print(page) + return None class FixDateMixin: @@ -125,8 +120,8 @@ def get_scan_path(self, obj) -> str: def get_scandata(self, obj) -> dict: scandata = {} bibtex = json.loads(obj.bibtex) - title = bibtex["title"].replace(" ", "_").replace("(", "").replace(")", "") - if title in iiif_titles(): + title = normalize_title(bibtex["title"]) + if title in iiif_titles().keys(): scandata["title"] = title folio = get_folio(obj) scandata["pages"] = folio or f"{obj.pages_start}-{obj.pages_end}" diff --git a/apis_ontology/views.py b/apis_ontology/views.py index ef0521a..ac192b2 100644 --- a/apis_ontology/views.py +++ b/apis_ontology/views.py @@ -1,10 +1,8 @@ -import requests from django.views.generic.list import ListView from django.contrib.auth.mixins import LoginRequiredMixin from auditlog.models import LogEntry from 
apis_bibsonomy.models import Reference -from apis_ontology.serializers import iiif_titles, get_folio -from functools import cache +from apis_ontology.serializers import iiif_titles, get_folio, normalize_title import django_tables2 as tables from django.utils.html import format_html @@ -16,32 +14,18 @@ def get_queryset(self, *args, **kwargs): def scanfolderexists(ref): if "title" in ref.bibtexjson: - normtitle = ref.bibtexjson["title"].replace(" ", "_").replace("(", "").replace(")", "") - return normtitle in iiif_titles() + return normalize_title(ref.bibtexjson["title"]) in iiif_titles() return False -@cache -def iiif_files(title): - files = requests.get(f"https://iiif.acdh-dev.oeaw.ac.at/images/sicprod/{title}/", headers={"Accept": "application/json"}) - return files.json() - - -def scanfileexists(ref): - normtitle = ref.bibtexjson["title"].replace(" ", "_").replace("(", "").replace(")", "") - folio = get_folio(ref) - return f"{folio}.jpg" in iiif_files(normtitle) - - -def scanfile(ref): - normtitle = ref.bibtexjson["title"].replace(" ", "_").replace("(", "").replace(")", "") - folio = get_folio(ref) - return f"{normtitle}/{folio}" +def scanfolder(ref): + normtitle = normalize_title(ref.bibtexjson["title"]) + return f"{normtitle}/" class ReferenceFailTable(tables.Table): ref = tables.Column(empty_values=()) - scanfile = tables.Column(empty_values=()) + folder = tables.Column(empty_values=()) on = tables.Column(empty_values=()) def render_on(self, record): @@ -55,8 +39,8 @@ def render_on(self, record): def render_ref(self, record): return str(record) - def render_scanfile(self, record): - return format_html(scanfile(record)) + def render_folder(self, record): + return format_html(scanfolder(record)) class ReferenceScanFail(LoginRequiredMixin, tables.SingleTableView): @@ -66,5 +50,5 @@ class ReferenceScanFail(LoginRequiredMixin, tables.SingleTableView): def get_queryset(self, *args, **kwargs): refs = Reference.objects.all() refs = [ref for ref in refs if 
scanfolderexists(ref)] - refs = [ref for ref in refs if not scanfileexists(ref)] + refs = [ref for ref in refs if get_folio(ref) is None] return refs