Skip to content

Commit

Permalink
fix(serializer): make reference file detection more failure resistant
Browse files Browse the repository at this point in the history
  • Loading branch information
b1rger committed Aug 26, 2024
1 parent 66c8eff commit 7a1ae3f
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 67 deletions.
79 changes: 37 additions & 42 deletions apis_ontology/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from drf_spectacular.utils import extend_schema_field
from drf_spectacular.types import OpenApiTypes
from functools import cache
import roman

DATEPATTERN = re.compile(r"(?P<year>\d\d\d\d)-(?P<month>\d\d)-(?P<day>\d\d)")
FOLIOPATTERN = re.compile(r"^(?P<cleanfolio>\d{1,3}[r|v]).*$")
Expand All @@ -19,51 +18,47 @@

@cache
def iiif_titles():
    """Return the scan listings of the IIIF server, grouped by title folder.

    The top-level listing is fetched first, then one listing per entry in it.
    The result is memoised with ``functools.cache``, so the server is queried
    only once per process; a call that raises is not cached and will be
    retried on the next call.

    Returns:
        A dict mapping every entry of the top-level listing to the parsed
        JSON listing of that entry (presumably a list of scan file names --
        confirm against the server's response format).

    Raises:
        requests.RequestException: if a request fails, times out or the
            server answers with an HTTP error status.
    """
    base_url = "https://iiif.acdh-dev.oeaw.ac.at/images/sicprod/"
    headers = {"Accept": "application/json"}

    def fetch(url):
        # A timeout keeps a hanging server from blocking the caller forever;
        # raise_for_status keeps an error payload out of the cache.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        return response.json()

    return {title: fetch(f"{base_url}{title}") for title in fetch(base_url)}


def normalize_title(title: str) -> str:
    """Return *title* with spaces turned into underscores and parentheses removed.

    >>> normalize_title("7 (H)")
    '7_H'
    """
    # One translation pass: " " -> "_", while "(" and ")" are deleted.
    return title.translate(str.maketrans(" ", "_", "()"))


NUMBER = re.compile(r"(?P<number>\d+)")


def get_folio(obj):
    """Return the first scan file whose name contains the reference's page.

    The page is taken from ``obj.pages_start`` and overridden by ``obj.folio``
    when that is set. For a folio written as a range (``-`` or ``–``) only the
    part before the separator is used. A leading number, or a value that is
    numeric after dropping a trailing ``r``/``v``, is zero-padded to three
    digits; anything else is used as is.

    Args:
        obj: object providing ``bibtexjson`` (mapping with a ``"title"``
            key), ``pages_start`` and ``folio`` -- presumably a Reference;
            confirm against the callers.

    Returns:
        The first entry listed for the normalized title in ``iiif_titles()``
        that contains the page string, or ``None`` if there is no page or no
        match.

    Raises:
        KeyError: if ``obj.bibtexjson`` has no ``"title"`` entry.
    """
    title = normalize_title(obj.bibtexjson["title"])
    # some references use pages
    if page := obj.pages_start:
        page = f"{page:03d}"
    if obj.folio:
        page = obj.folio
        if "-" in obj.folio:
            page = obj.folio.split("-")[0]
        if "–" in obj.folio:
            page = obj.folio.split("–")[0]
    if page:
        if match := NUMBER.match(page):
            page = match["number"]
        if page.endswith(("v", "r")):
            page = page[:-1]
        try:
            page = f"{int(page):03d}"
        except ValueError:
            # Not a plain number (e.g. a roman numeral): search for it as is.
            pass
    if page:
        # An unknown title simply yields no matches instead of a KeyError.
        matches = [
            scanfile
            for scanfile in iiif_titles().get(title, ())
            if page in scanfile
        ]
        if matches:
            return matches[0]
    # Diagnostic output for references without a matching scan file.
    print(obj.folio)
    print(page)
    return None


class FixDateMixin:
Expand Down Expand Up @@ -125,8 +120,8 @@ def get_scan_path(self, obj) -> str:
def get_scandata(self, obj) -> dict:
scandata = {}
bibtex = json.loads(obj.bibtex)
title = bibtex["title"].replace(" ", "_").replace("(", "").replace(")", "")
if title in iiif_titles():
title = normalize_title(bibtex["title"])
if title in iiif_titles().keys():
scandata["title"] = title
folio = get_folio(obj)
scandata["pages"] = folio or f"{obj.pages_start}-{obj.pages_end}"
Expand Down
34 changes: 9 additions & 25 deletions apis_ontology/views.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import requests
from django.views.generic.list import ListView
from django.contrib.auth.mixins import LoginRequiredMixin
from auditlog.models import LogEntry
from apis_bibsonomy.models import Reference
from apis_ontology.serializers import iiif_titles, get_folio
from functools import cache
from apis_ontology.serializers import iiif_titles, get_folio, normalize_title
import django_tables2 as tables
from django.utils.html import format_html

Expand All @@ -16,32 +14,18 @@ def get_queryset(self, *args, **kwargs):

def scanfolderexists(ref):
    """Return whether ``iiif_titles()`` contains the normalized title of *ref*.

    References whose ``bibtexjson`` has no ``"title"`` entry yield ``False``.
    """
    if "title" not in ref.bibtexjson:
        return False
    return normalize_title(ref.bibtexjson["title"]) in iiif_titles()


@cache
def iiif_files(title):
files = requests.get(f"https://iiif.acdh-dev.oeaw.ac.at/images/sicprod/{title}/", headers={"Accept": "application/json"})
return files.json()


def scanfileexists(ref):
normtitle = ref.bibtexjson["title"].replace(" ", "_").replace("(", "").replace(")", "")
folio = get_folio(ref)
return f"{folio}.jpg" in iiif_files(normtitle)


def scanfile(ref):
normtitle = ref.bibtexjson["title"].replace(" ", "_").replace("(", "").replace(")", "")
folio = get_folio(ref)
return f"<a href='https://iiif.acdh-dev.oeaw.ac.at/images/sicprod/{normtitle}/{folio}.jpg'>{normtitle}/{folio}</a>"
def scanfolder(ref):
    """Return an HTML link to the scan folder of *ref* on the IIIF server.

    The normalized title is HTML-escaped before it is placed in the ``href``
    attribute and the link text, so quotes or angle brackets in a title
    cannot break out of the markup.

    Raises:
        KeyError: if ``ref.bibtexjson`` has no ``"title"`` entry.
    """
    from html import escape

    normtitle = escape(normalize_title(ref.bibtexjson["title"]))
    return f"<a href='https://iiif.acdh-dev.oeaw.ac.at/images/sicprod/{normtitle}/'>{normtitle}/</a>"


class ReferenceFailTable(tables.Table):
ref = tables.Column(empty_values=())
scanfile = tables.Column(empty_values=())
folder = tables.Column(empty_values=())
on = tables.Column(empty_values=())

def render_on(self, record):
Expand All @@ -55,8 +39,8 @@ def render_on(self, record):
def render_ref(self, record):
return str(record)

def render_scanfile(self, record):
return format_html(scanfile(record))
def render_folder(self, record):
return format_html(scanfolder(record))


class ReferenceScanFail(LoginRequiredMixin, tables.SingleTableView):
Expand All @@ -66,5 +50,5 @@ class ReferenceScanFail(LoginRequiredMixin, tables.SingleTableView):
def get_queryset(self, *args, **kwargs):
    """Return the references that have a scan folder but no matching scan file.

    A reference is kept when ``scanfolderexists`` is true for it and
    ``get_folio`` returns ``None``. ``get_folio`` is only called for
    references that passed the folder check.

    Returns:
        A list of Reference objects (not a queryset).
    """
    return [
        ref
        for ref in Reference.objects.all()
        if scanfolderexists(ref) and get_folio(ref) is None
    ]

0 comments on commit 7a1ae3f

Please sign in to comment.