From 370e0f94c5bdd2ab00bdddf868954211b0acb5c7 Mon Sep 17 00:00:00 2001 From: geoffroy-noel-ddh Date: Fri, 23 Aug 2024 17:55:01 +0100 Subject: [PATCH] feat(search): text search now sorted by book & page number; removed image placeholder; 15 items per result page. --- apps/textsearch/html-to-html-modern.xslt | 6 +- apps/textsearch/textsearch.js | 107 +++-------------------- frontend/entities/textsearch.md | 9 +- 3 files changed, 26 insertions(+), 96 deletions(-) diff --git a/apps/textsearch/html-to-html-modern.xslt b/apps/textsearch/html-to-html-modern.xslt index 6840b0882..9ad81f6bb 100644 --- a/apps/textsearch/html-to-html-modern.xslt +++ b/apps/textsearch/html-to-html-modern.xslt @@ -21,11 +21,13 @@ diff --git a/apps/textsearch/textsearch.js b/apps/textsearch/textsearch.js index 3c1f1404b..de0dc5277 100644 --- a/apps/textsearch/textsearch.js +++ b/apps/textsearch/textsearch.js @@ -27,7 +27,7 @@ const LABEL_FROM_KEY = { 'semidip': 'Semi-diplomatic', } const TO_BE_INDEXED_PATH = 'to-be-indexed' -const LIMIT = 2 +const LIMIT = 100 class TextSearch { constructor() { @@ -82,96 +82,32 @@ class TextSearch { } getMetadataFromPath(path, version='modern') { - // clone/dts/documents/book_two/p.99.html let ret = null; + // clone/dts/documents/book_one/p.12.html const regex = /\/dts\/documents\/(?\w+)\/p\.(?\d+)\.html$/; const match = regex.exec(path); + if (match) { + // 12 + const pageNumber = match.groups.page + // 0012 + const pageNumberPadded = pageNumber.padStart(4, "0"); ret = { book: LABEL_FROM_KEY[match.groups.bookKey], - page: match.groups.page, + page: pageNumber, version: LABEL_FROM_KEY[version], - url: `/edition/?p0.do=${match.groups.bookKey}&p0.lo=p.${match.groups.page}&p0.vi=${version}`, - title: `${LABEL_FROM_KEY[match.groups.bookKey]}, page ${match.groups.page}`, + url: `/edition/?p0.do=${match.groups.bookKey}&p0.lo=p.${pageNumber}&p0.vi=${version}`, + // Book 1, page 12 + title: `${LABEL_FROM_KEY[match.groups.bookKey]}, page ${pageNumber}`, + // 1-0012 for Book 1, page 12 + bookPage: `${Object.keys(LABEL_FROM_KEY).indexOf(match.groups.bookKey)}-${pageNumberPadded}`, } } return ret } - async writeJsonFromTei() { - this.entities = []; - - for (let source of sources) { - await this.loadTei(sourceBase + source); - } - - this.postProcessEntities(); - - this.writeJson(target, this.entities); - } - - postProcessEntities() { - // processing which is much simpler in JS than XSLT - for (let entity of this.entities) { - // remove duplicate pages in entity.pages - entity.pages = Object.fromEntries( - Object.entries(entity.pages).map(([k, v]) => [k, [...new Set(v)]]) - ); - // remove books from entity.pages which have no pages - entity.pages = Object.fromEntries( - Object.entries(entity.pages).filter(([k, v]) => v.length) - ); - // entity.books = list of books they appear in - entity.books = Object.keys(entity.pages); - // missing key for people with no first/surname - if (!entity?.sortkey && entity.type == "person") { - // 'John Thornton (1633-1669)' => "Thornton-John" - entity.sortkey = entity.title - .replace(/\([^)]+\)/g, "") - .trim() - .split(/\s+/) - .reverse() - .join("-"); - console.log( - `WARNING: fixed missing sorkey for ${entity.type}:${entity.id} = ${entity.sortkey}` - ); - } - if (!entity?.search) { - entity.search = entity.title; - } - // remove text between [] - entity.search = entity.search.replace(/\[.*?\]/g, ""); - } - - // sort by sortKey, optional, only for debugging purpose as itemjs will sort anyway - this.entities = this.entities.sort((a, b) => - a.sortkey.localeCompare(b.sortkey) - ); - } - - async loadTei(source) { - // let docString = this.readFile(source) - let entitiesJson = await this.xslt(source, jsonSheetPath); - // console.log(entitiesJson.substring(0, 1000)); - // fs.writeFileSync('tmp.json', entitiesJson, "utf8"); - - let entities = []; - - if (entitiesJson) { - entities = JSON.parse(entitiesJson); - } else { - console.log( - `WARNING: entities file (${source}) transformed into an empty string.` - ); - } - - for (let i in entities) { - this.entities.push(entities[i]); - } - } - readFile(source) { return fs.readFileSync(source).toString(); } @@ -206,7 +142,7 @@ class TextSearch { let firstLine = ''; ret = ret.replace(firstLine, ""); - console.log(ret.substring(0, 300)) + // console.log(ret.substring(0, 300)) return ret; } @@ -235,21 +171,6 @@ class TextSearch { return ret; } - writeJson(path, data) { - // envelope: add metadata; format inspired by JSON:API - data = { - meta: { - dateCreated: new Date().toISOString(), - }, - data: data, - }; - // console.log(data) - let dataStr = JSON.stringify(data, null, 2); - fs.writeFileSync(path, dataStr, "utf8"); - console.log( - `WRITE ${path} (${(dataStr.length / 1024 / 1024).toFixed(2)} MB)` - ); - } } new TextSearch().transformHTMLs(); diff --git a/frontend/entities/textsearch.md b/frontend/entities/textsearch.md index c64e70a7c..ea7d84ecf 100644 --- a/frontend/entities/textsearch.md +++ b/frontend/entities/textsearch.md @@ -8,7 +8,14 @@ title: Text Search {% endraw %}