Skip to content

Commit

Permalink
feat(search): text search now sorted by book & page number; removed i…
Browse files Browse the repository at this point in the history
…mage placeholder; 15 items per result page.
  • Loading branch information
geoffroy-noel-ddh committed Aug 23, 2024
1 parent 20256f9 commit 370e0f9
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 96 deletions.
6 changes: 4 additions & 2 deletions apps/textsearch/html-to-html-modern.xslt
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@
</head>
<!-- The data-pagefind-* attributes declare which data-* attributes carry the
     metadata, filter and sort values of the page. The #...# values look like
     placeholders substituted by the indexing script (TODO confirm).
     Each attribute appears exactly once: duplicates are not well-formed XML. -->
<body data-pagefind-body=""
  data-pagefind-meta="url[data-url],title[data-title]"
  data-pagefind-sort="book-page[data-book-page]"
  data-title="#title#"
  data-url="#url#"
  data-pagefind-filter="book[data-book],version[data-version]"
  data-book="#book#"
  data-version="#version#"
  data-book-page="#bookPage#"
>
  <xsl:call-template name="copy-element" />
</body>
Expand Down
107 changes: 14 additions & 93 deletions apps/textsearch/textsearch.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ const LABEL_FROM_KEY = {
'semidip': 'Semi-diplomatic',
}
// NOTE(review): usage is not visible in this section; presumably the name of
// the folder holding the HTML files waiting to be indexed - confirm.
const TO_BE_INDEXED_PATH = 'to-be-indexed'
// Declared once only: a second `const LIMIT` is a SyntaxError.
// NOTE(review): usage is not visible in this section; presumably caps the
// number of items processed - confirm.
const LIMIT = 100

class TextSearch {
constructor() {
Expand Down Expand Up @@ -82,96 +82,32 @@ class TextSearch {
}

getMetadataFromPath(path, version='modern') {
// clone/dts/documents/book_two/p.99.html
let ret = null;

// clone/dts/documents/book_one/p.12.html
const regex = /\/dts\/documents\/(?<bookKey>\w+)\/p\.(?<page>\d+)\.html$/;
const match = regex.exec(path);

if (match) {
// 12
const pageNumber = match.groups.page
// 0012
const pageNumberPadded = pageNumber.padStart(4, "0");
ret = {
book: LABEL_FROM_KEY[match.groups.bookKey],
page: match.groups.page,
page: pageNumber,
version: LABEL_FROM_KEY[version],
url: `/edition/?p0.do=${match.groups.bookKey}&p0.lo=p.${match.groups.page}&p0.vi=${version}`,
title: `${LABEL_FROM_KEY[match.groups.bookKey]}, page ${match.groups.page}`,
url: `/edition/?p0.do=${match.groups.bookKey}&p0.lo=p.${pageNumber}&p0.vi=${version}`,
// Book 1, page 12
title: `${LABEL_FROM_KEY[match.groups.bookKey]}, page ${pageNumber}`,
// 1-0012 for Book 1, page 12
bookPage: `${Object.keys(LABEL_FROM_KEY).indexOf(match.groups.bookKey)}-${pageNumberPadded}`,
}
}

return ret
}

async writeJsonFromTei() {
this.entities = [];

for (let source of sources) {
await this.loadTei(sourceBase + source);
}

this.postProcessEntities();

this.writeJson(target, this.entities);
}

postProcessEntities() {
// processing which is much simpler in JS than XSLT
for (let entity of this.entities) {
// remove duplicate pages in entity.pages
entity.pages = Object.fromEntries(
Object.entries(entity.pages).map(([k, v]) => [k, [...new Set(v)]])
);
// remove books from entity.pages which have no pages
entity.pages = Object.fromEntries(
Object.entries(entity.pages).filter(([k, v]) => v.length)
);
// entity.books = list of books they appear in
entity.books = Object.keys(entity.pages);
// missing key for people with no first/surname
if (!entity?.sortkey && entity.type == "person") {
// 'John Thornton (1633-1669)' => "Thornton-John"
entity.sortkey = entity.title
.replace(/\([^)]+\)/g, "")
.trim()
.split(/\s+/)
.reverse()
.join("-");
console.log(
`WARNING: fixed missing sorkey for ${entity.type}:${entity.id} = ${entity.sortkey}`
);
}
if (!entity?.search) {
entity.search = entity.title;
}
// remove text between []
entity.search = entity.search.replace(/\[.*?\]/g, "");
}

// sort by sortKey, optional, only for debugging purpose as itemjs will sort anyway
this.entities = this.entities.sort((a, b) =>
a.sortkey.localeCompare(b.sortkey)
);
}

async loadTei(source) {
// let docString = this.readFile(source)
let entitiesJson = await this.xslt(source, jsonSheetPath);
// console.log(entitiesJson.substring(0, 1000));
// fs.writeFileSync('tmp.json', entitiesJson, "utf8");

let entities = [];

if (entitiesJson) {
entities = JSON.parse(entitiesJson);
} else {
console.log(
`WARNING: entities file (${source}) transformed into an empty string.`
);
}

for (let i in entities) {
this.entities.push(entities[i]);
}
}

readFile(source) {
return fs.readFileSync(source).toString();
}
Expand Down Expand Up @@ -206,7 +142,7 @@ class TextSearch {
let firstLine = '<?xml version="1.0" encoding="UTF-8"?>';
ret = ret.replace(firstLine, "");

console.log(ret.substring(0, 300))
// console.log(ret.substring(0, 300))

return ret;
}
Expand Down Expand Up @@ -235,21 +171,6 @@ class TextSearch {
return ret;
}

writeJson(path, data) {
// envelope: add metadata; format inspired by JSON:API
data = {
meta: {
dateCreated: new Date().toISOString(),
},
data: data,
};
// console.log(data)
let dataStr = JSON.stringify(data, null, 2);
fs.writeFileSync(path, dataStr, "utf8");
console.log(
`WRITE ${path} (${(dataStr.length / 1024 / 1024).toFixed(2)} MB)`
);
}
}

// Script entry point: create a TextSearch and run its transformHTMLs() step.
new TextSearch().transformHTMLs();
9 changes: 8 additions & 1 deletion frontend/entities/textsearch.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,14 @@ title: Text Search
<div id="search"></div>
<script>
window.addEventListener('DOMContentLoaded', () => {
  // A single Pagefind UI instance is mounted on #search; creating a second
  // one on the same element would initialise the search box twice.
  new PagefindUI({
    element: "#search",
    showSubResults: false,
    // 15 results per page of results
    pageSize: 15,
    // no image (or image placeholder) next to the results
    showImages: false,
    autofocus: true,
    // order results by the "book-page" sort key, ascending
    sort: { "book-page": "asc" }
  });
});
</script>
{% endraw %}

0 comments on commit 370e0f9

Please sign in to comment.