From a2865b72d646fd93d56bf32e220b9d0653f1fd1b Mon Sep 17 00:00:00 2001 From: geoffroy-noel-ddh Date: Mon, 19 Aug 2024 16:45:58 +0100 Subject: [PATCH 1/9] added git cloned repo to ignorelist --- .gitignore | 1 + apps/textsearch/html-to-html.sef.jsont | 1 + apps/textsearch/html-to-html.xslt | 17 ++ apps/textsearch/package-lock.json | 206 ++++++++++++++ apps/textsearch/package.json | 16 ++ apps/textsearch/textsearch.js | 185 +++++++++++++ frontend/assets/js/textsearch.js | 364 +++++++++++++++++++++++++ frontend/entities/textsearch.md | 15 + 8 files changed, 805 insertions(+) create mode 100644 apps/textsearch/html-to-html.sef.jsont create mode 100644 apps/textsearch/html-to-html.xslt create mode 100644 apps/textsearch/package-lock.json create mode 100644 apps/textsearch/package.json create mode 100644 apps/textsearch/textsearch.js create mode 100644 frontend/assets/js/textsearch.js create mode 100644 frontend/entities/textsearch.md diff --git a/.gitignore b/.gitignore index 2e1af80d1..719a7c041 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ tei_all.rng /apps/checktexts/edition .idea/ /apps/entities/xslt/tei-to-json.sef.json +/apps/textsearch/clone \ No newline at end of file diff --git a/apps/textsearch/html-to-html.sef.jsont b/apps/textsearch/html-to-html.sef.jsont new file mode 100644 index 000000000..b5eb83d43 --- /dev/null +++ b/apps/textsearch/html-to-html.sef.jsont @@ -0,0 +1 @@ +{"N":"package","version":"20","packageVersion":"1","saxonVersion":"SaxonJS 2.6","target":"JS","targetVersion":"2","name":"TOP-LEVEL","relocatable":"false","buildDateTime":"2024-08-19T01:17:40.5+01:00","ns":"xml=~ =http://www.tei-c.org/ns/1.0 xsl=~ tei=http://www.tei-c.org/ns/1.0","C":[{"N":"co","id":"0","uniform":"true","binds":"1","C":[{"N":"template","flags":"os","module":"html-to-html.xslt","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","name":"Q{}copy-element","line":"11","expand-text":"false","sType":"1 
","C":[{"N":"copy","sType":"1 ","flags":"cin","role":"body","line":"12","C":[{"N":"applyT","sType":"* ","line":"13","mode":"#unnamed","bSlot":"0","C":[{"N":"docOrder","sType":"*N u[NT,NP,NC,NE]","role":"select","line":"13","C":[{"N":"union","op":"|","sType":"*N u[NT,NP,NC,NE]","ns":"= xml=~ fn=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 ","C":[{"N":"axis","name":"attribute","nodeTest":"*NA"},{"N":"axis","name":"child","nodeTest":"*N u[NT,NP,NC,NE]"}]}]}]}]}]}]},{"N":"co","id":"1","binds":"0","C":[{"N":"mode","onNo":"TC","flags":"","patternSlots":"0","prec":"","C":[{"N":"templateRule","rank":"0","prec":"0","seq":"0","ns":"xml=~ =http://www.tei-c.org/ns/1.0 xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"7","module":"html-to-html.xslt","expand-text":"false","match":"@*|node()","prio":"-0.5","matches":"NA","C":[{"N":"p.nodeTest","role":"match","test":"NA","sType":"1NA"},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","role":"action","line":"8"}]},{"N":"templateRule","rank":"1","prec":"0","seq":"0","ns":"xml=~ =http://www.tei-c.org/ns/1.0 xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"7","module":"html-to-html.xslt","expand-text":"false","match":"@*|node()","prio":"-0.5","matches":"N u[NT,NP,NC,NE]","C":[{"N":"p.nodeTest","role":"match","test":"N u[NT,NP,NC,NE]","sType":"1N u[NT,NP,NC,NE]"},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","role":"action","line":"8"}]}]}]},{"N":"overridden"},{"N":"output","C":[{"N":"property","name":"Q{http://saxon.sf.net/}stylesheet-version","value":"20"}]},{"N":"decimalFormat"}],"Σ":"c894e905"} \ No newline at end of file diff --git a/apps/textsearch/html-to-html.xslt b/apps/textsearch/html-to-html.xslt new file mode 100644 index 000000000..4cac3e022 --- /dev/null 
+++ b/apps/textsearch/html-to-html.xslt @@ -0,0 +1,17 @@ + + + + + + + + + + + + + diff --git a/apps/textsearch/package-lock.json b/apps/textsearch/package-lock.json new file mode 100644 index 000000000..73d31b57d --- /dev/null +++ b/apps/textsearch/package-lock.json @@ -0,0 +1,206 @@ +{ + "name": "textsearch", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "textsearch", + "version": "0.1.0", + "license": "ISC", + "dependencies": { + "pagefind": "^1.1.0", + "saxon-js": "^2.6.0" + } + }, + "node_modules/@pagefind/darwin-arm64": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@pagefind/darwin-arm64/-/darwin-arm64-1.1.0.tgz", + "integrity": "sha512-SLsXNLtSilGZjvqis8sX42fBWsWAVkcDh1oerxwqbac84HbiwxpxOC2jm8hRwcR0Z55HPZPWO77XeRix/8GwTg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@pagefind/darwin-x64": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@pagefind/darwin-x64/-/darwin-x64-1.1.0.tgz", + "integrity": "sha512-QjQSE/L5oS1C8N8GdljGaWtjCBMgMtfrPAoiCmINTu9Y9dp0ggAyXvF8K7Qg3VyIMYJ6v8vg2PN7Z3b+AaAqUA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@pagefind/linux-arm64": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@pagefind/linux-arm64/-/linux-arm64-1.1.0.tgz", + "integrity": "sha512-8zjYCa2BtNEL7KnXtysPtBELCyv5DSQ4yHeK/nsEq6w4ToAMTBl0K06khqxdSGgjMSwwrxvLzq3so0LC5Q14dA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@pagefind/linux-x64": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@pagefind/linux-x64/-/linux-x64-1.1.0.tgz", + "integrity": "sha512-4lsg6VB7A6PWTwaP8oSmXV4O9H0IHX7AlwTDcfyT+YJo/sPXOVjqycD5cdBgqNLfUk8B9bkWcTDCRmJbHrKeCw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + 
"node_modules/@pagefind/windows-x64": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@pagefind/windows-x64/-/windows-x64-1.1.0.tgz", + "integrity": "sha512-OboCM76BcMKT9IoSfZuFhiqMRgTde8x4qDDvKulFmycgiJrlL5WnIqBHJLQxZq+o2KyZpoHF97iwsGAm8c32sQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "license": "MIT" + }, + "node_modules/axios": { + "version": "1.7.4", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.7.4.tgz", + "integrity": "sha512-DukmaFRnY6AzAALSH4J2M3k6PkaC+MfaAGdEERRWcC9q3/TWQwLpHR8ZRLKTdQ3aBDL64EdluRDjJqKw+BPZEw==", + "license": "MIT", + "dependencies": { + "follow-redirects": "^1.15.6", + "form-data": "^4.0.0", + "proxy-from-env": "^1.1.0" + } + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/follow-redirects": { + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", + "funding": [ + { + "type": "individual", + 
"url": "https://github.com/sponsors/RubenVerborgh" + } + ], + "license": "MIT", + "engines": { + "node": ">=4.0" + }, + "peerDependenciesMeta": { + "debug": { + "optional": true + } + } + }, + "node_modules/form-data": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", + "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/pagefind": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/pagefind/-/pagefind-1.1.0.tgz", + "integrity": "sha512-1nmj0/vfYcMxNEQj0YDRp6bTVv9hI7HLdPhK/vBBYlrnwjATndQvHyicj5Y7pUHrpCFZpFnLVQXIF829tpFmaw==", + "license": "MIT", + "bin": { + "pagefind": "lib/runner/bin.cjs" + }, + "optionalDependencies": { + "@pagefind/darwin-arm64": "1.1.0", + "@pagefind/darwin-x64": "1.1.0", + "@pagefind/linux-arm64": "1.1.0", + "@pagefind/linux-x64": "1.1.0", + "@pagefind/windows-x64": "1.1.0" + } + }, + "node_modules/proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": 
"sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", + "license": "MIT" + }, + "node_modules/saxon-js": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/saxon-js/-/saxon-js-2.6.0.tgz", + "integrity": "sha512-4dinQEGz/OQX0cnmwLTbjVFY9KciMGRyfA6AUsMCO/mKDOwDxOJFmzoLStieTpEiOB/98E1E4VKV1ElsiD88yQ==", + "license": "SEE LICENSE IN LICENSE.txt", + "dependencies": { + "axios": "^1.5.1" + } + } + } +} diff --git a/apps/textsearch/package.json b/apps/textsearch/package.json new file mode 100644 index 000000000..7186caccb --- /dev/null +++ b/apps/textsearch/package.json @@ -0,0 +1,16 @@ +{ + "name": "textsearch", + "version": "0.1.0", + "main": "textsearch.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1", + "clone": "rm -rf clone && git clone https://github.com/kingsdigitallab/alice-thornton.git -b dts --single-branch clone" + }, + "author": "", + "license": "ISC", + "description": "", + "dependencies": { + "pagefind": "^1.1.0", + "saxon-js": "^2.6.0" + } +} diff --git a/apps/textsearch/textsearch.js b/apps/textsearch/textsearch.js new file mode 100644 index 000000000..fb98e0ec1 --- /dev/null +++ b/apps/textsearch/textsearch.js @@ -0,0 +1,185 @@ +"use strict"; + +// const gfetch = require("node-fetch"); +const SaxonJS = require("saxon-js"); +const fs = require("fs"); +const pathp = require('path'); +const {execSync} = require('child_process') + +const sourceBase = "./clone/dts/documents/"; +// const sourceBase = +// "https://raw.githubusercontent.com/kingsdigitallab/alice-thornton/edition/entities/"; +const sources = ["people.xml", "places.xml", "events.xml"]; +// const sources = ["events.xml"]; +const target = "../../frontend/assets/js/entities.json"; +// const jsonSheetPath = "html-to-html.sef.json"; +const XSLTPath = "html-to-html.xslt" + +class TextSearch { + constructor() { + this.entities = []; + } + + async transformHTMLs() { + var paths = fs.readdirSync(sourceBase); + while 
(paths.length) { + let path = paths.pop() + let pathAbs = pathp.join(sourceBase, path) + if (fs.lstatSync(pathAbs).isDirectory()) { + for (let p of fs.readdirSync(pathAbs)) { + paths.push(pathp.join(path, p)) + } + } else { + if (path.endsWith('.html')) { + await this.transformHTML(pathAbs) + break; + } + } + } + } + + async transformHTML(path) { + console.log(path) + let entitiesJson = await this.xslt(path, XSLTPath); + } + + async writeJsonFromTei() { + this.entities = []; + + for (let source of sources) { + await this.loadTei(sourceBase + source); + } + + this.postProcessEntities(); + + this.writeJson(target, this.entities); + } + + postProcessEntities() { + // processing which is much simpler in JS than XSLT + for (let entity of this.entities) { + // remove duplicate pages in entity.pages + entity.pages = Object.fromEntries( + Object.entries(entity.pages).map(([k, v]) => [k, [...new Set(v)]]) + ); + // remove books from entity.pages which have no pages + entity.pages = Object.fromEntries( + Object.entries(entity.pages).filter(([k, v]) => v.length) + ); + // entity.books = list of books they appear in + entity.books = Object.keys(entity.pages); + // missing key for people with no first/surname + if (!entity?.sortkey && entity.type == "person") { + // 'John Thornton (1633-1669)' => "Thornton-John" + entity.sortkey = entity.title + .replace(/\([^)]+\)/g, "") + .trim() + .split(/\s+/) + .reverse() + .join("-"); + console.log( + `WARNING: fixed missing sorkey for ${entity.type}:${entity.id} = ${entity.sortkey}` + ); + } + if (!entity?.search) { + entity.search = entity.title; + } + // remove text between [] + entity.search = entity.search.replace(/\[.*?\]/g, ""); + } + + // sort by sortKey, optional, only for debugging purpose as itemjs will sort anyway + this.entities = this.entities.sort((a, b) => + a.sortkey.localeCompare(b.sortkey) + ); + } + + async loadTei(source) { + // let docString = this.readFile(source) + let entitiesJson = await this.xslt(source, 
jsonSheetPath); + // console.log(entitiesJson.substring(0, 1000)); + // fs.writeFileSync('tmp.json', entitiesJson, "utf8"); + + let entities = []; + + if (entitiesJson) { + entities = JSON.parse(entitiesJson); + } else { + console.log( + `WARNING: entities file (${source}) transformed into an empty string.` + ); + } + + for (let i in entities) { + this.entities.push(entities[i]); + } + } + + readFile(source) { + return fs.readFileSync(source).toString(); + } + + xslt(docPath, XSLTPath) { + let docString = null; + + let jsonSheetPath = this.writeTransformJson(XSLTPath) + + docString = this.readFile(docPath); + + let output = SaxonJS.transform( + { + stylesheetFileName: jsonSheetPath, + sourceText: docString, + // sourceFileName: docPath, + destination: "serialized", + }, + "sync" + ); + + let ret = output.principalResult; + + // TODO: find another way to remove first line + let firstLine = ''; + ret = ret.replace(firstLine, ""); + + return ret; + } + + writeTransformJson(transformXsltPath) { + if (!fs.existsSync(transformXsltPath)) { + throw new Error(`Transform file not found: ${transformXsltPath}`) + } + let ret = transformXsltPath.replace('.xsl', '.sef.json') + if (this.getFileModifiedTime(ret) < this.getFileModifiedTime(transformXsltPath)) { + execSync(`npx xslt3 -xsl:${transformXsltPath} -export:${ret} -t -ns:##html5 -nogo`) + } + return ret + } + + getFileModifiedTime(path) { + let ret = 0 + if (fs.existsSync(path)) { + ret = fs.statSync(path).mtime.getTime() + } + return ret + } + + + writeJson(path, data) { + // envelope: add metadata; format inspired by JSON:API + data = { + meta: { + dateCreated: new Date().toISOString(), + }, + data: data, + }; + // console.log(data) + let dataStr = JSON.stringify(data, null, 2); + fs.writeFileSync(path, dataStr, "utf8"); + console.log( + `WRITE ${path} (${(dataStr.length / 1024 / 1024).toFixed(2)} MB)` + ); + } +} + +new TextSearch().transformHTMLs(); diff --git a/frontend/assets/js/textsearch.js 
b/frontend/assets/js/textsearch.js new file mode 100644 index 000000000..b545c44c4 --- /dev/null +++ b/frontend/assets/js/textsearch.js @@ -0,0 +1,364 @@ +const { createApp } = window.Vue; +const entitiesSource = "/assets/js/entities.json"; + +function setUpSearch() { + let app = createApp({ + data() { + return { + meta: { + dateCreated: "2023-10-23T21:42:39.127Z", + }, + views: { + collapsed: { + title: "Expand results", + action: "Expand/Collapse", + icon: "fa-caret-right", + }, + expanded: { + title: "Collapse results", + action: "Expand/Collapse", + icon: "fa-caret-down", + }, + }, + selection: { + view: "collapsed", + query: "", + hi: "", // the entity id passed by the viewer + type: "", // ??? unused? + perPage: 10, + page: 1, + filterByAnyOrAllBooks: "any", + }, + _facets: { + type: { + name: "Type", + options: { + person: { name: "People", selected: false }, + place: { name: "Places", selected: false }, + event: { name: "Events", selected: false }, + }, + }, + }, + updating: false, + results: { + data: { + items: [], + aggregations: {}, + }, + pagination: { + page: 1, + per_page: 10, + total: 1, + }, + }, + }; + }, + mounted() { + this.setSelectionFromAddressBar(); + this.fetchRecords(); + }, + computed: { + selectedView() { + return this.views[this.selection.view]; + }, + indexTimeStamp() { + return new Date(this.meta.dateCreated).toUTCString(); + }, + searchConfiguration() { + let ret = { + sortings: { + name_asc: { + field: "sortkey", + order: "asc", + }, + }, + aggregations: { + books: { + title: "By book", + size: 5, + sort: "key", + conjunction: this.selection.filterByAnyOrAllBooks == "all", + chosen_filters_on_top: false, + }, + type: { + title: "By result type", + size: 10, + sort: "key", + conjunction: false, + chosen_filters_on_top: false, + }, + cat: { + title: "By event type", + size: 100, + conjunction: false, + forType: "event", + }, + region: { + title: "By region", + size: 100, + conjunction: false, + forType: "place", + }, + }, + // we 
can't make the 'id' field searchable, + // otherwise user typing 'p', would bring up all the ppl:XXX. + // The offline indexer has prefixed the id with _ + // and stuck it at the end of 'search'. + // We also replace : with _. : is used by itemjs for field:value query syntax. + searchableFields: ["search"], + removeStopWordFilter: true, + }; + if (window.metadata.hideEventsFromSearchPage) { + delete ret["aggregations"]["cat"]; + } + return ret; + }, + facets() { + return this.results.data.aggregations; + }, + selectedTypes() { + // return an array of selected result types. E.g. ['person'] + return (this.facets?.type?.buckets || []) + .filter((b) => b.selected) + .map((b) => b.key); + }, + filteredFacets() { + // only returns facets relevant to the selected result type (itself a facet) + // see .forType in this.searchConfiguration + let selectedTypes = this.selectedTypes; + return Object.fromEntries( + Object.entries(this.facets).filter(([facetKey, facet]) => { + let forType = + this.searchConfiguration.aggregations[facetKey || facet.name] + .forType; + return forType + ? selectedTypes.length == 0 || selectedTypes.includes(forType) + : true; + }) + ); + }, + items() { + return this.results.data.items; + }, + lastPageNumber() { + return ( + Math.trunc( + (this.results.pagination.total - 1) / + this.results.pagination.per_page + ) + 1 + ); + }, + }, + watch: { + // "results.data.aggregations": { + // // eslint-disable-next-line + // handler(newValue, oldValue) { + // if (newValue != oldValue) { + // this.search(); + // } + // }, + // deep: true, + // }, + "selection.filterByAnyOrAllBooks": { + // eslint-disable-next-line + handler(newValue, oldValue) { + this.configureSearch(); + }, + }, + }, + methods: { + isBioVisible(item) { + // hidden if the bio is surrounded by square brackets. 
+ return !item.bio.match(/^\s*\[.*\]\s*$/); + }, + onChangeView(viewKey) { + if (viewKey) { + this.selection.view = viewKey; + } else { + // just rotate through the views + let keys = Object.keys(this.views); + let index = keys.indexOf(this.selection.view) + 1; + if (index >= keys.length) index = 0; + viewKey = keys[index]; + } + this.selection.view = viewKey; + }, + isResultExpanded(item) { + return this.selection.view == "expanded" ? item.id : false; + }, + getClassFromType(type) { + const typesClass = { + person: "fa-user", + place: "fa-map-marker-alt", + event: "fa-calendar", + }; + return typesClass[type]; + }, + isLocusVisible(bookId, page) { + return window.isLocusVisible(bookId, page); + }, + getLabelFromOptionKey(optionKey) { + let labelFromKey = { + book_of_remembrances: "Book Rem", + book_one: "Book 1", + book_two: "Book 2", + book_three: "Book 3", + + person: "Person", + place: "Place, region", + event: "Event", + }; + return labelFromKey[optionKey] || optionKey; + }, + onClickNextPage() { + this.selection.page++; + if (this.selection.page > this.lastPageNumber) { + this.selection.page = this.lastPageNumber; + } + this.search(true); + }, + onClickPrevPage() { + this.selection.page--; + if (this.selection.page < 1) { + this.selection.page = 1; + } + this.search(true); + }, + getBuckets(facet) { + return facet.buckets; + }, + onClickOption() { + window.Vue.nextTick(() => { + this.search(); + }); + }, + onSubmitInputs() { + this.search(); + }, + clearSelection() { + this.selection.hi = ""; + this.selection.query = ""; + this.selection.type = ""; + + for (let facetKey of Object.keys(this.filteredFacets)) { + let facet = this.filteredFacets[facetKey]; + for (let option of facet.buckets) { + option.selected = false; + } + } + + this.search(); + }, + search(keepPage = false) { + this.updating = true; + + if (!keepPage) { + this.selection.page = 1; + } + + let filters = {}; + for (let facetKey of Object.keys(this.filteredFacets)) { + let facet = 
this.filteredFacets[facetKey]; + for (let option of facet.buckets) { + if (option.selected) { + if (!filters[facetKey]) { + filters[facetKey] = []; + } + filters[facetKey].push(option.key); + } + } + } + + let searchParameters = { + per_page: this.selection.perPage, + page: this.selection.page, + sort: "name_asc", + filters: filters, + }; + + let entityId = this.selection.hi; + if (entityId) { + searchParameters.filter = (e) => { + return e.id == entityId; + }; + } else { + searchParameters.query = this.selection.query; + } + + this.results = this.itemsjs.search(searchParameters); + + window.Vue.nextTick(() => { + this.updating = false; + }); + + //console.log(this.itemsjs.aggregations()) + }, + fetchRecords() { + this.records = []; + fetch(entitiesSource) + .then((res) => res.json()) + .then((data) => { + this.meta = data.meta; + this.records = data.data; + this.processRecords(); + this.configureSearch(); + }); + }, + configureSearch() { + // this.itemsjs = window.itemsjs(this.records, this.searchConfiguration); + this.search(); + }, + processRecords() { + // for (let record of this.records) { + // // record.titleSearch = record.title.replace(/\b(c|mr|mrs|sir|born|lady)\b/ig, '').replace(/\W+/g, ' ') + // if (record.search != record.title) { + // // console.log(`${record.title} => ${record.search}`); + // } + // } + if (window.metadata.hideEventsFromSearchPage) { + this.records = this.records.filter((r) => r.type != "event"); + } + }, + setAddressBarFromSelection() { + // ?p1.so=&p1.co=&p2.so=... + // let searchParams = new URLSearchParams(window.location.search) + let searchParams = ""; + let newRelativePathQuery = + window.location.pathname + "?" 
+ searchParams; + history.pushState(null, "", newRelativePathQuery); + }, + async setSelectionFromAddressBar() { + let searchParams = new URLSearchParams(window.location.search); + // let q = searchParams.get("q"); + // if (q) { + // q = q.replace(/^(ppl|place):/, ""); + // this.selection.query = q; + // } + // + let hi = searchParams.get("hi"); + if (hi) { + // hi = hi.replace(/^(ppl|place):/, ""); + this.selection.hi = hi; + this.selection.view = "expanded"; + } + // console.log(searchParams); + }, + getContentClasses(panel) { + return `view-${panel.selections.view}`; + }, + getPageParts(page) { + // '123-130' => [123, 130] + // '123' => [123] + let ret = [...new Set(`${page}`.split("-"))]; + return ret; + }, + isSinglePage(pages) { + let ret = pages.length == 1 && this.getPageParts(pages[0]).length == 1; + return ret; + }, + }, + }); + app.mount("#textsearch"); +} + +setUpSearch(); diff --git a/frontend/entities/textsearch.md b/frontend/entities/textsearch.md new file mode 100644 index 000000000..fca332689 --- /dev/null +++ b/frontend/entities/textsearch.md @@ -0,0 +1,15 @@ +--- +title: Text Search +--- + +{% raw %} + + + + + +{% endraw %} From e76f6409c1c92a3b29d273149389e266d3fd03a1 Mon Sep 17 00:00:00 2001 From: geoffroy-noel-ddh Date: Mon, 19 Aug 2024 16:47:14 +0100 Subject: [PATCH 2/9] fix: autoformatting. 
--- apps/textsearch/textsearch.js | 44 +++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/apps/textsearch/textsearch.js b/apps/textsearch/textsearch.js index fb98e0ec1..c6d6fc4e5 100644 --- a/apps/textsearch/textsearch.js +++ b/apps/textsearch/textsearch.js @@ -1,10 +1,10 @@ "use strict"; // const gfetch = require("node-fetch"); +const { execSync } = require("child_process"); const SaxonJS = require("saxon-js"); +const pathp = require("path"); const fs = require("fs"); -const pathp = require('path'); -const {execSync} = require('child_process') const sourceBase = "./clone/dts/documents/"; // const sourceBase = @@ -13,7 +13,7 @@ const sources = ["people.xml", "places.xml", "events.xml"]; // const sources = ["events.xml"]; const target = "../../frontend/assets/js/entities.json"; // const jsonSheetPath = "html-to-html.sef.json"; -const XSLTPath = "html-to-html.xslt" +const XSLTPath = "html-to-html.xslt"; class TextSearch { constructor() { @@ -23,15 +23,15 @@ class TextSearch { async transformHTMLs() { var paths = fs.readdirSync(sourceBase); while (paths.length) { - let path = paths.pop() - let pathAbs = pathp.join(sourceBase, path) + let path = paths.pop(); + let pathAbs = pathp.join(sourceBase, path); if (fs.lstatSync(pathAbs).isDirectory()) { for (let p of fs.readdirSync(pathAbs)) { - paths.push(pathp.join(path, p)) + paths.push(pathp.join(path, p)); } } else { - if (path.endsWith('.html')) { - await this.transformHTML(pathAbs) + if (path.endsWith(".html")) { + await this.transformHTML(pathAbs); break; } } @@ -39,7 +39,7 @@ class TextSearch { } async transformHTML(path) { - console.log(path) + console.log(path); let entitiesJson = await this.xslt(path, XSLTPath); } @@ -122,7 +122,7 @@ class TextSearch { xslt(docPath, XSLTPath) { let docString = null; - let jsonSheetPath = this.writeTransformJson(XSLTPath) + let jsonSheetPath = this.writeTransformJson(XSLTPath); docString = this.readFile(docPath); @@ -147,24 +147,28 @@ 
class TextSearch { writeTransformJson(transformXsltPath) { if (!fs.existsSync(transformXsltPath)) { - throw new Error(`Transform file not found: ${transformXsltPath}`) + throw new Error(`Transform file not found: ${transformXsltPath}`); } - let ret = transformXsltPath.replace('.xsl', '.sef.json') - if (this.getFileModifiedTime(ret) < this.getFileModifiedTime(transformXsltPath)) { - execSync(`npx xslt3 -xsl:${transformXsltPath} -export:${ret} -t -ns:##html5 -nogo`) + let ret = transformXsltPath.replace(".xsl", ".sef.json"); + if ( + this.getFileModifiedTime(ret) < + this.getFileModifiedTime(transformXsltPath) + ) { + execSync( + `npx xslt3 -xsl:${transformXsltPath} -export:${ret} -t -ns:##html5 -nogo` + ); } - return ret + return ret; } getFileModifiedTime(path) { - let ret = 0 + let ret = 0; if (fs.existsSync(path)) { - ret = fs.statSync(path).mtime.getTime() + ret = fs.statSync(path).mtime.getTime(); } - return ret + return ret; } - - + writeJson(path, data) { // envelope: add metadata; format inspired by JSON:API data = { From eae9e741e13d528a7cc6324d4a0efc4ed975a9b8 Mon Sep 17 00:00:00 2001 From: geoffroy-noel-ddh Date: Wed, 21 Aug 2024 12:36:04 +0100 Subject: [PATCH 3/9] feat(search): first working version of the html transforms to prepare the edition pages to be indexed by pagefind. 
--- .gitignore | 3 ++- apps/textsearch/html-to-html.sef.jsont | 2 +- apps/textsearch/html-to-html.xslt | 13 ++++++++++++- apps/textsearch/package-lock.json | 12 +++++++++++- apps/textsearch/package.json | 6 ++++-- apps/textsearch/textsearch.js | 21 ++++++++++++++++++++- 6 files changed, 50 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 719a7c041..a74ee68c1 100644 --- a/.gitignore +++ b/.gitignore @@ -29,4 +29,5 @@ tei_all.rng /apps/checktexts/edition .idea/ /apps/entities/xslt/tei-to-json.sef.json -/apps/textsearch/clone \ No newline at end of file +/apps/textsearch/clone +/apps/textsearch/to-be-indexed diff --git a/apps/textsearch/html-to-html.sef.jsont b/apps/textsearch/html-to-html.sef.jsont index b5eb83d43..b073296a0 100644 --- a/apps/textsearch/html-to-html.sef.jsont +++ b/apps/textsearch/html-to-html.sef.jsont @@ -1 +1 @@ -{"N":"package","version":"20","packageVersion":"1","saxonVersion":"SaxonJS 2.6","target":"JS","targetVersion":"2","name":"TOP-LEVEL","relocatable":"false","buildDateTime":"2024-08-19T01:17:40.5+01:00","ns":"xml=~ =http://www.tei-c.org/ns/1.0 xsl=~ tei=http://www.tei-c.org/ns/1.0","C":[{"N":"co","id":"0","uniform":"true","binds":"1","C":[{"N":"template","flags":"os","module":"html-to-html.xslt","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","name":"Q{}copy-element","line":"11","expand-text":"false","sType":"1 ","C":[{"N":"copy","sType":"1 ","flags":"cin","role":"body","line":"12","C":[{"N":"applyT","sType":"* ","line":"13","mode":"#unnamed","bSlot":"0","C":[{"N":"docOrder","sType":"*N u[NT,NP,NC,NE]","role":"select","line":"13","C":[{"N":"union","op":"|","sType":"*N u[NT,NP,NC,NE]","ns":"= xml=~ fn=~ xsl=~ tei=http://www.tei-c.org/ns/1.0 ","C":[{"N":"axis","name":"attribute","nodeTest":"*NA"},{"N":"axis","name":"child","nodeTest":"*N 
u[NT,NP,NC,NE]"}]}]}]}]}]}]},{"N":"co","id":"1","binds":"0","C":[{"N":"mode","onNo":"TC","flags":"","patternSlots":"0","prec":"","C":[{"N":"templateRule","rank":"0","prec":"0","seq":"0","ns":"xml=~ =http://www.tei-c.org/ns/1.0 xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"7","module":"html-to-html.xslt","expand-text":"false","match":"@*|node()","prio":"-0.5","matches":"NA","C":[{"N":"p.nodeTest","role":"match","test":"NA","sType":"1NA"},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","role":"action","line":"8"}]},{"N":"templateRule","rank":"1","prec":"0","seq":"0","ns":"xml=~ =http://www.tei-c.org/ns/1.0 xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"7","module":"html-to-html.xslt","expand-text":"false","match":"@*|node()","prio":"-0.5","matches":"N u[NT,NP,NC,NE]","C":[{"N":"p.nodeTest","role":"match","test":"N u[NT,NP,NC,NE]","sType":"1N u[NT,NP,NC,NE]"},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","role":"action","line":"8"}]}]}]},{"N":"overridden"},{"N":"output","C":[{"N":"property","name":"Q{http://saxon.sf.net/}stylesheet-version","value":"20"}]},{"N":"decimalFormat"}],"Σ":"c894e905"} \ No newline at end of file +{"N":"package","version":"20","packageVersion":"1","saxonVersion":"SaxonJS 2.6","target":"JS","targetVersion":"2","name":"TOP-LEVEL","relocatable":"false","buildDateTime":"2024-08-20T23:36:37.093+01:00","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ 
tei=http://www.tei-c.org/ns/1.0","C":[{"N":"co","id":"0","uniform":"true","binds":"1","C":[{"N":"template","flags":"os","module":"html-to-html.xslt","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","name":"Q{}copy-element","line":"12","expand-text":"false","sType":"1 ","C":[{"N":"copy","sType":"1 ","flags":"cin","role":"body","line":"13","C":[{"N":"applyT","sType":"* ","line":"14","mode":"#unnamed","bSlot":"0","C":[{"N":"docOrder","sType":"*N u[NT,NP,NC,NE]","role":"select","line":"14","C":[{"N":"union","op":"|","sType":"*N u[NT,NP,NC,NE]","ns":"= xml=~ fn=~ html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0 ","C":[{"N":"axis","name":"attribute","nodeTest":"*NA"},{"N":"axis","name":"child","nodeTest":"*N u[NT,NP,NC,NE]"}]}]}]}]}]}]},{"N":"co","id":"1","binds":"0","C":[{"N":"mode","onNo":"TC","flags":"","patternSlots":"0","prec":"","C":[{"N":"templateRule","rank":"0","prec":"0","seq":"1","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"18","module":"html-to-html.xslt","expand-text":"false","match":"/","prio":"-0.5","matches":"ND","C":[{"N":"p.nodeTest","role":"match","test":"ND","sType":"1ND","ns":"= xml=~ fn=~ html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0 "},{"N":"elem","name":"html","sType":"1NE nQ{http://www.w3.org/1999/xhtml}html ","nsuri":"http://www.w3.org/1999/xhtml","namespaces":"=http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml tei=http://www.tei-c.org/ns/1.0","role":"action","line":"19","C":[{"N":"sequence","sType":"*NE ","C":[{"N":"elem","name":"head","sType":"1NE nQ{http://www.w3.org/1999/xhtml}head ","nsuri":"http://www.w3.org/1999/xhtml","namespaces":"=http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml 
tei=http://www.tei-c.org/ns/1.0","line":"20","C":[{"N":"empty","sType":"0 "}]},{"N":"elem","name":"body","sType":"1NE nQ{http://www.w3.org/1999/xhtml}body ","nsuri":"http://www.w3.org/1999/xhtml","namespaces":"=http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml tei=http://www.tei-c.org/ns/1.0","line":"22","C":[{"N":"sequence","sType":"* ","C":[{"N":"att","name":"data-pagefind-body","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":""}]},{"N":"att","name":"data-pagefind-meta","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":"url[data-viewer-url],title[data-title]"}]},{"N":"att","name":"data-title","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":""}]},{"N":"att","name":"data-viewer-url","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":""}]},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","line":"23"}]}]}]}]}]},{"N":"templateRule","rank":"1","prec":"0","seq":"0","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"8","module":"html-to-html.xslt","expand-text":"false","match":"@*|node()","prio":"-0.5","matches":"NA","C":[{"N":"p.nodeTest","role":"match","test":"NA","sType":"1NA"},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","role":"action","line":"9"}]},{"N":"templateRule","rank":"2","prec":"0","seq":"0","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"8","module":"html-to-html.xslt","expand-text":"false","match":"@*|node()","prio":"-0.5","matches":"N u[NT,NP,NC,NE]","C":[{"N":"p.nodeTest","role":"match","test":"N u[NT,NP,NC,NE]","sType":"1N 
u[NT,NP,NC,NE]"},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","role":"action","line":"9"}]}]}]},{"N":"overridden"},{"N":"output","C":[{"N":"property","name":"Q{http://saxon.sf.net/}stylesheet-version","value":"20"}]},{"N":"decimalFormat"}],"Σ":"4a5e76cb"} \ No newline at end of file diff --git a/apps/textsearch/html-to-html.xslt b/apps/textsearch/html-to-html.xslt index 4cac3e022..6ff84f252 100644 --- a/apps/textsearch/html-to-html.xslt +++ b/apps/textsearch/html-to-html.xslt @@ -1,5 +1,6 @@ @@ -14,4 +15,14 @@ + + + + + + + + + + diff --git a/apps/textsearch/package-lock.json b/apps/textsearch/package-lock.json index 73d31b57d..7dc3730dd 100644 --- a/apps/textsearch/package-lock.json +++ b/apps/textsearch/package-lock.json @@ -10,7 +10,8 @@ "license": "ISC", "dependencies": { "pagefind": "^1.1.0", - "saxon-js": "^2.6.0" + "saxon-js": "^2.6.0", + "xmldom": "^0.6.0" } }, "node_modules/@pagefind/darwin-arm64": { @@ -201,6 +202,15 @@ "dependencies": { "axios": "^1.5.1" } + }, + "node_modules/xmldom": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/xmldom/-/xmldom-0.6.0.tgz", + "integrity": "sha512-iAcin401y58LckRZ0TkI4k0VSM1Qg0KGSc3i8rU+xrxe19A/BN1zHyVSJY7uoutVlaTSzYyk/v5AmkewAP7jtg==", + "license": "MIT", + "engines": { + "node": ">=10.0.0" + } } } } diff --git a/apps/textsearch/package.json b/apps/textsearch/package.json index 7186caccb..ee2b2e7f2 100644 --- a/apps/textsearch/package.json +++ b/apps/textsearch/package.json @@ -4,13 +4,15 @@ "main": "textsearch.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1", - "clone": "rm -rf clone && git clone https://github.com/kingsdigitallab/alice-thornton.git -b dts --single-branch clone" + "clone": "rm -rf clone && git clone https://github.com/kingsdigitallab/alice-thornton.git -b dts --single-branch clone", + "index": "npx pagefind --site to-be-indexed --output-path ../../frontend/_site/pagefind" }, "author": "", "license": "ISC", "description": "", "dependencies": { 
"pagefind": "^1.1.0", - "saxon-js": "^2.6.0" + "saxon-js": "^2.6.0", + "xmldom": "^0.6.0" } } diff --git a/apps/textsearch/textsearch.js b/apps/textsearch/textsearch.js index c6d6fc4e5..b185ab15f 100644 --- a/apps/textsearch/textsearch.js +++ b/apps/textsearch/textsearch.js @@ -40,7 +40,17 @@ class TextSearch { async transformHTML(path) { console.log(path); - let entitiesJson = await this.xslt(path, XSLTPath); + let htmlString = await this.xslt(path, XSLTPath); + + if (1) { + htmlString = htmlString.replace('data-title=""', 'data-title="Book 2, page 2"') + htmlString = htmlString.replace('data-viewer-url=""', 'data-viewer-url="/edition/?p0.lo=p.2&p0.vi=modern&p0.do=book_one"') + } + + + let targetPath = path.replace('clone/dts/documents', 'to-be-indexed') + fs.mkdirSync(pathp.dirname(targetPath), { recursive: true }) + fs.writeFileSync(targetPath, htmlString, 'utf8') } async writeJsonFromTei() { @@ -124,13 +134,20 @@ class TextSearch { let jsonSheetPath = this.writeTransformJson(XSLTPath); + // convert html to xhtml so saxonjs is happy docString = this.readFile(docPath); + const { DOMParser, XMLSerializer } = require('xmldom'); + const parser = new DOMParser(); + let node = parser.parseFromString(docString, 'text/html'); + const serializer = new XMLSerializer(); + docString = serializer.serializeToString(node); let output = SaxonJS.transform( { stylesheetFileName: jsonSheetPath, sourceText: docString, // sourceFileName: docPath, + // sourceNode: node, destination: "serialized", }, "sync" @@ -142,6 +159,8 @@ class TextSearch { let firstLine = ''; ret = ret.replace(firstLine, ""); + console.log(ret) + return ret; } From b5031fa68ec681467b187447f14147c04f37c4df Mon Sep 17 00:00:00 2001 From: geoffroy-noel-ddh Date: Fri, 23 Aug 2024 14:48:46 +0100 Subject: [PATCH 4/9] feat(search): first working version of the pagefind indexing & pre-processing script with some support for semidip & modern and correct metadata recording. 
--- apps/textsearch/html-to-html-modern.xslt | 43 +++++++++++++ ...to-html.xslt => html-to-html-semidip.xslt} | 9 ++- apps/textsearch/html-to-html.sef.jsont | 2 +- apps/textsearch/package.json | 7 ++- apps/textsearch/textsearch.js | 63 ++++++++++++++++--- frontend/entities/textsearch.md | 3 +- 6 files changed, 112 insertions(+), 15 deletions(-) create mode 100644 apps/textsearch/html-to-html-modern.xslt rename apps/textsearch/{html-to-html.xslt => html-to-html-semidip.xslt} (68%) diff --git a/apps/textsearch/html-to-html-modern.xslt b/apps/textsearch/html-to-html-modern.xslt new file mode 100644 index 000000000..6840b0882 --- /dev/null +++ b/apps/textsearch/html-to-html-modern.xslt @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apps/textsearch/html-to-html.xslt b/apps/textsearch/html-to-html-semidip.xslt similarity index 68% rename from apps/textsearch/html-to-html.xslt rename to apps/textsearch/html-to-html-semidip.xslt index 6ff84f252..0e24e6366 100644 --- a/apps/textsearch/html-to-html.xslt +++ b/apps/textsearch/html-to-html-semidip.xslt @@ -19,7 +19,14 @@ - + diff --git a/apps/textsearch/html-to-html.sef.jsont b/apps/textsearch/html-to-html.sef.jsont index b073296a0..56ca89906 100644 --- a/apps/textsearch/html-to-html.sef.jsont +++ b/apps/textsearch/html-to-html.sef.jsont @@ -1 +1 @@ -{"N":"package","version":"20","packageVersion":"1","saxonVersion":"SaxonJS 2.6","target":"JS","targetVersion":"2","name":"TOP-LEVEL","relocatable":"false","buildDateTime":"2024-08-20T23:36:37.093+01:00","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","C":[{"N":"co","id":"0","uniform":"true","binds":"1","C":[{"N":"template","flags":"os","module":"html-to-html.xslt","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","name":"Q{}copy-element","line":"12","expand-text":"false","sType":"1 ","C":[{"N":"copy","sType":"1 
","flags":"cin","role":"body","line":"13","C":[{"N":"applyT","sType":"* ","line":"14","mode":"#unnamed","bSlot":"0","C":[{"N":"docOrder","sType":"*N u[NT,NP,NC,NE]","role":"select","line":"14","C":[{"N":"union","op":"|","sType":"*N u[NT,NP,NC,NE]","ns":"= xml=~ fn=~ html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0 ","C":[{"N":"axis","name":"attribute","nodeTest":"*NA"},{"N":"axis","name":"child","nodeTest":"*N u[NT,NP,NC,NE]"}]}]}]}]}]}]},{"N":"co","id":"1","binds":"0","C":[{"N":"mode","onNo":"TC","flags":"","patternSlots":"0","prec":"","C":[{"N":"templateRule","rank":"0","prec":"0","seq":"1","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"18","module":"html-to-html.xslt","expand-text":"false","match":"/","prio":"-0.5","matches":"ND","C":[{"N":"p.nodeTest","role":"match","test":"ND","sType":"1ND","ns":"= xml=~ fn=~ html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0 "},{"N":"elem","name":"html","sType":"1NE nQ{http://www.w3.org/1999/xhtml}html ","nsuri":"http://www.w3.org/1999/xhtml","namespaces":"=http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml tei=http://www.tei-c.org/ns/1.0","role":"action","line":"19","C":[{"N":"sequence","sType":"*NE ","C":[{"N":"elem","name":"head","sType":"1NE nQ{http://www.w3.org/1999/xhtml}head ","nsuri":"http://www.w3.org/1999/xhtml","namespaces":"=http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml tei=http://www.tei-c.org/ns/1.0","line":"20","C":[{"N":"empty","sType":"0 "}]},{"N":"elem","name":"body","sType":"1NE nQ{http://www.w3.org/1999/xhtml}body ","nsuri":"http://www.w3.org/1999/xhtml","namespaces":"=http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml tei=http://www.tei-c.org/ns/1.0","line":"22","C":[{"N":"sequence","sType":"* 
","C":[{"N":"att","name":"data-pagefind-body","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":""}]},{"N":"att","name":"data-pagefind-meta","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":"url[data-viewer-url],title[data-title]"}]},{"N":"att","name":"data-title","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":""}]},{"N":"att","name":"data-viewer-url","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":""}]},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","line":"23"}]}]}]}]}]},{"N":"templateRule","rank":"1","prec":"0","seq":"0","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"8","module":"html-to-html.xslt","expand-text":"false","match":"@*|node()","prio":"-0.5","matches":"NA","C":[{"N":"p.nodeTest","role":"match","test":"NA","sType":"1NA"},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","role":"action","line":"9"}]},{"N":"templateRule","rank":"2","prec":"0","seq":"0","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"8","module":"html-to-html.xslt","expand-text":"false","match":"@*|node()","prio":"-0.5","matches":"N u[NT,NP,NC,NE]","C":[{"N":"p.nodeTest","role":"match","test":"N u[NT,NP,NC,NE]","sType":"1N u[NT,NP,NC,NE]"},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","role":"action","line":"9"}]}]}]},{"N":"overridden"},{"N":"output","C":[{"N":"property","name":"Q{http://saxon.sf.net/}stylesheet-version","value":"20"}]},{"N":"decimalFormat"}],"Σ":"4a5e76cb"} \ No newline at end of file +{"N":"package","version":"20","packageVersion":"1","saxonVersion":"SaxonJS 
2.6","target":"JS","targetVersion":"2","name":"TOP-LEVEL","relocatable":"false","buildDateTime":"2024-08-23T01:24:27.625+01:00","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","C":[{"N":"co","id":"0","uniform":"true","binds":"1","C":[{"N":"template","flags":"os","module":"html-to-html.xslt","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","name":"Q{}copy-element","line":"12","expand-text":"false","sType":"1 ","C":[{"N":"copy","sType":"1 ","flags":"cin","role":"body","line":"13","C":[{"N":"applyT","sType":"* ","line":"14","mode":"#unnamed","bSlot":"0","C":[{"N":"docOrder","sType":"*N u[NT,NP,NC,NE]","role":"select","line":"14","C":[{"N":"union","op":"|","sType":"*N u[NT,NP,NC,NE]","ns":"= xml=~ fn=~ html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0 ","C":[{"N":"axis","name":"attribute","nodeTest":"*NA"},{"N":"axis","name":"child","nodeTest":"*N u[NT,NP,NC,NE]"}]}]}]}]}]}]},{"N":"co","id":"1","binds":"0","C":[{"N":"mode","onNo":"TC","flags":"","patternSlots":"0","prec":"","C":[{"N":"templateRule","rank":"0","prec":"0","seq":"1","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"18","module":"html-to-html.xslt","expand-text":"false","match":"/","prio":"-0.5","matches":"ND","C":[{"N":"p.nodeTest","role":"match","test":"ND","sType":"1ND","ns":"= xml=~ fn=~ html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0 "},{"N":"elem","name":"html","sType":"1NE nQ{http://www.w3.org/1999/xhtml}html ","nsuri":"http://www.w3.org/1999/xhtml","namespaces":"=http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml tei=http://www.tei-c.org/ns/1.0","role":"action","line":"19","C":[{"N":"sequence","sType":"*NE 
","C":[{"N":"elem","name":"head","sType":"1NE nQ{http://www.w3.org/1999/xhtml}head ","nsuri":"http://www.w3.org/1999/xhtml","namespaces":"=http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml tei=http://www.tei-c.org/ns/1.0","line":"20","C":[{"N":"empty","sType":"0 "}]},{"N":"elem","name":"body","sType":"1NE nQ{http://www.w3.org/1999/xhtml}body ","nsuri":"http://www.w3.org/1999/xhtml","namespaces":"=http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml tei=http://www.tei-c.org/ns/1.0","line":"29","C":[{"N":"sequence","sType":"* ","C":[{"N":"att","name":"data-pagefind-body","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":""}]},{"N":"att","name":"data-pagefind-meta","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":"url[data-url],title[data-title]"}]},{"N":"att","name":"data-title","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":"#title#"}]},{"N":"att","name":"data-url","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":"#url#"}]},{"N":"att","name":"data-pagefind-filter","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":"book[data-book],version[data-version]"}]},{"N":"att","name":"data-book","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":"#book#"}]},{"N":"att","name":"data-version","nsuri":"","sType":"1NA ","C":[{"N":"str","sType":"1AS ","val":"#version#"}]},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","line":"30"}]}]}]}]}]},{"N":"templateRule","rank":"1","prec":"0","seq":"0","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"8","module":"html-to-html.xslt","expand-text":"false","match":"@*|node()","prio":"-0.5","matches":"NA","C":[{"N":"p.nodeTest","role":"match","test":"NA","sType":"1NA"},{"N":"callT","bSlot":"0","sType":"* 
","name":"Q{}copy-element","role":"action","line":"9"}]},{"N":"templateRule","rank":"2","prec":"0","seq":"0","ns":"xml=~ =http://www.w3.org/1999/xhtml html=http://www.w3.org/1999/xhtml xsl=~ tei=http://www.tei-c.org/ns/1.0","minImp":"0","flags":"s","slots":"200","baseUri":"file:///home/jeff/src/prj/alice-thornton/apps/textsearch/html-to-html.xslt","line":"8","module":"html-to-html.xslt","expand-text":"false","match":"@*|node()","prio":"-0.5","matches":"N u[NT,NP,NC,NE]","C":[{"N":"p.nodeTest","role":"match","test":"N u[NT,NP,NC,NE]","sType":"1N u[NT,NP,NC,NE]"},{"N":"callT","bSlot":"0","sType":"* ","name":"Q{}copy-element","role":"action","line":"9"}]}]}]},{"N":"overridden"},{"N":"output","C":[{"N":"property","name":"Q{http://saxon.sf.net/}stylesheet-version","value":"20"}]},{"N":"decimalFormat"}],"Σ":"6f3b471f"} \ No newline at end of file diff --git a/apps/textsearch/package.json b/apps/textsearch/package.json index ee2b2e7f2..801a36b23 100644 --- a/apps/textsearch/package.json +++ b/apps/textsearch/package.json @@ -4,8 +4,11 @@ "main": "textsearch.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1", - "clone": "rm -rf clone && git clone https://github.com/kingsdigitallab/alice-thornton.git -b dts --single-branch clone", - "index": "npx pagefind --site to-be-indexed --output-path ../../frontend/_site/pagefind" + "extract": "rm -rf clone && git clone https://github.com/kingsdigitallab/alice-thornton.git -b dts --single-branch clone", + "transform": "node textsearch.js", + "load": "npx pagefind --site to-be-indexed --output-path ../../frontend/_site/pagefind", + "index": "npm run extract && npm run transform && npm run load", + "tl": "npm run transform && npm run load" }, "author": "", "license": "ISC", diff --git a/apps/textsearch/textsearch.js b/apps/textsearch/textsearch.js index b185ab15f..889354d9b 100644 --- a/apps/textsearch/textsearch.js +++ b/apps/textsearch/textsearch.js @@ -13,7 +13,19 @@ const sources = ["people.xml", "places.xml", 
"events.xml"]; // const sources = ["events.xml"]; const target = "../../frontend/assets/js/entities.json"; // const jsonSheetPath = "html-to-html.sef.json"; -const XSLTPath = "html-to-html.xslt"; +const XSLTPath = { + 'modern': "html-to-html-modern.xslt", + // 'semidip': "html-to-html-semidip.xslt" +}; +const LABEL_FROM_KEY = { + 'book_of_remembrances': 'Book of Remembrances', + 'book_one': 'Book 1', + 'book_two': 'Book 2', + 'book_three': 'Book 3', + 'modern': 'Modernised', + 'semidip': 'Semi-diplomatic', +} +const TO_BE_INDEXED_PATH = 'to-be-indexed' class TextSearch { constructor() { @@ -22,6 +34,12 @@ class TextSearch { async transformHTMLs() { var paths = fs.readdirSync(sourceBase); + + fs.rmSync(TO_BE_INDEXED_PATH, { recursive: true }) + + let limit = 2; + let processed = 0; + while (paths.length) { let path = paths.pop(); let pathAbs = pathp.join(sourceBase, path); @@ -32,7 +50,8 @@ class TextSearch { } else { if (path.endsWith(".html")) { await this.transformHTML(pathAbs); - break; + processed++; + if (limit && processed >= limit) break; } } } @@ -40,19 +59,45 @@ class TextSearch { async transformHTML(path) { console.log(path); - let htmlString = await this.xslt(path, XSLTPath); - - if (1) { - htmlString = htmlString.replace('data-title=""', 'data-title="Book 2, page 2"') - htmlString = htmlString.replace('data-viewer-url=""', 'data-viewer-url="/edition/?p0.lo=p.2&p0.vi=modern&p0.do=book_one"') + for (let version of Object.keys(XSLTPath)) { + this.transformHTMLVersion(path, version) } + } + + async transformHTMLVersion(path, version='modern') { + let htmlString = await this.xslt(path, XSLTPath[version]); + let metadata = this.getMetadataFromPath(path, version); - let targetPath = path.replace('clone/dts/documents', 'to-be-indexed') + for (let k of Object.keys(metadata)) { + htmlString = htmlString.replace(`#${k}#`, metadata[k]) + } + + let targetPath = path.replace('clone/dts/documents', TO_BE_INDEXED_PATH) + targetPath = targetPath.replace('.html', '-' + 
version + '.html') fs.mkdirSync(pathp.dirname(targetPath), { recursive: true }) fs.writeFileSync(targetPath, htmlString, 'utf8') } + getMetadataFromPath(path, version='modern') { + // clone/dts/documents/book_two/p.99.html + let ret = null; + + const regex = /\/dts\/documents\/(?\w+)\/p\.(?\d+)\.html$/; + const match = regex.exec(path); + if (match) { + ret = { + book: LABEL_FROM_KEY[match.groups.bookKey], + page: match.groups.page, + version: LABEL_FROM_KEY[version], + url: `/edition/?p0.do=${match.groups.bookKey}&p0.lo=p.${match.groups.page}&p0.vi=${version}`, + title: `${LABEL_FROM_KEY[match.groups.bookKey]}, page ${match.groups.page}`, + } + } + + return ret + } + async writeJsonFromTei() { this.entities = []; @@ -159,7 +204,7 @@ class TextSearch { let firstLine = ''; ret = ret.replace(firstLine, ""); - console.log(ret) + console.log(ret.substring(0, 300)) return ret; } diff --git a/frontend/entities/textsearch.md b/frontend/entities/textsearch.md index fca332689..c64e70a7c 100644 --- a/frontend/entities/textsearch.md +++ b/frontend/entities/textsearch.md @@ -3,13 +3,12 @@ title: Text Search --- {% raw %} - {% endraw %} From f0bc7cf52fc671103e1fe3a4f373f1dc8c738311 Mon Sep 17 00:00:00 2001 From: geoffroy-noel-ddh Date: Fri, 23 Aug 2024 14:56:05 +0100 Subject: [PATCH 5/9] feat(search): added pagefind text indexing to the regular autopull bash script. 
--- _build/autopull.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/_build/autopull.sh b/_build/autopull.sh index 66fc2bf6f..a3b747a6e 100755 --- a/_build/autopull.sh +++ b/_build/autopull.sh @@ -49,6 +49,8 @@ if [[ $h1 != $h2 ]]; then # fi fi +su - $GITUSER -c "export SITE_ENV=$SITE_ENV; cd $PROJECT_DIR/apps/textsearch/ && npm run index" + # update tweets # su - $GITUSER -c "cd $PROJECT_DIR && npm run tweets -w frontend" From 20256f95ad3318d5d4b3d513c95235e23e3b415f Mon Sep 17 00:00:00 2001 From: geoffroy-noel-ddh Date: Fri, 23 Aug 2024 15:17:22 +0100 Subject: [PATCH 6/9] feat(search): textsearch index all pages & books on dev, stg & site. On lcl it depends on LIMIT. --- apps/textsearch/textsearch.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/textsearch/textsearch.js b/apps/textsearch/textsearch.js index 889354d9b..3c1f1404b 100644 --- a/apps/textsearch/textsearch.js +++ b/apps/textsearch/textsearch.js @@ -6,6 +6,7 @@ const SaxonJS = require("saxon-js"); const pathp = require("path"); const fs = require("fs"); +const SITE_ENV = process.env.SITE_ENV || 'lcl' const sourceBase = "./clone/dts/documents/"; // const sourceBase = // "https://raw.githubusercontent.com/kingsdigitallab/alice-thornton/edition/entities/"; @@ -26,6 +27,7 @@ const LABEL_FROM_KEY = { 'semidip': 'Semi-diplomatic', } const TO_BE_INDEXED_PATH = 'to-be-indexed' +const LIMIT = 2 class TextSearch { constructor() { @@ -37,7 +39,7 @@ class TextSearch { fs.rmSync(TO_BE_INDEXED_PATH, { recursive: true }) - let limit = 2; + let limit = SITE_ENV == 'lcl' ? LIMIT : 0; let processed = 0; while (paths.length) { From 370e0f94c5bdd2ab00bdddf868954211b0acb5c7 Mon Sep 17 00:00:00 2001 From: geoffroy-noel-ddh Date: Fri, 23 Aug 2024 17:55:01 +0100 Subject: [PATCH 7/9] feat(search): text search now sorted by book & page number; removed image placeholder; 15 items per result page. 
--- apps/textsearch/html-to-html-modern.xslt | 6 +- apps/textsearch/textsearch.js | 107 +++-------------------- frontend/entities/textsearch.md | 9 +- 3 files changed, 26 insertions(+), 96 deletions(-) diff --git a/apps/textsearch/html-to-html-modern.xslt b/apps/textsearch/html-to-html-modern.xslt index 6840b0882..9ad81f6bb 100644 --- a/apps/textsearch/html-to-html-modern.xslt +++ b/apps/textsearch/html-to-html-modern.xslt @@ -21,11 +21,13 @@ diff --git a/apps/textsearch/textsearch.js b/apps/textsearch/textsearch.js index 3c1f1404b..de0dc5277 100644 --- a/apps/textsearch/textsearch.js +++ b/apps/textsearch/textsearch.js @@ -27,7 +27,7 @@ const LABEL_FROM_KEY = { 'semidip': 'Semi-diplomatic', } const TO_BE_INDEXED_PATH = 'to-be-indexed' -const LIMIT = 2 +const LIMIT = 100 class TextSearch { constructor() { @@ -82,96 +82,32 @@ class TextSearch { } getMetadataFromPath(path, version='modern') { - // clone/dts/documents/book_two/p.99.html let ret = null; + // clone/dts/documents/book_one/p.12.html const regex = /\/dts\/documents\/(?\w+)\/p\.(?\d+)\.html$/; const match = regex.exec(path); + if (match) { + // 12 + const pageNumber = match.groups.page + // 0012 + const pageNumberPadded = pageNumber.padStart(4, "0"); ret = { book: LABEL_FROM_KEY[match.groups.bookKey], - page: match.groups.page, + page: pageNumber, version: LABEL_FROM_KEY[version], - url: `/edition/?p0.do=${match.groups.bookKey}&p0.lo=p.${match.groups.page}&p0.vi=${version}`, - title: `${LABEL_FROM_KEY[match.groups.bookKey]}, page ${match.groups.page}`, + url: `/edition/?p0.do=${match.groups.bookKey}&p0.lo=p.${pageNumber}&p0.vi=${version}`, + // Book 1, page 12 + title: `${LABEL_FROM_KEY[match.groups.bookKey]}, page ${pageNumber}`, + // 1-0012 for Book 1, page 12 + bookPage: `${Object.keys(LABEL_FROM_KEY).indexOf(match.groups.bookKey)}-${pageNumberPadded}`, } } return ret } - async writeJsonFromTei() { - this.entities = []; - - for (let source of sources) { - await this.loadTei(sourceBase + source); - } - - 
this.postProcessEntities(); - - this.writeJson(target, this.entities); - } - - postProcessEntities() { - // processing which is much simpler in JS than XSLT - for (let entity of this.entities) { - // remove duplicate pages in entity.pages - entity.pages = Object.fromEntries( - Object.entries(entity.pages).map(([k, v]) => [k, [...new Set(v)]]) - ); - // remove books from entity.pages which have no pages - entity.pages = Object.fromEntries( - Object.entries(entity.pages).filter(([k, v]) => v.length) - ); - // entity.books = list of books they appear in - entity.books = Object.keys(entity.pages); - // missing key for people with no first/surname - if (!entity?.sortkey && entity.type == "person") { - // 'John Thornton (1633-1669)' => "Thornton-John" - entity.sortkey = entity.title - .replace(/\([^)]+\)/g, "") - .trim() - .split(/\s+/) - .reverse() - .join("-"); - console.log( - `WARNING: fixed missing sorkey for ${entity.type}:${entity.id} = ${entity.sortkey}` - ); - } - if (!entity?.search) { - entity.search = entity.title; - } - // remove text between [] - entity.search = entity.search.replace(/\[.*?\]/g, ""); - } - - // sort by sortKey, optional, only for debugging purpose as itemjs will sort anyway - this.entities = this.entities.sort((a, b) => - a.sortkey.localeCompare(b.sortkey) - ); - } - - async loadTei(source) { - // let docString = this.readFile(source) - let entitiesJson = await this.xslt(source, jsonSheetPath); - // console.log(entitiesJson.substring(0, 1000)); - // fs.writeFileSync('tmp.json', entitiesJson, "utf8"); - - let entities = []; - - if (entitiesJson) { - entities = JSON.parse(entitiesJson); - } else { - console.log( - `WARNING: entities file (${source}) transformed into an empty string.` - ); - } - - for (let i in entities) { - this.entities.push(entities[i]); - } - } - readFile(source) { return fs.readFileSync(source).toString(); } @@ -206,7 +142,7 @@ class TextSearch { let firstLine = ''; ret = ret.replace(firstLine, ""); - 
console.log(ret.substring(0, 300)) + // console.log(ret.substring(0, 300)) return ret; } @@ -235,21 +171,6 @@ class TextSearch { return ret; } - writeJson(path, data) { - // envelope: add metadata; format inspired by JSON:API - data = { - meta: { - dateCreated: new Date().toISOString(), - }, - data: data, - }; - // console.log(data) - let dataStr = JSON.stringify(data, null, 2); - fs.writeFileSync(path, dataStr, "utf8"); - console.log( - `WRITE ${path} (${(dataStr.length / 1024 / 1024).toFixed(2)} MB)` - ); - } } new TextSearch().transformHTMLs(); diff --git a/frontend/entities/textsearch.md b/frontend/entities/textsearch.md index c64e70a7c..ea7d84ecf 100644 --- a/frontend/entities/textsearch.md +++ b/frontend/entities/textsearch.md @@ -8,7 +8,14 @@ title: Text Search {% endraw %} From 54b817657db6a7911ee7cc7ba8b81a6ac303f0b3 Mon Sep 17 00:00:00 2001 From: geoffroy-noel-ddh Date: Fri, 23 Aug 2024 18:11:19 +0100 Subject: [PATCH 8/9] feat(search): added a visible tag in the results for the version. --- apps/textsearch/html-to-html-modern.xslt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/textsearch/html-to-html-modern.xslt b/apps/textsearch/html-to-html-modern.xslt index 9ad81f6bb..6798e4a1c 100644 --- a/apps/textsearch/html-to-html-modern.xslt +++ b/apps/textsearch/html-to-html-modern.xslt @@ -20,7 +20,7 @@ Date: Mon, 2 Sep 2024 18:38:33 +0100 Subject: [PATCH 9/9] fix(search): finished cleaning up the modernised version of the search index.. 
--- apps/textsearch/html-to-html-modern.xslt | 35 ++++++++++++++++++++++-- apps/textsearch/textsearch.js | 24 ++++++++++------ frontend/assets/css/text-viewer2.scss | 3 ++ 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/apps/textsearch/html-to-html-modern.xslt b/apps/textsearch/html-to-html-modern.xslt index 6798e4a1c..3b676a0f6 100644 --- a/apps/textsearch/html-to-html-modern.xslt +++ b/apps/textsearch/html-to-html-modern.xslt @@ -34,12 +34,43 @@ + + + + + + + + + + + + + + + - - + + + + + + + + + + + + + + + + + + diff --git a/apps/textsearch/textsearch.js b/apps/textsearch/textsearch.js index de0dc5277..8546e7fab 100644 --- a/apps/textsearch/textsearch.js +++ b/apps/textsearch/textsearch.js @@ -8,12 +8,7 @@ const fs = require("fs"); const SITE_ENV = process.env.SITE_ENV || 'lcl' const sourceBase = "./clone/dts/documents/"; -// const sourceBase = -// "https://raw.githubusercontent.com/kingsdigitallab/alice-thornton/edition/entities/"; -const sources = ["people.xml", "places.xml", "events.xml"]; -// const sources = ["events.xml"]; const target = "../../frontend/assets/js/entities.json"; -// const jsonSheetPath = "html-to-html.sef.json"; const XSLTPath = { 'modern': "html-to-html-modern.xslt", // 'semidip': "html-to-html-semidip.xslt" @@ -27,7 +22,7 @@ const LABEL_FROM_KEY = { 'semidip': 'Semi-diplomatic', } const TO_BE_INDEXED_PATH = 'to-be-indexed' -const LIMIT = 100 +const LIMIT = 0 class TextSearch { constructor() { @@ -37,7 +32,9 @@ class TextSearch { async transformHTMLs() { var paths = fs.readdirSync(sourceBase); - fs.rmSync(TO_BE_INDEXED_PATH, { recursive: true }) + if (fs.existsSync(TO_BE_INDEXED_PATH)) { + fs.rmSync(TO_BE_INDEXED_PATH, { recursive: true }) + } let limit = SITE_ENV == 'lcl' ? 
LIMIT : 0; let processed = 0; @@ -69,12 +66,23 @@ class TextSearch { async transformHTMLVersion(path, version='modern') { let htmlString = await this.xslt(path, XSLTPath[version]); + // variables substitution let metadata = this.getMetadataFromPath(path, version); - for (let k of Object.keys(metadata)) { htmlString = htmlString.replace(`#${k}#`, metadata[k]) } + // con-form => conform + // Join parts of a word separated by a line break + htmlString = htmlString.replace(/<\/br>/g, ""); + // remove spaces around a line break in the middle of a word + htmlString = htmlString.replace(/\s*(]+data-tei-break="no"[^>]*>)\s*/g, "$1"); + // remove the hyphen + htmlString = htmlString.replace( + /-<\/span>(]+data-tei-break="no"[^>]*>)/g, + '' + ); + let targetPath = path.replace('clone/dts/documents', TO_BE_INDEXED_PATH) targetPath = targetPath.replace('.html', '-' + version + '.html') fs.mkdirSync(pathp.dirname(targetPath), { recursive: true }) diff --git a/frontend/assets/css/text-viewer2.scss b/frontend/assets/css/text-viewer2.scss index 7fe994a76..44ec4d34a 100644 --- a/frontend/assets/css/text-viewer2.scss +++ b/frontend/assets/css/text-viewer2.scss @@ -1088,6 +1088,7 @@ $render-sic: false; } } + // 2.284, 1.8 &.view-modern { .tei-fw[data-tei-place~="margin"]:not(.tei-type-head):not(.tei-type-header) { display: none; @@ -1103,6 +1104,7 @@ $render-sic: false; } } + // 2.36,81 .tei-fw.tei-type-pageNum { & + .tei-lb { display: none; @@ -1151,6 +1153,7 @@ $render-sic: false; } &.view-modern { + // 2.3 .tei-del { display: none; }