From 8b6e201d5f8c881fe08725eeed2864716a3d3f3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20ROUCOU?= Date: Wed, 24 Apr 2024 16:21:01 +0200 Subject: [PATCH] Fix sitemap URL location sitemap URL must have "/sitemap/" to be accessible in case of sitemap index file --- .../iq/dataverse/sitemap/SiteMapUtil.java | 15 ++-- src/main/webapp/WEB-INF/pretty-config.xml | 5 ++ .../iq/dataverse/sitemap/SiteMapUtilTest.java | 76 ++++++++++++++++--- 3 files changed, 80 insertions(+), 16 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtil.java b/src/main/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtil.java index 3077c41fa14..8408e7d91f2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtil.java @@ -25,13 +25,14 @@ public class SiteMapUtil { - private static final Logger logger = Logger.getLogger(SiteMapUtil.class.getCanonicalName()); - private static DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd"); - + static final String DATE_PATTERN = "yyyy-MM-dd"; static final String SITEMAP_FILENAME_STAGED = "sitemap.xml.staged"; /** @see https://www.sitemaps.org/protocol.html#index */ static final int SITEMAP_LIMIT = 50000; + private static final Logger logger = Logger.getLogger(SiteMapUtil.class.getCanonicalName()); + private static DateTimeFormatter formatter = DateTimeFormatter.ofPattern(DATE_PATTERN); + public static void updateSiteMap(List dataverses, List datasets) { @@ -56,11 +57,15 @@ public static void updateSiteMap(List dataverses, List datas directory.mkdir(); } - // Use DAY pattern (2024-01-24), local machine timezone + // Use DAY pattern (YYYY-MM-DD), local machine timezone final W3CDateFormat dateFormat = new W3CDateFormat(Pattern.DAY); WebSitemapGenerator wsg = null; try { - wsg = WebSitemapGenerator.builder(dataverseSiteUrl, directory).autoValidate(true).dateFormat(dateFormat) + // All sitemap files are in "sitemap" 
folder, see "getSitemapPathString" method. + // But with pretty-faces configuration, "sitemap.xml" and "sitemap_index.xml" are accessible directly, + // like "https://demo.dataverse.org/sitemap.xml". So "/sitemap/" needs to be added to "WebSitemapGenerator" + // in order to have a valid URL for the sitemap location. + wsg = WebSitemapGenerator.builder(dataverseSiteUrl + "/sitemap/", directory).autoValidate(true).dateFormat(dateFormat) .build(); } catch (MalformedURLException e) { logger.warning(String.format(msgErrorFormat, "Dataverse site URL", dataverseSiteUrl, e.getLocalizedMessage())); diff --git a/src/main/webapp/WEB-INF/pretty-config.xml b/src/main/webapp/WEB-INF/pretty-config.xml index ab5f37a1051..5f8f4877af8 100644 --- a/src/main/webapp/WEB-INF/pretty-config.xml +++ b/src/main/webapp/WEB-INF/pretty-config.xml @@ -27,4 +27,9 @@ + + + + + \ No newline at end of file diff --git a/src/test/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtilTest.java index 19f985cc984..f17cb825986 100644 --- a/src/test/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtilTest.java @@ -18,6 +18,7 @@ import java.sql.Timestamp; import java.text.ParseException; import java.text.SimpleDateFormat; +import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.Date; @@ -25,6 +26,7 @@ import static org.junit.jupiter.api.Assertions.*; +import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -127,14 +129,20 @@ void testUpdateSiteMap() throws IOException, ParseException, SAXException { void testHugeSiteMap() throws IOException, ParseException, SAXException { // given final int nbDataverse = 50; - final int nbDataset = 50000; + final int nbDataset = SiteMapUtil.SITEMAP_LIMIT; + final Timestamp now 
= new Timestamp(new Date().getTime()); + // Regex validate dataset URL + final String sitemapUrlRegex = ".*/dataset\\.xhtml\\?persistentId=doi:10\\.666/FAKE/published[0-9]{1,5}$"; + // Regex validate sitemap URL: must include "/sitemap/" to be accessible because there is no pretty-faces rewrite + final String sitemapIndexUrlRegex = ".*/sitemap/sitemap[1-2]\\.xml$"; + final String today = LocalDateTime.now().format(DateTimeFormatter.ofPattern(SiteMapUtil.DATE_PATTERN)); final List dataverses = new ArrayList<>(nbDataverse); for (int i = 1; i <= nbDataverse; i++) { final Dataverse publishedDataverse = new Dataverse(); publishedDataverse.setAlias(String.format("publishedDv%s", i)); - publishedDataverse.setModificationTime(new Timestamp(new Date().getTime())); - publishedDataverse.setPublicationDate(new Timestamp(new Date().getTime())); + publishedDataverse.setModificationTime(now); + publishedDataverse.setPublicationDate(now); dataverses.add(publishedDataverse); } @@ -142,8 +150,8 @@ void testHugeSiteMap() throws IOException, ParseException, SAXException { for (int i = 1; i <= nbDataset; i++) { final Dataset published = new Dataset(); published.setGlobalId(new GlobalId(AbstractDOIProvider.DOI_PROTOCOL, "10.666", String.format("FAKE/published%s", i), null, AbstractDOIProvider.DOI_RESOLVER_URL, null)); - published.setPublicationDate(new Timestamp(new Date().getTime())); - published.setModificationTime(new Timestamp(new Date().getTime())); + published.setPublicationDate(now); + published.setModificationTime(now); datasets.add(published); } @@ -153,24 +161,70 @@ void testHugeSiteMap() throws IOException, ParseException, SAXException { // then final Path siteMapDir = tempDocroot.resolve("sitemap"); final String pathToSiteMapIndexFile = siteMapDir.resolve("sitemap_index.xml").toString(); + final String pathToSiteMap1File = siteMapDir.resolve("sitemap1.xml").toString(); + final String pathToSiteMap2File = siteMapDir.resolve("sitemap2.xml").toString(); + + // validate 
sitemap_index.xml file with XSD assertDoesNotThrow(() -> XmlValidator.validateXmlWellFormed(pathToSiteMapIndexFile)); assertTrue(XmlValidator.validateXmlSchema(pathToSiteMapIndexFile, new URL(xsdSitemapIndex))); - final File sitemapFile = new File(pathToSiteMapIndexFile); + // verify sitemap_index.xml content + File sitemapFile = new File(pathToSiteMapIndexFile); String sitemapString = XmlPrinter.prettyPrintXml(new String(Files.readAllBytes(Paths.get(sitemapFile.getAbsolutePath())), StandardCharsets.UTF_8)); // System.out.println("sitemap: " + sitemapString); - assertTrue(sitemapString.contains("sitemap1.xml")); - assertTrue(sitemapString.contains("sitemap2.xml")); - assertTrue(sitemapString.contains("")); + String[] lines = sitemapString.split("\n"); + for (int i = 0; i < lines.length; i++) { + String line = lines[i].strip(); + if (StringUtils.isNotBlank(line)) { + if (i == 0) { + assertEquals("", line); + } else if (i == 1) { + assertEquals("", line); + } else if (i == 2) { + assertEquals("", line); + } else if (line.startsWith("")) { + final String errorWithSitemapIndexUrl = String.format("Sitemap URL must match with \"%s\" but was \"%s\"", sitemapIndexUrlRegex, line); + assertTrue(line.matches(sitemapIndexUrlRegex), errorWithSitemapIndexUrl); + } else if (line.startsWith("")) { + assertEquals(String.format("%s", today), line); + } + } + } - final String pathToSiteMap1File = siteMapDir.resolve("sitemap1.xml").toString(); + // validate sitemap1.xml file with XSD assertDoesNotThrow(() -> XmlValidator.validateXmlWellFormed(pathToSiteMap1File)); assertTrue(XmlValidator.validateXmlSchema(pathToSiteMap1File, new URL(xsdSitemap))); - final String pathToSiteMap2File = siteMapDir.resolve("sitemap2.xml").toString(); + // validate sitemap2.xml file with XSD assertDoesNotThrow(() -> XmlValidator.validateXmlWellFormed(pathToSiteMap2File)); assertTrue(XmlValidator.validateXmlSchema(pathToSiteMap2File, new URL(xsdSitemap))); + + // verify sitemap2.xml content + sitemapFile = 
new File(pathToSiteMap2File); + sitemapString = XmlPrinter.prettyPrintXml(new String(Files.readAllBytes(Paths.get(sitemapFile.getAbsolutePath())), StandardCharsets.UTF_8)); + + lines = sitemapString.split("\n"); + assertEquals("", lines[0].strip()); + assertEquals("", lines[1].strip()); + boolean isContainsLocTag = false; + boolean isContainsLastmodTag = false; + // loop over 10 lines only, just need to validate the and tags + for (int i = 5; i < 15; i++) { + String line = lines[i].strip(); + if (StringUtils.isNotBlank(line)) { + if (line.startsWith("")) { + isContainsLocTag = true; + final String errorWithSitemapIndexUrl = String.format("Sitemap URL must match with \"%s\" but was \"%s\"", sitemapUrlRegex, line); + assertTrue(line.matches(sitemapUrlRegex), errorWithSitemapIndexUrl); + } else if (line.startsWith("")) { + isContainsLastmodTag = true; + assertEquals(String.format("%s", today), line); + } + } + } + assertTrue(isContainsLocTag, "Sitemap file must contains tag"); + assertTrue(isContainsLastmodTag, "Sitemap file must contains tag"); } }