Skip to content

Commit

Permalink
Fix sitemap URL location
Browse files Browse the repository at this point in the history
Sitemap URLs must include "/sitemap/" to be accessible when a sitemap index file is generated
  • Loading branch information
jeromeroucou committed Apr 24, 2024
1 parent e6ff44e commit 8b6e201
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 16 deletions.
15 changes: 10 additions & 5 deletions src/main/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@

public class SiteMapUtil {

private static final Logger logger = Logger.getLogger(SiteMapUtil.class.getCanonicalName());
private static DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd");

static final String DATE_PATTERN = "yyyy-MM-dd";
static final String SITEMAP_FILENAME_STAGED = "sitemap.xml.staged";
/** @see https://www.sitemaps.org/protocol.html#index */
static final int SITEMAP_LIMIT = 50000;

private static final Logger logger = Logger.getLogger(SiteMapUtil.class.getCanonicalName());
private static DateTimeFormatter formatter = DateTimeFormatter.ofPattern(DATE_PATTERN);


public static void updateSiteMap(List<Dataverse> dataverses, List<Dataset> datasets) {

Expand All @@ -56,11 +57,15 @@ public static void updateSiteMap(List<Dataverse> dataverses, List<Dataset> datas
directory.mkdir();
}

// Use DAY pattern (2024-01-24), local machine timezone
// Use DAY pattern (YYYY-MM-DD), local machine timezone
final W3CDateFormat dateFormat = new W3CDateFormat(Pattern.DAY);
WebSitemapGenerator wsg = null;
try {
wsg = WebSitemapGenerator.builder(dataverseSiteUrl, directory).autoValidate(true).dateFormat(dateFormat)
// All sitemap files are in the "sitemap" folder, see the "getSitemapPathString" method.
// With the pretty-faces configuration, "sitemap.xml" and "sitemap_index.xml" are also accessible directly,
// e.g. "https://demo.dataverse.org/sitemap.xml". So "/sitemap/" needs to be appended to the base URL passed to
// "WebSitemapGenerator" in order to produce valid URLs for the sitemap locations.
wsg = WebSitemapGenerator.builder(dataverseSiteUrl + "/sitemap/", directory).autoValidate(true).dateFormat(dateFormat)
.build();
} catch (MalformedURLException e) {
logger.warning(String.format(msgErrorFormat, "Dataverse site URL", dataverseSiteUrl, e.getLocalizedMessage()));
Expand Down
5 changes: 5 additions & 0 deletions src/main/webapp/WEB-INF/pretty-config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,9 @@
<view-id value="/sitemap/sitemap.xml" />
</url-mapping>

<!-- Expose the sitemap index at the site root: "/sitemap_index.xml" is served from "/sitemap/sitemap_index.xml". -->
<url-mapping id="sitemapindex">
<pattern value="/sitemap_index.xml" />
<view-id value="/sitemap/sitemap_index.xml" />
</url-mapping>

</pretty-config>
76 changes: 65 additions & 11 deletions src/test/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,15 @@
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
Expand Down Expand Up @@ -127,23 +129,29 @@ void testUpdateSiteMap() throws IOException, ParseException, SAXException {
void testHugeSiteMap() throws IOException, ParseException, SAXException {
// given
final int nbDataverse = 50;
final int nbDataset = 50000;
final int nbDataset = SiteMapUtil.SITEMAP_LIMIT;
final Timestamp now = new Timestamp(new Date().getTime());
// Regex to validate the dataset URL
final String sitemapUrlRegex = ".*/dataset\\.xhtml\\?persistentId=doi:10\\.666/FAKE/published[0-9]{1,5}</loc>$";
// Regex to validate the sitemap URL: it must include "/sitemap/" to be accessible, because there is no pretty-faces rewrite for the numbered sitemap files
final String sitemapIndexUrlRegex = ".*/sitemap/sitemap[1-2]\\.xml</loc>$";
final String today = LocalDateTime.now().format(DateTimeFormatter.ofPattern(SiteMapUtil.DATE_PATTERN));

final List<Dataverse> dataverses = new ArrayList<>(nbDataverse);
for (int i = 1; i <= nbDataverse; i++) {
final Dataverse publishedDataverse = new Dataverse();
publishedDataverse.setAlias(String.format("publishedDv%s", i));
publishedDataverse.setModificationTime(new Timestamp(new Date().getTime()));
publishedDataverse.setPublicationDate(new Timestamp(new Date().getTime()));
publishedDataverse.setModificationTime(now);
publishedDataverse.setPublicationDate(now);
dataverses.add(publishedDataverse);
}

final List<Dataset> datasets = new ArrayList<>(nbDataset);
for (int i = 1; i <= nbDataset; i++) {
final Dataset published = new Dataset();
published.setGlobalId(new GlobalId(AbstractDOIProvider.DOI_PROTOCOL, "10.666", String.format("FAKE/published%s", i), null, AbstractDOIProvider.DOI_RESOLVER_URL, null));
published.setPublicationDate(new Timestamp(new Date().getTime()));
published.setModificationTime(new Timestamp(new Date().getTime()));
published.setPublicationDate(now);
published.setModificationTime(now);
datasets.add(published);
}

Expand All @@ -153,24 +161,70 @@ void testHugeSiteMap() throws IOException, ParseException, SAXException {
// then
final Path siteMapDir = tempDocroot.resolve("sitemap");
final String pathToSiteMapIndexFile = siteMapDir.resolve("sitemap_index.xml").toString();
final String pathToSiteMap1File = siteMapDir.resolve("sitemap1.xml").toString();
final String pathToSiteMap2File = siteMapDir.resolve("sitemap2.xml").toString();

// validate sitemap_index.xml file with XSD
assertDoesNotThrow(() -> XmlValidator.validateXmlWellFormed(pathToSiteMapIndexFile));
assertTrue(XmlValidator.validateXmlSchema(pathToSiteMapIndexFile, new URL(xsdSitemapIndex)));

final File sitemapFile = new File(pathToSiteMapIndexFile);
// verify sitemap_index.xml content
File sitemapFile = new File(pathToSiteMapIndexFile);
String sitemapString = XmlPrinter.prettyPrintXml(new String(Files.readAllBytes(Paths.get(sitemapFile.getAbsolutePath())), StandardCharsets.UTF_8));
// System.out.println("sitemap: " + sitemapString);

assertTrue(sitemapString.contains("sitemap1.xml"));
assertTrue(sitemapString.contains("sitemap2.xml"));
assertTrue(sitemapString.contains("</sitemapindex>"));
String[] lines = sitemapString.split("\n");
for (int i = 0; i < lines.length; i++) {
String line = lines[i].strip();
if (StringUtils.isNotBlank(line)) {
if (i == 0) {
assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?>", line);
} else if (i == 1) {
assertEquals("<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">", line);
} else if (i == 2) {
assertEquals("<sitemap>", line);
} else if (line.startsWith("<loc>")) {
final String errorWithSitemapIndexUrl = String.format("Sitemap URL must match with \"%s\" but was \"%s\"", sitemapIndexUrlRegex, line);
assertTrue(line.matches(sitemapIndexUrlRegex), errorWithSitemapIndexUrl);
} else if (line.startsWith("<lastmod>")) {
assertEquals(String.format("<lastmod>%s</lastmod>", today), line);
}
}
}

final String pathToSiteMap1File = siteMapDir.resolve("sitemap1.xml").toString();
// validate sitemap1.xml file with XSD
assertDoesNotThrow(() -> XmlValidator.validateXmlWellFormed(pathToSiteMap1File));
assertTrue(XmlValidator.validateXmlSchema(pathToSiteMap1File, new URL(xsdSitemap)));

final String pathToSiteMap2File = siteMapDir.resolve("sitemap2.xml").toString();
// validate sitemap2.xml file with XSD
assertDoesNotThrow(() -> XmlValidator.validateXmlWellFormed(pathToSiteMap2File));
assertTrue(XmlValidator.validateXmlSchema(pathToSiteMap2File, new URL(xsdSitemap)));

// verify sitemap2.xml content
sitemapFile = new File(pathToSiteMap2File);
sitemapString = XmlPrinter.prettyPrintXml(new String(Files.readAllBytes(Paths.get(sitemapFile.getAbsolutePath())), StandardCharsets.UTF_8));

lines = sitemapString.split("\n");
assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?>", lines[0].strip());
assertEquals("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">", lines[1].strip());
boolean isContainsLocTag = false;
boolean isContainsLastmodTag = false;
// loop over 10 lines only, just need to validate the <loc> and <lastmod> tags
for (int i = 5; i < 15; i++) {
String line = lines[i].strip();
if (StringUtils.isNotBlank(line)) {
if (line.startsWith("<loc>")) {
isContainsLocTag = true;
final String errorWithSitemapIndexUrl = String.format("Sitemap URL must match with \"%s\" but was \"%s\"", sitemapUrlRegex, line);
assertTrue(line.matches(sitemapUrlRegex), errorWithSitemapIndexUrl);
} else if (line.startsWith("<lastmod>")) {
isContainsLastmodTag = true;
assertEquals(String.format("<lastmod>%s</lastmod>", today), line);
}
}
}
assertTrue(isContainsLocTag, "Sitemap file must contains <loc> tag");
assertTrue(isContainsLastmodTag, "Sitemap file must contains <lastmod> tag");
}

}

0 comments on commit 8b6e201

Please sign in to comment.