diff --git a/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java b/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java index cd25d4799..b6ad8ffd1 100644 --- a/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java +++ b/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java @@ -248,7 +248,6 @@ private void initialize() severities.put(MessageId.OPF_058, Severity.SUPPRESSED); severities.put(MessageId.OPF_059, Severity.SUPPRESSED); severities.put(MessageId.OPF_060, Severity.ERROR); - severities.put(MessageId.OPF_061, Severity.WARNING); severities.put(MessageId.OPF_062, Severity.USAGE); severities.put(MessageId.OPF_063, Severity.WARNING); severities.put(MessageId.OPF_064, Severity.INFO); diff --git a/src/main/java/com/adobe/epubcheck/messages/MessageId.java b/src/main/java/com/adobe/epubcheck/messages/MessageId.java index dd8486823..7724bb5d4 100644 --- a/src/main/java/com/adobe/epubcheck/messages/MessageId.java +++ b/src/main/java/com/adobe/epubcheck/messages/MessageId.java @@ -242,7 +242,6 @@ public enum MessageId implements Comparable OPF_058("OPF-058"), OPF_059("OPF-059"), OPF_060("OPF-060"), - OPF_061("OPF-061"), OPF_062("OPF-062"), OPF_063("OPF-063"), OPF_064("OPF-064"), diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java b/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java index 181a9a949..5839c3fdc 100755 --- a/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java +++ b/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java @@ -25,11 +25,9 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.text.Normalizer; import java.util.HashSet; import java.util.LinkedList; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Set; @@ -37,6 +35,7 @@ import org.w3c.epubcheck.core.AbstractChecker; import org.w3c.epubcheck.core.Checker; import org.w3c.epubcheck.core.CheckerFactory; +import org.w3c.epubcheck.util.text.UnicodeUtils; import 
com.adobe.epubcheck.api.EPUBLocation; import com.adobe.epubcheck.api.EPUBProfile; @@ -296,18 +295,6 @@ private boolean checkContainerStructure(OCFCheckerState state) // FIXME 2022 report symbolic links and continue - // Check duplicate entries - if (normalizedPaths.contains(resource.getPath().toLowerCase(Locale.ROOT))) - { - context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath()); - } - // Check duplicate entries after NFC normalization - else if (normalizedPaths.contains( - Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC))) - { - context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath()); - } - // Store the resource in the data structure if (resource.isDirectory()) { @@ -318,9 +305,19 @@ else if (normalizedPaths.contains( else { // The container resource is a file, - // sStore its path for later checking of empty directories + // store its path for later checking of empty directories filePaths.add(resource.getPath()); - normalizedPaths.add(resource.getPath().toLowerCase(Locale.ROOT)); + + // Check duplicate entries + String normalizedPath = UnicodeUtils.canonicalCaseFold(resource.getPath()); + if (normalizedPaths.contains(normalizedPath)) + { + context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath()); + } + else + { + normalizedPaths.add(normalizedPath); + } // Check file name requirements new OCFFilenameChecker(resource.getPath(), state.context().build()).check(); diff --git a/src/main/java/org/w3c/epubcheck/util/text/UnicodeUtils.java b/src/main/java/org/w3c/epubcheck/util/text/UnicodeUtils.java new file mode 100644 index 000000000..4f280b33d --- /dev/null +++ b/src/main/java/org/w3c/epubcheck/util/text/UnicodeUtils.java @@ -0,0 +1,39 @@ +package org.w3c.epubcheck.util.text; + +import com.google.common.base.Preconditions; +import com.ibm.icu.text.CaseMap; +import com.ibm.icu.text.Normalizer2; + +public final class UnicodeUtils +{ + + 
private static final Normalizer2 NFD_NORMALIZER = Normalizer2.getNFDInstance(); + private static final CaseMap.Fold CASE_FOLDER = CaseMap.fold(); + + private UnicodeUtils() + { + // static utility class + } + + /** + * Applies Unicode Canonical Case Fold Normalization as defined in + * https://www.w3.org/TR/charmod-norm/#CanonicalFoldNormalizationStep + * + * This applies, in sequence: - canonical decomposition (NFD) - case folding + * + * Note that the result is **not** recomposed (NFC), i.e. the optional + * post-folding NFC normalization is not applied. + * + * In other words, the result is suitable for case-insensitive string + * comparison, but not for display. + * + * @param string + * the string to normalize + * @return the string normalized by applying NFD then case folding + */ + public static String canonicalCaseFold(String string) + { + Preconditions.checkArgument(string != null); + return CASE_FOLDER.apply(NFD_NORMALIZER.normalize(string)); + } +} diff --git a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties index bed87d83a..0a78f94c6 100644 --- a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties +++ b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties @@ -238,8 +238,7 @@ OPF_058=Spine item "%1$s" is not referenced from the TOC in the Nav Doc. OPF_058_SUG=Every spine item in the manifest should be referenced by at least one TOC entry in the Nav Doc. OPF_059=Spine item "%1$s" is not referenced from the TOC in the NCX. OPF_059_SUG=Every spine item in the manifest should be referenced by at least one TOC entry in the NCX file. -OPF_060=Duplicate entry in the ZIP file: "%1$s". -OPF_061=Duplicate entry in the ZIP file (after Unicode NFC normalization) "%1$s". +OPF_060=Duplicate entry in the ZIP file: "%1$s" (file names must be unique after Unicode canonical normalization and full case folding).
OPF_062=Found Adobe page-map attribute on spine element in opf file. OPF_063=Referenced Adobe page-map item "%1$s" was not found in the manifest. OPF_064=OPF declares type "%1$s", validating using profile "%2$s". diff --git a/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-unicode-normalization-warning.epub b/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-canonical-normalization-error.epub similarity index 100% rename from src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-unicode-normalization-warning.epub rename to src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-canonical-normalization-error.epub diff --git a/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-case-normalization-error.epub b/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-common-case-folding-error.epub similarity index 100% rename from src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-case-normalization-error.epub rename to src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-common-case-folding-error.epub diff --git a/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-compatibility-normalization-valid.epub b/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-compatibility-normalization-valid.epub new file mode 100644 index 000000000..22a38b2dc Binary files /dev/null and b/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-compatibility-normalization-valid.epub differ diff --git a/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-full-case-folding-error.epub b/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-full-case-folding-error.epub new file mode 100644 index 000000000..eb531cc02 Binary files /dev/null and b/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-full-case-folding-error.epub differ diff --git a/src/test/resources/epub3/04-ocf/ocf.feature 
b/src/test/resources/epub3/04-ocf/ocf.feature index 017e9c55a..332dcde9f 100644 --- a/src/test/resources/epub3/04-ocf/ocf.feature +++ b/src/test/resources/epub3/04-ocf/ocf.feature @@ -29,17 +29,28 @@ Feature: EPUB 3 — Open Container Format Then no errors or warnings are reported @spec @xref:sec-container-filenames - Scenario: Report a duplicate filename if two files only differ by case - When checking EPUB 'ocf-filename-duplicate-after-case-normalization-error.epub' + Scenario: Report a duplicate filename after common case folding + When checking EPUB 'ocf-filename-duplicate-after-common-case-folding-error.epub' Then error OPF-060 is reported And no other errors or warnings are reported @spec @xref:sec-container-filenames - Scenario: Report a duplicate filename if two files have the same name after Unicode normalization - When checking EPUB 'ocf-filename-duplicate-after-unicode-normalization-warning.epub' - Then warning OPF-061 is reported + Scenario: Report a duplicate filename after full case folding + When checking EPUB 'ocf-filename-duplicate-after-full-case-folding-error.epub' + Then error OPF-060 is reported + And no other errors or warnings are reported + + @spec @xref:sec-container-filenames + Scenario: Report a duplicate filename after Unicode canonical normalization (NFC) + When checking EPUB 'ocf-filename-duplicate-after-canonical-normalization-error.epub' + Then error OPF-060 is reported And no other errors or warnings are reported + @spec @xref:sec-container-filenames + Scenario: Allow a duplicate filename after Unicode compatibility normalization (NFKC) + When checking EPUB 'ocf-filename-duplicate-after-compatibility-normalization-valid.epub' + Then no other errors or warnings are reported + @spec @xref:sec-container-filenames Scenario: Allow Unicode emoji tag set in file name When checking EPUB 'ocf-filename-character-emoji-tag-sequence-valid'