Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UnicodeData: Also support Unicode blocks #1692

Merged
merged 1 commit into from
Mar 1, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion tool-codegen/src/main/string-template/unicodedata.st
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,16 @@ static private void addProperty<k>() {
addPropertyAliases();
}

/**
 * Canonicalizes a property code or alias before it is used as a lookup key:
 * the text is lower-cased with {@link Locale#US} rules, and afterwards every
 * {@code '-'} is turned into {@code '_'}.
 */
private static String normalize(String propertyCodeOrAlias) {
    String lowerCased = propertyCodeOrAlias.toLowerCase(Locale.US);
    return lowerCased.replace('-', '_');
}

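/**
* Given a Unicode property (general category code, binary property name, script name,
* or block name prefixed with {@code In}), returns the {@link IntervalSet} of Unicode
* code point ranges which have that property. The lookup ignores case and treats
* {@code '-'} the same as {@code '_'}.
*/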
public static IntervalSet getPropertyCodePoints(String propertyCodeOrAlias) {
String normalizedPropertyCodeOrAlias = propertyCodeOrAlias.toLowerCase(Locale.US);
String normalizedPropertyCodeOrAlias = normalize(propertyCodeOrAlias);
IntervalSet result = propertyCodePointRanges.get(normalizedPropertyCodeOrAlias);
if (result == null) {
String propertyCode = propertyAliases.get(normalizedPropertyCodeOrAlias);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,13 @@ public static Map<String, Object> getProperties() {
addUnicodeCategoryCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeBinaryPropertyCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeScriptCodesToCodePointRanges(propertyCodePointRanges);
addUnicodeBlocksToCodePointRanges(propertyCodePointRanges);

Map<String, String> propertyAliases = new LinkedHashMap<>();
addUnicodeCategoryCodesToNames(propertyAliases);
addUnicodeBinaryPropertyCodesToNames(propertyAliases);
addUnicodeScriptCodesToNames(propertyAliases);
addUnicodeBlocksToNames(propertyAliases);

Map<String, Object> properties = new LinkedHashMap<>();
properties.put("propertyCodePointRanges", propertyCodePointRanges);
Expand Down Expand Up @@ -171,34 +173,42 @@ private static void addUnicodeBinaryPropertyCodesToNames(Map<String, String> pro
}
}

private static void addUnicodeScriptCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
script++) {
private static void addIntPropertyRanges(int property, String namePrefix, Map<String, IntervalSet> propertyCodePointRanges) {
for (int propertyValue = UCharacter.getIntPropertyMinValue(property);
propertyValue <= UCharacter.getIntPropertyMaxValue(property);
propertyValue++) {
UnicodeSet set = new UnicodeSet();
set.applyIntPropertyValue(UProperty.SCRIPT, script);
String scriptName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
IntervalSet intervalSet = propertyCodePointRanges.get(scriptName);
set.applyIntPropertyValue(property, propertyValue);
String propertyName = namePrefix + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
IntervalSet intervalSet = propertyCodePointRanges.get(propertyName);
if (intervalSet == null) {
intervalSet = new IntervalSet();
propertyCodePointRanges.put(scriptName, intervalSet);
propertyCodePointRanges.put(propertyName, intervalSet);
}
for (UnicodeSet.EntryRange range : set.ranges()) {
intervalSet.add(range.codepoint, range.codepointEnd);
}
}
}

private static void addUnicodeScriptCodesToNames(Map<String, String> propertyAliases) {
for (int script = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT);
script <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT);
script++) {
String propertyName = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
/**
 * Adds the code point ranges of every {@code UProperty.SCRIPT} value to the given map
 * by delegating to {@code addIntPropertyRanges}; script names get no prefix.
 *
 * @param propertyCodePointRanges map from property name to code point ranges, updated in place
 */
private static void addUnicodeScriptCodesToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
    final String noPrefix = "";
    addIntPropertyRanges(UProperty.SCRIPT, noPrefix, propertyCodePointRanges);
}

/**
 * Adds the code point ranges of every {@code UProperty.BLOCK} value to the given map
 * by delegating to {@code addIntPropertyRanges}; each block name is prefixed with
 * {@code "In"} (for example {@code InASCII}).
 *
 * @param propertyCodePointRanges map from property name to code point ranges, updated in place
 */
private static void addUnicodeBlocksToCodePointRanges(Map<String, IntervalSet> propertyCodePointRanges) {
    final String blockPrefix = "In";
    addIntPropertyRanges(UProperty.BLOCK, blockPrefix, propertyCodePointRanges);
}

private static void addIntPropertyAliases(int property, String namePrefix, Map<String, String> propertyAliases) {
for (int propertyValue = UCharacter.getIntPropertyMinValue(property);
propertyValue <= UCharacter.getIntPropertyMaxValue(property);
propertyValue++) {
String propertyName = namePrefix + UCharacter.getPropertyValueName(property, propertyValue, UProperty.NameChoice.SHORT);
int nameChoice = UProperty.NameChoice.LONG;
String alias;
while (true) {
try {
alias = UCharacter.getPropertyValueName(UProperty.SCRIPT, script, nameChoice);
alias = namePrefix + UCharacter.getPropertyValueName(property, propertyValue, nameChoice);
} catch (IllegalArgumentException e) {
// No more aliases.
break;
Expand All @@ -209,4 +219,12 @@ private static void addUnicodeScriptCodesToNames(Map<String, String> propertyAli
}
}
}

/**
 * Fills the alias map for {@code UProperty.SCRIPT} values by delegating to
 * {@code addIntPropertyAliases}; script names and aliases get no prefix.
 *
 * @param propertyAliases alias map, updated in place
 */
private static void addUnicodeScriptCodesToNames(Map<String, String> propertyAliases) {
    final String noPrefix = "";
    addIntPropertyAliases(UProperty.SCRIPT, noPrefix, propertyAliases);
}

/**
 * Fills the alias map for {@code UProperty.BLOCK} values by delegating to
 * {@code addIntPropertyAliases}; each block name and alias is prefixed with {@code "In"}.
 *
 * @param propertyAliases alias map, updated in place
 */
private static void addUnicodeBlocksToNames(Map<String, String> propertyAliases) {
    final String blockPrefix = "In";
    addIntPropertyAliases(UProperty.BLOCK, blockPrefix, propertyAliases);
}
}
19 changes: 19 additions & 0 deletions tool-testsuite/test/org/antlr/v4/test/tool/TestUnicodeData.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,20 @@ public void testUnicodeScriptAliases() {
assertTrue(UnicodeData.getPropertyCodePoints("Cyrillic").contains(0x0404));
}

/**
 * Checks that Unicode blocks can be looked up by an "In"-prefixed block name and that
 * each returned set contains a sample code point expected to lie in that block.
 */
@Test
public void testUnicodeBlocks() {
// The digit '0' (U+0030) must be reported as part of the ASCII block.
assertTrue(UnicodeData.getPropertyCodePoints("InASCII").contains('0'));
assertTrue(UnicodeData.getPropertyCodePoints("InCJK").contains(0x4E04));
assertTrue(UnicodeData.getPropertyCodePoints("InCyrillic").contains(0x0404));
// U+1F4A9 is a supplementary (non-BMP) code point, so it is passed as an int.
assertTrue(UnicodeData.getPropertyCodePoints("InMisc_Pictographs").contains(0x1F4A9));
}

/**
 * Checks that alternative (presumably long-form) block names, also prefixed with "In",
 * resolve to a set containing the expected sample code point.
 */
@Test
public void testUnicodeBlockAliases() {
// Same sample code point as the "InASCII" lookup in testUnicodeBlocks, under another name.
assertTrue(UnicodeData.getPropertyCodePoints("InBasic_Latin").contains('0'));
assertTrue(UnicodeData.getPropertyCodePoints("InMiscellaneous_Mathematical_Symbols_B").contains(0x29BE));
}

@Test
public void testPropertyCaseInsensitivity() {
assertTrue(UnicodeData.getPropertyCodePoints("l").contains('x'));
Expand All @@ -116,6 +130,11 @@ public void testPropertyCaseInsensitivity() {
assertTrue(UnicodeData.getPropertyCodePoints("Alnum").contains('0'));
}

/**
 * Checks that a property name written with '-' is still resolved: the lookup for
 * "InLatin-1" must return a set containing U+00F0.
 */
@Test
public void testPropertyDashSameAsUnderscore() {
assertTrue(UnicodeData.getPropertyCodePoints("InLatin-1").contains('\u00F0'));
}

@Test
public void modifyingUnicodeDataShouldThrow() {
thrown.expect(IllegalStateException.class);
Expand Down