Skip to content

Commit

Permalink
Count surrogate pair as single character (#779)
Browse files Browse the repository at this point in the history
* Count surrogate pair as single character

String expression operators now count UTF-16 surrogate pairs as single characters instead of splitting them up into individual surrogates.

* Removed extraneous empty string case
  • Loading branch information
1ec5 authored Aug 17, 2024
1 parent 6fb546e commit a59e2b3
Show file tree
Hide file tree
Showing 6 changed files with 270 additions and 16 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- _...Add new stuff here..._

### 🐞 Bug fixes
- The `index-of`, `length`, and `slice` expression operators count a UTF-16 surrogate pair as a single character. ([#779](https://github.com/maplibre/maplibre-style-spec/pull/779))
- _...Add new stuff here..._

## 20.3.0
Expand Down
20 changes: 14 additions & 6 deletions src/expression/definitions/index_of.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,24 @@ class IndexOf implements Expression {
throw new RuntimeError(`Expected first argument to be of type boolean, string, number or null, but found ${toString(typeOf(needle))} instead.`);
}

if (!isValidNativeType(haystack, ['string', 'array'])) {
throw new RuntimeError(`Expected second argument to be of type array or string, but found ${toString(typeOf(haystack))} instead.`);
let fromIndex;
if (this.fromIndex) {
fromIndex = (this.fromIndex.evaluate(ctx) as number);
}

if (this.fromIndex) {
const fromIndex = (this.fromIndex.evaluate(ctx) as number);
if (isValidNativeType(haystack, ['string'])) {
const rawIndex = haystack.indexOf(needle, fromIndex);
if (rawIndex === -1) {
return -1;
} else {
// The index may be affected by surrogate pairs, so get the length of the preceding substring.
return [...haystack.slice(0, rawIndex)].length;
}
} else if (isValidNativeType(haystack, ['array'])) {
return haystack.indexOf(needle, fromIndex);
} else {
throw new RuntimeError(`Expected second argument to be of type array or string, but found ${toString(typeOf(haystack))} instead.`);
}

return haystack.indexOf(needle);
}

eachChild(fn: (_: Expression) => void) {
Expand Down
3 changes: 2 additions & 1 deletion src/expression/definitions/length.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ class Length implements Expression {
evaluate(ctx: EvaluationContext) {
const input = this.input.evaluate(ctx);
if (typeof input === 'string') {
return input.length;
// The length may be affected by surrogate pairs.
return [...input].length;
} else if (Array.isArray(input)) {
return input.length;
} else {
Expand Down
15 changes: 9 additions & 6 deletions src/expression/definitions/slice.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,19 @@ class Slice implements Expression {
const input = (this.input.evaluate(ctx) as any);
const beginIndex = (this.beginIndex.evaluate(ctx) as number);

if (!isValidNativeType(input, ['string', 'array'])) {
throw new RuntimeError(`Expected first argument to be of type array or string, but found ${toString(typeOf(input))} instead.`);
let endIndex;
if (this.endIndex) {
endIndex = (this.endIndex.evaluate(ctx) as number);
}

if (this.endIndex) {
const endIndex = (this.endIndex.evaluate(ctx) as number);
if (isValidNativeType(input, ['string'])) {
// Indices may be affected by surrogate pairs.
return [...input].slice(beginIndex, endIndex).join('');
} else if (isValidNativeType(input, ['array'])) {
return input.slice(beginIndex, endIndex);
} else {
throw new RuntimeError(`Expected first argument to be of type array or string, but found ${toString(typeOf(input))} instead.`);
}

return input.slice(beginIndex);
}

eachChild(fn: (_: Expression) => void) {
Expand Down
241 changes: 241 additions & 0 deletions src/expression/expression.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -388,3 +388,244 @@ describe('Distance expression', () => {
});
});
});

/**
 * Tests for the `index-of` expression operator.
 *
 * The suite covers three areas:
 *  - compile-time validation (argument count and argument types),
 *  - searching inside strings, where positions are counted in characters
 *    so that a UTF-16 surrogate pair counts as a single position,
 *  - searching inside arrays.
 *
 * Every successful expression is evaluated at zoom 20; none of the
 * expressions here reads the zoom, so the value is arbitrary.
 *
 * NOTE(review): no test combines a start index with a haystack containing a
 * surrogate pair -- consider adding one to pin down how the start index is
 * counted in that case.
 */
describe('index-of expression', () => {
    // --- Compile-time validation -------------------------------------------
    test('requires a needle', () => {
        const response = createExpression(['index-of']);
        expect(response.result).toBe('error');
    });
    test('requires a haystack', () => {
        const response = createExpression(['index-of', 'a']);
        expect(response.result).toBe('error');
    });
    test('rejects a fourth argument', () => {
        const response = createExpression(['index-of', 'a', 'abc', 1, 8]);
        expect(response.result).toBe('error');
    });
    test('requires a primitive as the needle', () => {
        // The haystack is wrapped in `literal` like every other array in this
        // suite, so that the array-typed needle is the only invalid part of
        // the expression. A bare ['a', 'b', 'c'] would itself be rejected,
        // letting this test pass without ever checking the needle.
        const response = createExpression(['index-of', ['literal', ['a']], ['literal', ['a', 'b', 'c']]]);
        expect(response.result).toBe('error');
    });
    test('requires a string or array as the haystack', () => {
        const response = createExpression(['index-of', 't', true]);
        expect(response.result).toBe('error');
    });

    // --- String haystacks ---------------------------------------------------
    test('finds an empty substring in an empty string', () => {
        const response = createExpression(['index-of', '', '']);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(0);
    });
    test('finds an empty substring in a non-empty string', () => {
        const response = createExpression(['index-of', '', 'abc']);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(0);
    });
    test('cannot find a non-empty substring in an empty string', () => {
        const response = createExpression(['index-of', 'abc', '']);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(-1);
    });
    test('finds a non-empty substring in a non-empty string', () => {
        const response = createExpression(['index-of', 'b', 'abc']);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(1);
    });
    test('only finds the first occurrence in a string', () => {
        const response = createExpression(['index-of', 'b', 'abbc']);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(1);
    });
    test('starts looking for the substring at a positive start index', () => {
        // 'a' only occurs at position 0, which lies before the start index.
        const response = createExpression(['index-of', 'a', 'abc', 1]);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(-1);
    });
    test('starts looking for the substring at a negative start index', () => {
        const response = createExpression(['index-of', 'c', 'abc', -1]);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(2);
    });
    test('counts a non-ASCII character as a single character', () => {
        const response = createExpression(['index-of', '镇', '市镇']);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(1);
    });
    test('counts a surrogate pair as a single character', () => {
        // '𦨭' is stored as a surrogate pair (two UTF-16 code units), yet the
        // match that follows it must be reported at position 2, not 3.
        const response = createExpression(['index-of', '市镇', '丐𦨭市镇']);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(2);
    });

    // --- Array haystacks ----------------------------------------------------
    test('cannot find an element in an empty array', () => {
        const response = createExpression(['index-of', 1, ['literal', []]]);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(-1);
    });
    test('finds an element in a non-empty array', () => {
        const response = createExpression(['index-of', 2, ['literal', [1, 2, 3]]]);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(1);
    });
    test('only finds the first occurrence in an array', () => {
        const response = createExpression(['index-of', 2, ['literal', [1, 2, 2, 3]]]);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(1);
    });
    test('starts looking for the element at a positive start index', () => {
        // 1 only occurs at position 0, which lies before the start index.
        const response = createExpression(['index-of', 1, ['literal', [1, 2, 3]], 1]);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(-1);
    });
    test('starts looking for the element at a negative start index', () => {
        const response = createExpression(['index-of', 3, ['literal', [1, 2, 3]], -1]);
        expect(response.result).toBe('success');
        expect((response.value as StyleExpression)?.evaluate({zoom: 20})).toBe(2);
    });
});

/**
 * Tests for the `length` expression operator: argument validation, string
 * lengths (counted in characters, so a surrogate pair counts once) and
 * array lengths.
 */
describe('length expression', () => {
    // Compiles `expression` and expects the compilation to be rejected.
    function expectCompileError(expression: unknown[]) {
        const compiled = createExpression(expression);
        expect(compiled.result).toBe('error');
    }

    // Compiles `expression`, expects success, then evaluates it at zoom 20
    // and compares the outcome with `expected`.
    function expectLength(expression: unknown[], expected: number) {
        const compiled = createExpression(expression);
        expect(compiled.result).toBe('success');
        const evaluated = (compiled.value as StyleExpression)?.evaluate({zoom: 20});
        expect(evaluated).toBe(expected);
    }

    test('requires an argument', () => {
        expectCompileError(['length']);
    });
    test('requires a string or array as the argument', () => {
        expectCompileError(['length', true]);
    });
    test('rejects a second argument', () => {
        expectCompileError(['length', 'abc', 'def']);
    });

    test('measures an empty string', () => {
        expectLength(['length', ''], 0);
    });
    test('measures a non-empty string', () => {
        expectLength(['length', 'abc'], 3);
    });
    test('counts a non-ASCII character as a single character', () => {
        expectLength(['length', '市镇'], 2);
    });
    test('counts a surrogate pair as a single character', () => {
        expectLength(['length', '丐𦨭市镇'], 4);
    });

    test('measures an empty array', () => {
        expectLength(['length', ['literal', []]], 0);
    });
    test('measures a non-empty array', () => {
        expectLength(['length', ['literal', [1, 2, 3]]], 3);
    });
});

/**
 * Tests for the `slice` expression operator: argument validation, slicing
 * strings (indices are counted in characters, so a surrogate pair is never
 * split) and slicing arrays.
 */
describe('slice expression', () => {
    // Compiles `expression` and expects the compilation to be rejected.
    function expectCompileError(expression: unknown[]) {
        const compiled = createExpression(expression);
        expect(compiled.result).toBe('error');
    }

    // Compiles `expression`, expects success and returns its value at zoom 20.
    function evaluateSlice(expression: unknown[]) {
        const compiled = createExpression(expression);
        expect(compiled.result).toBe('success');
        return (compiled.value as StyleExpression)?.evaluate({zoom: 20});
    }

    // String results are compared by identity (`toBe`).
    function expectSubstring(expression: unknown[], expected: string) {
        expect(evaluateSlice(expression)).toBe(expected);
    }

    // Array results are compared structurally (`toEqual`).
    function expectSubarray(expression: unknown[], expected: number[]) {
        expect(evaluateSlice(expression)).toEqual(expected);
    }

    test('requires an input argument', () => {
        expectCompileError(['slice']);
    });
    test('requires a start index argument', () => {
        expectCompileError(['slice', 'abc']);
    });
    test('rejects a fourth argument', () => {
        expectCompileError(['slice', 'abc', 0, 1, 8]);
    });
    test('requires a string or array as the input argument', () => {
        expectCompileError(['slice', true, 0]);
    });
    test('requires a number as the start index argument', () => {
        expectCompileError(['slice', 'abc', true]);
    });

    test('slices an empty string', () => {
        expectSubstring(['slice', '', 0], '');
    });
    test('slices a string starting at the beginning', () => {
        expectSubstring(['slice', 'abc', 0], 'abc');
    });
    test('slices a string starting at the middle', () => {
        expectSubstring(['slice', 'abc', 1], 'bc');
    });
    test('slices a string starting at the end', () => {
        expectSubstring(['slice', 'abc', 3], '');
    });
    test('slices a string backwards from the end', () => {
        expectSubstring(['slice', 'abc', -2], 'bc');
    });
    test('slices a string by a zero-length range', () => {
        expectSubstring(['slice', 'abc', 1, 1], '');
    });
    test('slices a string by a negative-length range', () => {
        expectSubstring(['slice', 'abc', 2, 1], '');
    });
    test('avoids splitting a non-ASCII character', () => {
        expectSubstring(['slice', '市镇', 1], '镇');
    });
    test('avoids splitting a surrogate pair', () => {
        expectSubstring(['slice', '丐𦨭市镇', 2], '市镇');
    });

    test('slices an empty array', () => {
        expectSubarray(['slice', ['literal', []], 0], []);
    });
    test('slices an array starting at the beginning', () => {
        expectSubarray(['slice', ['literal', [1, 2, 3]], 0], [1, 2, 3]);
    });
    test('slices an array starting at the middle', () => {
        expectSubarray(['slice', ['literal', [1, 2, 3]], 1], [2, 3]);
    });
    test('slices an array starting at the end', () => {
        expectSubarray(['slice', ['literal', [1, 2, 3]], 3], []);
    });
    test('slices an array backwards from the end', () => {
        expectSubarray(['slice', ['literal', [1, 2, 3]], -2], [2, 3]);
    });
    test('slices an array by a zero-length range', () => {
        expectSubarray(['slice', ['literal', [1, 2, 3]], 1, 1], []);
    });
    test('slices an array by a negative-length range', () => {
        expectSubarray(['slice', ['literal', [1, 2, 3]], 2, 1], []);
    });
});
6 changes: 3 additions & 3 deletions src/reference/v8.json
Original file line number Diff line number Diff line change
Expand Up @@ -2826,7 +2826,7 @@
}
},
"index-of": {
"doc": "Returns the first position at which an item can be found in an array or a substring can be found in a string, or `-1` if the input cannot be found. Accepts an optional index from where to begin the search.",
"doc": "Returns the first position at which an item can be found in an array or a substring can be found in a string, or `-1` if the input cannot be found. Accepts an optional index from where to begin the search. In a string, a UTF-16 surrogate pair counts as a single position.",
"example": {
"syntax": {
"method": ["value", "value", "number?"],
Expand All @@ -2844,7 +2844,7 @@
}
},
"slice": {
"doc": "Returns an item from an array or a substring from a string from a specified start index, or between a start index and an end index if set. The return value is inclusive of the start index but not of the end index.",
"doc": "Returns a subarray from an array or a substring from a string from a specified start index, or between a start index and an end index if set. The return value is inclusive of the start index but not of the end index. In a string, a UTF-16 surrogate pair counts as a single position.",
"example": {
"syntax": {
"method": ["value", "number", "number?"],
Expand Down Expand Up @@ -3380,7 +3380,7 @@
}
},
"length": {
"doc": "Gets the length of an array or string.",
"doc": "Gets the length of an array or string. In a string, a UTF-16 surrogate pair counts as a single character.",
"example": {
"syntax": {
"method": ["array"],
Expand Down

0 comments on commit a59e2b3

Please sign in to comment.