From 673ea5a0cc44b425610612dfcbaa83cdf8fc25ac Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 29 Jul 2024 01:03:54 +0800 Subject: [PATCH] use v_jsonescape --- Cargo.lock | 16 ++ Cargo.toml | 1 + crates/oxc_sourcemap/Cargo.toml | 11 +- crates/oxc_sourcemap/src/encode.rs | 366 +---------------------------- 4 files changed, 29 insertions(+), 365 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 95bff8255815a..9f9aa742e543a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -183,6 +183,12 @@ dependencies = [ "serde", ] +[[package]] +name = "buf-min" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22d5698cf6842742ed64805705798f8b351fff53fa546fd45c52184bee58dc90" + [[package]] name = "bumpalo" version = "3.16.0" @@ -1745,6 +1751,7 @@ dependencies = [ "rustc-hash", "serde", "serde_json", + "v_jsonescape", ] [[package]] @@ -3010,6 +3017,15 @@ dependencies = [ "serde", ] +[[package]] +name = "v_jsonescape" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be8219cc464ba10c48c3231a6871f11d26d831c5c45a47467eea387ea7bb10e8" +dependencies = [ + "buf-min", +] + [[package]] name = "valuable" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 62634502de38c..408fc3accd00d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -184,6 +184,7 @@ unicode-id-start = "1" # Relaxed version so the user can decide which unicode unicode-width = "0.1.13" ureq = { version = "2.9.6", default-features = false } url = "2.5.2" +v_jsonescape = "0.7.3" walkdir = "2.5.0" wasm-bindgen = "0.2.92" diff --git a/crates/oxc_sourcemap/Cargo.toml b/crates/oxc_sourcemap/Cargo.toml index 0532b8ab55489..cf3469e5847fe 100644 --- a/crates/oxc_sourcemap/Cargo.toml +++ b/crates/oxc_sourcemap/Cargo.toml @@ -19,11 +19,12 @@ workspace = true doctest = false [dependencies] -rustc-hash = { workspace = true } -serde = { workspace = true, features = ["derive"] } -serde_json = { workspace = true } -base64-simd = { workspace = true } -cfg-if = { workspace = true } +rustc-hash = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +base64-simd = { workspace = true } +cfg-if = { workspace = true } +v_jsonescape = { workspace = true, features = ["bytes-buf"] } rayon = { workspace = true, optional = true } diff --git a/crates/oxc_sourcemap/src/encode.rs b/crates/oxc_sourcemap/src/encode.rs index 4f2501648b452..f4bb42e90d7de 100644 --- a/crates/oxc_sourcemap/src/encode.rs +++ b/crates/oxc_sourcemap/src/encode.rs @@ -1,12 +1,8 @@ -#![cfg_attr(target_arch = "x86_64", allow(clippy::cast_ptr_alignment))] -#![cfg_attr(target_arch = "x86_64", allow(clippy::cast_possible_wrap))] -#![cfg_attr(target_arch = "x86_64", allow(clippy::cast_sign_loss))] -#![cfg_attr(target_arch = "x86_64", allow(clippy::transmute_ptr_to_ptr))] - use std::borrow::Cow; #[cfg(feature = "concurrent")] use rayon::prelude::*; +use v_jsonescape::b_escape as simd_escape; use crate::JSONSourceMap; /// Port from https://github.com/getsentry/rust-sourcemap/blob/master/src/encoder.rs @@ -241,363 +237,13 @@ impl<'a> PreAllocatedString<'a> { } } -// Copied from https://github.com/serde-rs/json/blob/v1.0.120/src/ser.rs#L2097-L2127 - -const BB: u8 = b'b'; // \x08 -const TT: u8 = b't'; // \x09 -const NN: u8 = b'n'; // \x0A -const FF: u8 = b'f'; // \x0C -const RR: u8 = b'r'; // \x0D -const QU: u8 = b'"'; // \x22 -const BS: u8 = b'\\'; // \x5C -const UU: u8 = b'u'; // \x00...\x1F except the ones above -const __: u8 = 0; - -// Lookup table of escape sequences. A value of b'x' at index i means that byte -// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped. -static ESCAPE: [u8; 256] = [ - // 1 2 3 4 5 6 7 8 9 A B C D E F - UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0 - UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1 - __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 - __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F -]; - -const UTF8_CHAR_WIDTH: [u8; 256] = [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0, -]; - -const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef"; - -#[cfg(target_arch = "aarch64")] -#[inline] -fn escape_json_string>(input: S) -> String { - use core::arch::aarch64::{uint8x16_t, vld1q_u8, vqtbl1q_u8, vst1q_u8}; - - let input = input.as_ref(); - let bytes = input.as_bytes(); - let len = bytes.len(); - let mut result = String::with_capacity(len * 2 + 2); - let mut chunk_head = 0; - - result.push('"'); - - // Safety: simd is naturally unsafe. - unsafe { - let mut escape_result = [0u8; 16]; - while chunk_head + 16 <= len { - let chunk: uint8x16_t = vld1q_u8(bytes[chunk_head..].as_ptr()); - // Use ESCAPE table to check for characters that need escaping - let escape = vqtbl1q_u8(vld1q_u8(ESCAPE.as_ptr()), chunk); - - // Store the escape results in a temporary array - vst1q_u8(escape_result.as_mut_ptr(), escape); - - // Process each byte in the chunk - let mut head = 0; - while head < 16 { - let b = bytes[chunk_head + head]; - let e = escape_result[head]; - if e == 0 { - if b & 0x80 == 0 { - // ASCII character - result.push(b as char); - head += 1; - } else { - // Unicode character - let char_len = UTF8_CHAR_WIDTH[b as usize] as usize; - if chunk_head + head + char_len <= len { - let c = input[chunk_head + head..chunk_head + head + char_len] - .chars() - .next() - .unwrap(); - if c.is_control() { - result.push_str(&format!("\\u{:04x}", c as u32)); - } else { - result.push(c); - } - } else { - // Incomplete UTF-8 sequence, just copy the bytes - result.push_str(&input[chunk_head + head..]); - head = 16; // Exit the loop - } - head += char_len; - } - } else if e == UU { - // For control characters, use unicode escape - result.push_str(&format!("\\u{:04x}", u32::from(b))); - head += 1; - } else { - // For other escaped characters - result.push('\\'); - result.push(e as char); - head += 1; - } - } - - chunk_head += 16; - } - } - - // Process remaining bytes - escape_json_string_fallback(&input[chunk_head..], &mut result); - - result -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline] -fn escape_json_string>(input: S) -> String { - use std::arch::x86_64::{ - __m128i, __m256i, _mm256_loadu_si256, _mm256_movemask_epi8, _mm256_shuffle_epi8, - _mm_loadu_si128, _mm_movemask_epi8, _mm_shuffle_epi8, - }; - - let input = input.as_ref(); - let bytes = input.as_bytes(); - let len = bytes.len(); - - let mut result = String::with_capacity(len * 2 + 2); - result.push('"'); - - let mut i = 0; - let mut escape_buf = [b'\\', b'u', b'0', b'0', b'0', b'0']; - - while i < len { - if is_x86_feature_detected!("avx2") && i + 32 <= len { - // Safety: SIMD operations are unsafe - unsafe { - let escape_table = _mm256_loadu_si256(ESCAPE.as_ptr().cast::<__m256i>()); - let chunk = _mm256_loadu_si256(bytes[i..].as_ptr().cast::<__m256i>()); - let escape = _mm256_shuffle_epi8(escape_table, chunk); - let mask = _mm256_movemask_epi8(escape); - - if mask == 0 { - // No characters need escaping - result.push_str(std::str::from_utf8(&bytes[i..i + 32]).unwrap()); - i += 32; - } else { - // Some characters need escaping, process byte by byte - break; - } - } - } else if is_x86_feature_detected!("sse2") && i + 16 <= len { - // Safety: SIMD operations are unsafe - unsafe { - let escape_table = _mm_loadu_si128(ESCAPE.as_ptr().cast::<__m128i>()); - let chunk = _mm_loadu_si128(bytes[i..].as_ptr().cast::<__m128i>()); - let escape = _mm_shuffle_epi8(escape_table, chunk); - let mask = _mm_movemask_epi8(escape); - - if mask == 0 { - // No characters need escaping - result.push_str(std::str::from_utf8_unchecked(&bytes[i..i + 16])); - i += 16; - } else { - // Some characters need escaping, process byte by byte - break; - } - } - } else { - // Process byte by byte - break; - } - } - - // Process remaining bytes - while i < len { - let byte = bytes[i]; - let escape = ESCAPE[byte as usize]; - if escape == 0 { - let char_len = UTF8_CHAR_WIDTH[byte as usize] as usize; - if i + char_len <= len { - result.push_str(&input[i..i + char_len]); - i += char_len; - } else { - // Incomplete UTF-8 sequence, just copy the byte - result.push(byte as char); - i += 1; - } - } else if escape == UU { - escape_buf[4] = HEX_DIGITS[(byte >> 4) as usize]; - escape_buf[5] = HEX_DIGITS[(byte & 0xF) as usize]; - // Safety: escape_buf is always valid utf-8 - result.push_str(unsafe { std::str::from_utf8_unchecked(&escape_buf) }); - i += 1; - } else { - result.push('\\'); - result.push(escape as char); - i += 1; - } - } - - result.push('"'); - result -} - -#[cfg(target_arch = "wasm32")] -#[inline] fn escape_json_string>(s: S) -> String { - use core::arch::wasm32::{u8x16_swizzle, v128, v128_load, v128_store}; - let s = s.as_ref(); - let bytes = s.as_bytes(); - let len = bytes.len(); - let mut result = String::with_capacity(len * 2 + 2); - let mut i = 0; - - result.push('"'); - - let mut escape_buf = [b'\\', b'u', b'0', b'0', b'0', b'0']; - - // Safety: SIMD operations are unsafe - unsafe { - let mut escape_result = [0u8; 16]; - let escape_table = v128_load(ESCAPE.as_ptr() as *const v128); - - while i + 16 <= len { - let chunk = v128_load(bytes[i..].as_ptr() as *const v128); - // Use ESCAPE table to check for characters that need escaping - let escape = u8x16_swizzle(escape_table, chunk); - - // Store the escape results in a temporary array - v128_store(escape_result.as_mut_ptr() as *mut v128, escape); - - // Process each byte in the chunk - let mut j = 0; - while j < 16 { - let b = bytes[i + j]; - let e = escape_result[j]; - if e == 0 { - if b & 0x80 == 0 { - // ASCII character - result.push(b as char); - j += 1; - } else { - // Unicode character - let char_len = UTF8_CHAR_WIDTH[b as usize] as usize; - if i + j + char_len <= len { - let c = s[i + j..i + j + char_len].chars().next().unwrap(); - if c.is_control() { - let c_u32 = c as u32; - let buf = [ - b'\\', - b'u', - HEX_DIGITS[((c_u32 >> 12) & 0xF) as usize], - HEX_DIGITS[((c_u32 >> 8) & 0xF) as usize], - HEX_DIGITS[((c_u32 >> 4) & 0xF) as usize], - HEX_DIGITS[(c_u32 & 0xF) as usize], - ]; - result.push_str(std::str::from_utf8_unchecked(&buf)); - } else { - result.push(c); - } - } else { - // Incomplete UTF-8 sequence, just copy the bytes - result.push_str(&s[i + j..]); - j = 16; // Exit the loop - } - j += char_len; - } - } else if e == UU { - // For control characters, use unicode escape - escape_buf[0] = b'\\'; - escape_buf[1] = b'u'; - escape_buf[2] = b'0'; - escape_buf[3] = b'0'; - escape_buf[4] = HEX_DIGITS[(b >> 4) as usize]; - escape_buf[5] = HEX_DIGITS[(b & 0xF) as usize]; - result.push_str(std::str::from_utf8(&escape_buf).unwrap()); - j += 1; - } else { - // For other escaped characters - result.push('\\'); - result.push(e as char); - j += 1; - } - } - - i += 16; - } - } - - // Process remaining bytes - escape_json_string_fallback(&s[i..], &mut result); - - result -} - -#[cfg(not(any( - target_arch = "aarch64", - target_arch = "x86_64", - target_arch = "x86", - target_arch = "wasm32" -)))] -#[inline] -fn escape_json_string>(s: S) -> String { - let mut result = String::with_capacity(s.as_ref().len() * 2 + 2); - result.push('"'); - escape_json_string_fallback(s.as_ref(), &mut result); - result -} - -#[allow(unused)] -#[inline] -fn escape_json_string_fallback(s: &str, result: &mut String) { - let mut escape_buf = [b'\\', b'u', b'0', b'0', b'0', b'0']; - for c in s.chars() { - if c.is_ascii() { - let b = c as u8; - let e = ESCAPE[b as usize]; - if e == 0 { - result.push(c); - } else if e == UU { - // For control characters, use unicode escape - escape_buf[4] = HEX_DIGITS[(b >> 4) as usize]; - escape_buf[5] = HEX_DIGITS[(b & 0xF) as usize]; - // Safety: escape_buf is always valid utf-8 - result.push_str(unsafe { std::str::from_utf8_unchecked(&escape_buf) }); - } else { - // For other escaped characters - result.push('\\'); - result.push(e as char); - } - } else if c.is_control() { - let c_u32 = c as u32; - let buf = [ - b'\\', - b'u', - HEX_DIGITS[((c_u32 >> 12) & 0xF) as usize], - HEX_DIGITS[((c_u32 >> 8) & 0xF) as usize], - HEX_DIGITS[((c_u32 >> 4) & 0xF) as usize], - HEX_DIGITS[(c_u32 & 0xF) as usize], - ]; - // Safety: buf is always valid utf-8 - result.push_str(unsafe { std::str::from_utf8_unchecked(&buf) }); - } else { - result.push(c); - } - } - result.push('"'); + let mut escaped = String::with_capacity(s.len() * 2 + 2); + escaped.push('"'); + simd_escape(s.as_bytes(), &mut escaped); + escaped.push('"'); + escaped } #[test]