Clean up a few issues with ARM-optimized varint decoding.
1) Move the check for an invalid varint outside of the handling for a >= 32-bit varint. This gives a modest speedup (+0.5%) on the [fleetbench protogen benchmark](https://github.com/google/fleetbench/tree/main/fleetbench/proto) by working around an LLVM issue that inserted a redundant branch.
2) Move ValueBarrier() to parse_context.h so we can delete ForceToRegister(), which does the same thing as the single-value ValueBarrier() implementation.

PiperOrigin-RevId: 504883550
protobuf-github-bot authored and copybara-github committed Jan 26, 2023
1 parent 7508850 commit bbe2e68
Showing 2 changed files with 24 additions and 34 deletions.
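
Both ForceToRegister() and the single-value ValueBarrier() that replaces it are empty inline-asm statements with a "+r" constraint: they emit no instructions, but they force the value into a register and keep the compiler from folding or reordering dependent code across that point. Below is a minimal standalone sketch of the idiom, assuming a GCC/Clang toolchain; the Barrier name, the example values, and the main() driver are illustrative and are not part of the protobuf sources.

#include <cstdint>
#include <cstdio>

// Falsely indicate that `value` may be modified here. The empty asm emits no
// instructions, but the "+r" constraint forces the value into a register and
// prevents dependent code from being folded or scheduled across this point.
template <typename T>
inline T Barrier(T value) {
  asm("" : "+r"(value));
  return value;
}

int main() {
  // The constant mask is materialized in a register before use instead of
  // being folded into the expression below, mirroring what the ARM varint
  // fast path does with ValueBarrier(0x8080808080808080).
  uint64_t mask = Barrier(uint64_t{0x8080808080808080u});
  uint64_t last8 = 0x00000000000002ACu;  // two example varint bytes, LE order
  // Bit 7 of the result is set in every byte position whose continuation bit
  // is clear in last8.
  uint64_t masked_cont_bits = Barrier(mask & ~last8);
  std::printf("masked_cont_bits = %016llx\n",
              static_cast<unsigned long long>(masked_cont_bits));
  return 0;
}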
src/google/protobuf/parse_context.cc (10 additions & 28 deletions)
@@ -677,27 +677,12 @@ const char* UnknownFieldParse(uint32_t tag, std::string* unknown,
 // ABSL_LOG(FATAL) << "Invalid num_bits value";
 // }
 // ```
-template <typename V1Type, typename V2Type>
-PROTOBUF_ALWAYS_INLINE inline V1Type ValueBarrier(V1Type value1,
-                                                  V2Type value2) {
-  asm("" : "+r"(value1) : "r"(value2));
-  return value1;
-}
-
-// Falsely indicate that the specific value is modified at this location. This
-// prevents code which depends on this value from being scheduled earlier.
-template <typename V1Type>
-PROTOBUF_ALWAYS_INLINE inline V1Type ValueBarrier(V1Type value1) {
-  asm("" : "+r"(value1));
-  return value1;
-}
-
 PROTOBUF_ALWAYS_INLINE inline uint64_t ExtractAndMergeTwoChunks(
     uint64_t data, uint64_t first_byte) {
   ABSL_DCHECK_LE(first_byte, 6);
   uint64_t first = Ubfx7(data, first_byte * 8);
   uint64_t second = Ubfx7(data, (first_byte + 1) * 8);
-  return ForceToRegister(first | (second << 7));
+  return ValueBarrier(first | (second << 7));
 }
 
 struct SlowPathEncodedInfo {
@@ -716,9 +701,9 @@ PROTOBUF_ALWAYS_INLINE inline SlowPathEncodedInfo ComputeLengthAndUpdateP(
   SlowPathEncodedInfo result;
   // Load the last two bytes of the encoded Varint.
   std::memcpy(&result.last8, p + 2, sizeof(result.last8));
-  uint64_t mask = ForceToRegister(0x8080808080808080);
+  uint64_t mask = ValueBarrier(0x8080808080808080);
   // Only set continuation bits remain
-  result.masked_cont_bits = ForceToRegister(mask & (~result.last8));
+  result.masked_cont_bits = ValueBarrier(mask & (~result.last8));
   // The first cleared continuation bit is the most significant 1 in the
   // reversed value. Result is undefined for an input of 0 and we handle that
   // case below.
@@ -758,9 +743,11 @@ PROTOBUF_NOINLINE const char* VarintParseSlowArm64(const char* p, uint64_t* out,
                     merged_45 << kFirstResultBitChunk4;
   // This immediate ends in 14 zeroes since valid_chunk_bits is too low by 14.
   uint64_t result_mask = kResultMaskUnshifted << info.valid_chunk_bits;
-  // masked_cont_bits is 0 iff the Varint is invalid. In that case
-  info.valid_bits =
-      info.masked_cont_bits ? info.valid_bits : kValidBitsForInvalidVarint;
+  // masked_cont_bits is 0 iff the Varint is invalid.
+  if (PROTOBUF_PREDICT_FALSE(!info.masked_cont_bits)) {
+    *out = 0;
+    return nullptr;
+  }
   // Test for early exit if Varint does not exceed 6 chunks. Branching on one
   // bit is faster on ARM than via a compare and branch.
   if (PROTOBUF_PREDICT_FALSE((info.valid_bits & 0x20) != 0)) {
@@ -772,11 +759,6 @@ PROTOBUF_NOINLINE const char* VarintParseSlowArm64(const char* p, uint64_t* out,
     result |= merged_67 << kFirstResultBitChunk6;
     result |= merged_89 << kFirstResultBitChunk8;
     // Handle an invalid Varint with all 10 continuation bits set.
-    info.masked_cont_bits = ValueBarrier(info.masked_cont_bits);
-    if (PROTOBUF_PREDICT_FALSE(info.masked_cont_bits == 0)) {
-      *out = 0;
-      return nullptr;
-    }
   }
   // Mask off invalid data bytes.
   result &= ~result_mask;
@@ -795,8 +777,8 @@ PROTOBUF_NOINLINE const char* VarintParseSlowArm32(const char* p, uint32_t* out,
   uint64_t merged_34 = ExtractAndMergeTwoChunks(first8, /*first_chunk=*/3);
   first8 = ValueBarrier(first8, p);
   uint64_t result = Ubfx7(first8, /*start=*/0);
-  result = ForceToRegister(result | merged_12 << kFirstResultBitChunk1);
-  result = ForceToRegister(result | merged_34 << kFirstResultBitChunk3);
+  result = ValueBarrier(result | merged_12 << kFirstResultBitChunk1);
+  result = ValueBarrier(result | merged_34 << kFirstResultBitChunk3);
   uint64_t result_mask = kResultMaskUnshifted << info.valid_chunk_bits;
   result &= ~result_mask;
   // It is extremely unlikely that a Varint is invalid so checking that
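
Two of the hunks above implement point 1 of the description: the invalid-varint test now runs once, up front, instead of being folded into valid_bits and re-checked inside the rarely taken >= 32-bit path. Below is a standalone sketch of the hoisted test, assuming a GCC/Clang toolchain; PredictFalse stands in for PROTOBUF_PREDICT_FALSE (assumed to wrap __builtin_expect), and WindowHasTerminatingByte is an illustrative stand-in for the real parser's early exit.

#include <cstdint>
#include <cstdio>

#define PredictFalse(x) (__builtin_expect(!!(x), 0))

// masked_cont_bits is zero only when every byte in the loaded window still has
// its continuation bit set; together with the two bytes that routed parsing to
// the slow path, that means all 10 continuation bits are set and the varint is
// invalid, so the parser can bail out immediately.
bool WindowHasTerminatingByte(uint64_t last8) {
  uint64_t mask = 0x8080808080808080u;
  uint64_t masked_cont_bits = mask & ~last8;
  if (PredictFalse(masked_cont_bits == 0)) {
    return false;  // invalid varint: fail early, as the restructured code does
  }
  return true;
}

int main() {
  std::printf("%d %d\n",
              WindowHasTerminatingByte(0x00000000000002ACu),   // valid
              WindowHasTerminatingByte(0xFFFFFFFFFFFFFFFFu));  // invalid
  return 0;
}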
src/google/protobuf/parse_context.h (14 additions & 6 deletions)
@@ -630,17 +630,25 @@ inline const char* VarintParseSlowArm(const char* p, uint64_t* out,
   return VarintParseSlowArm64(p, out, first8);
 }
 
-// Moves data into a register and returns data. This impacts the compiler's
-// scheduling and instruction selection decisions.
-static PROTOBUF_ALWAYS_INLINE inline uint64_t ForceToRegister(uint64_t data) {
-  asm("" : "+r"(data));
-  return data;
+// Falsely indicate that the specific value is modified at this location. This
+// prevents code which depends on this value from being scheduled earlier.
+template <typename V1Type>
+PROTOBUF_ALWAYS_INLINE inline V1Type ValueBarrier(V1Type value1) {
+  asm("" : "+r"(value1));
+  return value1;
 }
 
+template <typename V1Type, typename V2Type>
+PROTOBUF_ALWAYS_INLINE inline V1Type ValueBarrier(V1Type value1,
+                                                  V2Type value2) {
+  asm("" : "+r"(value1) : "r"(value2));
+  return value1;
+}
+
 // Performs a 7 bit UBFX (Unsigned Bit Extract) starting at the indicated bit.
 static PROTOBUF_ALWAYS_INLINE inline uint64_t Ubfx7(uint64_t data,
                                                     uint64_t start) {
-  return ForceToRegister((data >> start) & 0x7f);
+  return ValueBarrier((data >> start) & 0x7f);
 }
 
 #endif  // __aarch64__
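
The helpers that stay in parse_context.h are easiest to follow with a concrete value. The sketch below re-implements Ubfx7() and ExtractAndMergeTwoChunks() without the ValueBarrier() call so it stays portable, and checks them against a naive two-byte varint decode; the byte values and the main() driver are illustrative, and the memcpy load assumes a little-endian host, as the ARM64 fast path does.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// 7-bit unsigned bit-field extract starting at bit `start`, as in the diff,
// minus the barrier so the sketch compiles anywhere.
inline uint64_t Ubfx7(uint64_t data, uint64_t start) {
  return (data >> start) & 0x7f;
}

// Merge the 7-bit payloads of two adjacent varint bytes, low chunk first.
inline uint64_t ExtractAndMergeTwoChunks(uint64_t data, uint64_t first_byte) {
  uint64_t first = Ubfx7(data, first_byte * 8);
  uint64_t second = Ubfx7(data, (first_byte + 1) * 8);
  return first | (second << 7);
}

int main() {
  // 300 encoded as a varint: 0xAC (low 7 bits plus continuation bit), 0x02.
  const unsigned char bytes[8] = {0xAC, 0x02, 0, 0, 0, 0, 0, 0};
  uint64_t data;
  std::memcpy(&data, bytes, sizeof(data));  // little-endian load, as on ARM64

  uint64_t merged = ExtractAndMergeTwoChunks(data, /*first_byte=*/0);
  uint64_t naive = (bytes[0] & 0x7fu) | (uint64_t{bytes[1] & 0x7fu} << 7);
  assert(merged == naive && merged == 300);
  std::printf("decoded %llu\n", static_cast<unsigned long long>(merged));
  return 0;
}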
