-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
AArch64: Add implementation for pow2 bitmask division.
This adds an implementation for the new optab for unsigned pow2 bitmask for AArch64. The implementation rewrites: x = y / (2 ^ (sizeof (y)/2) - 1) into e.g. (for bytes) (x + ((x + 257) >> 8)) >> 8 where it's required that the additions be done in double the precision of x such that we don't lose any bits during an overflow. Essentially the sequence decomposes the division into doing two smaller divisions, one for the top and one for the bottom part of the number, and adding the results back together. To account for the fact that a shift by 8 would be a division by 256 we add 1 to both parts of x such that for 255 we still get 1 as the answer. Because the amount we shift by is half the original datatype we can use the halving instructions the ISA provides to do the operation instead of using actual shifts. For AArch64 this means we generate for: void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) { for (int i = 0; i < (n & -16); i+=1) pixel[i] = (pixel[i] * level) / 0xff; } the following: movi v3.16b, 0x1 umull2 v1.8h, v0.16b, v2.16b umull v0.8h, v0.8b, v2.8b addhn v5.8b, v1.8h, v3.8h addhn v4.8b, v0.8h, v3.8h uaddw v1.8h, v1.8h, v5.8b uaddw v0.8h, v0.8h, v4.8b uzp2 v0.16b, v0.16b, v1.16b instead of: umull v2.8h, v1.8b, v5.8b umull2 v1.8h, v1.16b, v5.16b umull v0.4s, v2.4h, v3.4h umull2 v2.4s, v2.8h, v3.8h umull v4.4s, v1.4h, v3.4h umull2 v1.4s, v1.8h, v3.8h uzp2 v0.8h, v0.8h, v2.8h uzp2 v1.8h, v4.8h, v1.8h shrn v0.8b, v0.8h, 7 shrn2 v0.16b, v1.8h, 7 which results in significantly faster code. Thanks to Wilco for the concept. gcc/ChangeLog: * config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): New. * config/aarch64/aarch64.cc (aarch64_vectorize_can_special_div_by_constant): New. gcc/testsuite/ChangeLog: * gcc.target/aarch64/div-by-bitmask.c: New test.
- Loading branch information
1 parent
8beff04
commit c98aabc
Showing
3 changed files
with
156 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
/* { dg-do compile } */ | ||
/* { dg-additional-options "-O3 -std=c99" } */ | ||
/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ | ||
|
||
#include <stdint.h> | ||
|
||
#pragma GCC target "+nosve" | ||
|
||
/* | ||
** draw_bitmap1: | ||
** ... | ||
** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h | ||
** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h | ||
** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b | ||
** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b | ||
** uzp2 v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b | ||
** ... | ||
*/ | ||
/* Scale 8-bit pixels in place: each of the first (n & -16) elements of
   PIXEL is multiplied by LEVEL and the product is divided by 0xff.

   The bound (n & -16) clears the low four bits of N, i.e. it rounds N
   down to a multiple of 16; the loop does not run when that is <= 0.
   Presumably this keeps the trip count a multiple of the 16-byte vector
   width - confirm against the vectoriser's behaviour.

   The divisor 0xff is 2^8 - 1; the expected-body pattern preceding this
   function requires the addhn/uaddw/uzp2 sequence for this loop.  */
void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) | ||
{ | ||
for (int i = 0; i < (n & -16); i+=1) | ||
pixel[i] = (pixel[i] * level) / 0xff; | ||
} | ||
|
||
/* Same loop shape as draw_bitmap1, but the product is divided by 0xfe
   instead of 0xff.

   0xfe is not of the form 2^k - 1, and no expected-body pattern is
   attached to this function in the visible source.
   NOTE(review): presumably a negative case, checking that the bitmask
   division sequence is only used for all-ones divisors - confirm.  */
void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) | ||
{ | ||
for (int i = 0; i < (n & -16); i+=1) | ||
pixel[i] = (pixel[i] * level) / 0xfe; | ||
} | ||
|
||
/* | ||
** draw_bitmap3: | ||
** ... | ||
** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s | ||
** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s | ||
** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h | ||
** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h | ||
** uzp2 v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h | ||
** ... | ||
*/ | ||
/* 16-bit variant: each of the first (n & -16) elements of PIXEL is
   multiplied by LEVEL and the product is divided by 0xffffU (2^16 - 1).
   The expected-body pattern preceding this function requires the
   addhn/uaddw/uzp2 sequence on 4s/4h/8h vector arrangements.

   NOTE(review): both uint16_t operands of the multiplication undergo
   integer promotion, so - assuming a 32-bit int - the product is
   computed as a signed int and can overflow for large operands before
   the unsigned division sees it.  This test is compile-only, but confirm
   the overflow is intentional before reusing the expression.  */
void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) | ||
{ | ||
for (int i = 0; i < (n & -16); i+=1) | ||
pixel[i] = (pixel[i] * level) / 0xffffU; | ||
} | ||
|
||
/* | ||
** draw_bitmap4: | ||
** ... | ||
** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d | ||
** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d | ||
** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s | ||
** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s | ||
** uzp2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s | ||
** ... | ||
*/ | ||
/* 32-bit variant: each of the first (n & -16) elements of PIXEL is
   multiplied by LEVEL and the product is divided by 0xffffffffUL
   (2^32 - 1).  The expected-body pattern preceding this function
   requires the addhn/uaddw/uzp2 sequence on 2d/2s/4s vector
   arrangements.

   LEVEL is cast to uint64_t so that the multiplication is carried out in
   64 bits and the full 32x32-bit product is kept before the division;
   the quotient is then narrowed back to uint32_t by the store.  */
void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) | ||
{ | ||
for (int i = 0; i < (n & -16); i+=1) | ||
pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; | ||
} | ||