[Lazarus] Faster than popcnt [[Re: UTF8LengthFast returning incorrect results on AARCH64 (MacOS)]]
Marco van de Voort
fpc at pascalprogramming.org
Wed Dec 29 02:10:41 CET 2021
On 28-12-2021 23:35, Martin Frb via lazarus wrote:
>
>
> "nx" has a single "1" in each of the 8 bytes in a Qword (based on 64bit).
> If we regard each of this bytes as an entity of its own, then we can
> keep adding those "1".
I was also thinking in that direction, but more about how to optimize
that loop using SSE2.
Some simple masking achieves the same result (a 1 for each byte whose top
two bits are %10) in 5 instructions, the load included.
Since x86-64 always supports SSE2, this could work:
{$mode objfpc}{$H+}
{$asmmode intel}
uses sysutils,strutils;
// 128 bits of storage as two 64-bit halves; utf8length stores one XMM
// register (16 byte-sized counters) into a variable of this type.
Type int128 = array[0..1] of int64;
const
// $C0 in every byte: ANDed with the input in utf8length to keep only the
// top two bits of each byte.
mask3 : array[0..15] of byte = ( $C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0);
// $80 in every byte: the value a masked byte has when its top two bits
// are %10; utf8length compares the masked input against this.
mask4 : array[0..15] of byte = ( $80,$80,$80,$80,
$80,$80,$80,$80,
$80,$80,$80,$80,
$80,$80,$80,$80);
// $01 in every byte: utf8length ANDs the compare result with this so each
// byte lane holds 1 or 0 before it is added to the running counters.
mask2 : array[0..15] of byte = ( $1,$1,$1,$1,
$1,$1,$1,$1,
$1,$1,$1,$1,
$1,$1,$1,$1);
function utf8length(const s : pchar;var res:int128;len:integer):integer;
// Scans len consecutive 16-byte blocks starting at s and counts, per byte
// lane, how many bytes have their top two bits equal to %10.
//
//   s   : start of the data; len*16 bytes must be readable.
//   res : receives the 16 byte-sized lane counters (lane i counts the
//         bytes at offsets i, i+16, i+32, ...).
//   len : number of 16-byte blocks to accumulate. The lane counters are
//         bytes and are added with paddb, so they wrap above 255 blocks.
//
// Returns the sum of the 16 lane counters. For len <= 0 nothing is read,
// res is zeroed and 0 is returned.
var
  lanes : pbyte;
  i : integer;
begin
  // Guard: the counting loop below is a do-while and would run away on a
  // zero or negative block count.
  if len<=0 then
    begin
      res[0]:=0;
      res[1]:=0;
      exit(0);
    end;
  asm
    // Load the parameters by name instead of relying on the Win64 register
    // assignment (rcx/rdx/r8), so the routine is ABI independent. len is
    // loaded first and as 32 bits: it is a 32-bit integer and the upper
    // half of a 64-bit register holding it is not defined.
    mov r8d,len
    mov rcx,s
    mov rdx,res                // var parameter: this is the address of res
    // movdqu is an SSE2 instruction (lddqu is the SSE3 one), so it is
    // available on every x86-64 CPU.
    movdqu xmm1,[rip+mask3]
    movdqu xmm2,[rip+mask4]
    movdqu xmm3,[rip+mask2]
    pxor xmm4,xmm4             // clear the 16 lane counters
  @lbl:
    movdqu xmm0, [rcx]
    pand xmm0,xmm1             // keep only the top 2 bits ($C0) of each byte
    pcmpeqb xmm0,xmm2          // compare with $80: byte becomes 11111111 or 00000000
    pand xmm0,xmm3             // reduce to lsb (1/0) per byte only
    paddb xmm4,xmm0            // add to the cumulative lane counters
    add rcx,16
    dec r8d
    jne @lbl
    movdqu [rdx],xmm4
  end ['rcx','rdx','r8','xmm0','xmm1','xmm2','xmm3','xmm4'];
  // Horizontal sum of the 16 lane counters gives the function result.
  lanes:=pbyte(@res);
  result:=0;
  for i:=0 to 15 do
    inc(result,lanes[i]);
end;
function countmask(nx:int64):integer;
// Martin's routine that should be replaced by some punpkl magic, but it is
// too late now.
//
// Treats nx as eight independent byte-sized counters and returns their sum.
// The counters are folded pairwise: bytes into 16-bit sums, those into
// 32-bit sums, and finally the two 32-bit sums into the result. Every shift
// is followed by a mask that clears the bits shifted in from above, so the
// sign of nx does not influence the result.
begin
  // 8 x 8-bit counters -> 4 x 16-bit sums
  nx := (nx and $00FF00FF00FF00FF) + ((nx shr 8) and $00FF00FF00FF00FF);
  // 4 x 16-bit sums -> 2 x 32-bit sums
  nx := (nx and $0000FFFF0000FFFF) + ((nx shr 16) and $0000FFFF0000FFFF);
  // 2 x 32-bit sums -> total
  result := (nx and $00000000FFFFFFFF) + ((nx shr 32) and $00000000FFFFFFFF);
end;
// Self-test: fills a string with random bytes drawn from four patterns, one
// for each possible value of the top two bits, counts the %10 ones while
// generating, and compares that count with the SIMD result.
// one of each pattern.
const pattern : array[0..3] of char = (chr(%11001001),chr(%10001001),
chr(%00001001),chr(%01001001));
const testblocks = 5;
var s : string;
i,j,cnt : integer;
r : int128;
begin
  randomize;
  setlength(s,testblocks*16);
  // random string but keep a count of bytes with high value %10
  cnt:=0;
  for i:=0 to testblocks*16-1 do
    begin
      j:=random(4);
      if j=1 then inc(cnt);   // pattern[1] is the only one starting with %10
      s[i+1]:=pattern[j];
    end;
  // The string holds exactly testblocks 16-byte blocks; asking for one more
  // would read 16 bytes past its end and could distort the count.
  utf8length(pchar(s),r,testblocks);
  writeln(cnt,' = ',countmask(r[0])+countmask(r[1]));
  // writeln(inttohex(r[0],16));
  // writeln(inttohex(r[1],16));
end.
More information about the lazarus
mailing list