[Lazarus] Faster than popcnt [[Re: UTF8LengthFast returning incorrect results on AARCH64 (MacOS)]]

Wed Dec 29 02:10:41 CET 2021

On 28-12-2021 23:35, Martin Frb via lazarus wrote:
>
>
> "nx" has a single "1" in each of the 8 bytes in a Qword (based on 64bit).
> If we regard each of this bytes as an entity of its own, then we can 
> keep adding those "1".

I was also thinking in that direction, but more about how to optimize 
that loop using SSE2.

Some simple masking achieves the same (a 1 for each byte whose two top 
bits are %10) in 5 instructions, including the load.

Since 64-bit always supports SSE2, this could work:

{$mode objfpc}{$H+}
{$asmmode intel}

uses sysutils,strutils;

// 16 bytes viewed as two 64-bit halves; utf8length stores one XMM register
// worth of byte counters into a variable of this type.
type
  int128 = array[0..1] of int64;

// 16-byte constants loaded into XMM registers by utf8length. Each one repeats
// a single byte value in all 16 positions.
const
  // $C0 = %11000000: and-ed with the data to keep only the two top bits.
  mask3 : array[0..15] of byte = (
    $C0, $C0, $C0, $C0, $C0, $C0, $C0, $C0,
    $C0, $C0, $C0, $C0, $C0, $C0, $C0, $C0);

  // $80 = %10000000: the value the masked byte is compared against.
  mask4 : array[0..15] of byte = (
    $80, $80, $80, $80, $80, $80, $80, $80,
    $80, $80, $80, $80, $80, $80, $80, $80);

  // $01: reduces the all-ones compare result to a single 1 per byte.
  mask2 : array[0..15] of byte = (
    $01, $01, $01, $01, $01, $01, $01, $01,
    $01, $01, $01, $01, $01, $01, $01, $01);

function utf8length(const s : pchar;var res:int128;len:integer):integer;
// Counts the bytes whose two top bits are %10 in "len" consecutive 16-byte
// blocks starting at "s".
//
//   s   : start of the data. len*16 bytes are read; the loads are unaligned
//         (movdqu), so no alignment is required.
//   res : receives 16 byte-sized counters. Counter k is the number of
//         matching bytes found at offset k of the processed blocks.
//   len : number of 16-byte blocks to process. The counters are added with
//         paddb (8 bits each), so they wrap around if len exceeds 255.
//
// Returns the sum of the 16 counters. For len <= 0 nothing is read, res is
// set to zero and 0 is returned.
var
  lanes : pbyte;
  k     : integer;
begin
  if len <= 0 then
    begin
      // Without this guard the dec/jne loop below would start by
      // decrementing 0 (or a negative count) and run away.
      res[0] := 0;
      res[1] := 0;
      result := 0;
      exit;
    end;

  asm
   // Load the parameters by name instead of assuming they still sit in
   // rcx/rdx/r8: that assumption only matches the Win64 calling convention.
   // rax, r10, r11 and xmm0..xmm4 are caller-saved in both the Win64 and the
   // System V x86-64 ABIs, so nothing has to be preserved here.
   mov    rax,s
   mov    r10,res       // var parameter: this loads the address of res
   mov    r11d,len      // len is a 32-bit integer, so use a 32-bit counter

   movdqu xmm1,[rip+mask3]         // movdqu is SSE2, available on every x86-64 CPU
   movdqu xmm2,[rip+mask4]
   movdqu xmm3,[rip+mask2]
   pxor xmm4,xmm4                  // running counters := 0

@lbl:
   movdqu xmm0, [rax]
   pand  xmm0,xmm1      // keep only the top 2 bits of each byte ($C0)
   pcmpeqb xmm0,xmm2    // compare with $80. sets byte to 11111111 or 00000000
   pand  xmm0,xmm3      // change to lsb (1/0) per byte only.
   paddb  xmm4,xmm0     // add to cumulative

   add rax,16
   dec r11d
   jne @lbl

   movdqu [r10],xmm4
  end;

  // Sum the 16 byte counters for the function result. result is assigned
  // only after the asm block because that block overwrites rax.
  result := 0;
  lanes := pbyte(@res);
  for k := 0 to 15 do
    inc(result, lanes[k]);
end;

function countmask(nx:int64):integer;
// Returns the sum of the eight bytes of nx, each byte taken as an unsigned
// value 0..255 (so the result is in 0..2040).
//
// Martin's routine; it should be replaced by some SSE magic working directly
// on the XMM register, but that has not been done here.
//
// The sum is built by folding: neighbouring lanes are added pairwise, doubling
// the lane width in every step so that no partial sum can overflow its lane.
begin
    // 8 bytes -> four 16-bit sums (each at most 510)
    nx := (nx and $00FF00FF00FF00FF) + ((nx shr  8) and $00FF00FF00FF00FF);
    // four 16-bit sums -> two 32-bit sums
    nx := (nx and $0000FFFF0000FFFF) + ((nx shr 16) and $0000FFFF0000FFFF);
    // two 32-bit sums -> final total
    result := (nx and $00000000FFFFFFFF) + ((nx shr 32) and $00000000FFFFFFFF);
end;

// One sample byte for each possible value of the two top bits.
// Index 1 is the only entry whose top bits are %10.
const
  pattern : array[0..3] of char = (
    #$C9,    // %11001001
    #$89,    // %10001001
    #$09,    // %00001001
    #$49);   // %01001001

  // The test string is testblocks*16 bytes long.
  testblocks = 5;

// Working variables of the test program below.
var s : string;            // random test data, testblocks*16 bytes long
     i,j,cnt : integer;    // i: loop index, j: random pattern index, cnt: reference count of %10 bytes
     r : int128;           // the 16 byte counters written by utf8length

begin
   randomize;
   setlength(s,testblocks*16);
   // Fill the string with randomly chosen pattern bytes and keep a reference
   // count of the bytes whose two top bits are %10 (pattern index 1).
   cnt:=0;
   for i:=1 to length(s) do
     begin
       j:=random(4);
       if j=1 then inc(cnt);
       s[i]:=pattern[j];
     end;

   // The string holds exactly testblocks 16-byte blocks. Passing testblocks+1
   // would make utf8length read (and possibly count) 16 bytes beyond the data.
   utf8length(pchar(s),r,testblocks);

   // Both numbers must be equal: reference count = sum of the 16 byte counters.
   writeln(cnt,' = ',countmask(r[0])+countmask(r[1]));
//  writeln(inttohex(r[0],16));
//  writeln(inttohex(r[1],16));

end.