[Lazarus] Faster than popcnt [[Re: UTF8LengthFast returning incorrect results on AARCH64 (MacOS)]]

Thu Dec 30 01:29:14 CET 2021

On 29-12-2021 00:00, Bart via lazarus wrote:
> On Tue, Dec 28, 2021 at 11:35 PM Martin Frb via lazarus
> <lazarus at lists.lazarus-ide.org> wrote:
>
>> I have a core I7-8600
>> The diff between the old code and popcnt is less significant.
>>
>> old: 715
>> pop: 695
>>
>> But there is a 3rd way, that is faster.
>> add: 610
> Not surprising that you should come up with a faster solution.
> IIRC you won both speed contests I had on the forum ;-)
>
> Feel free to implement it in LazUtf8.

New unit test, with Martin's integrated. If I play with godbolt, Ryzen 
zen3 (ryzen 5x00X) is nearly twice as fast in cycles as my Ivy Bridge, 
so I would like to see some benchmarks from various processors. Also 
from very old ones (P4 and Clawhammers) to test instruction sets.

I use unaligned loads which afaik on older ( pre Core 1st or 2nd 
generation) CPUs are costly. (because it loads two caches lines per time)

-------------- next part --------------
//
// (C) 2021 Martin Friebe and Marco van de Voort.
// attempt to accelerate utf8lengthfast which is a length(s) in utf8 codepoints without integrity checking
//
// 4 versions.
// - Original,
// - with popcount and
// - the "add" variant that accumulates 127 iterations of ptrints and only adds
//         the intermeidates outside that loop
// - a SSE2 version loosely inspired by the add variant combined with
//        the core of an existing (branchless) binarization routine for the main loop.

{$mode objfpc}{$H+}
{$asmmode intel}
{define asmdebug}

uses SysUtils,StrUtils;

const   
      mask3       :  array[0..15] of byte  = (   $C0,$C0,$C0,$C0,
                                                 $C0,$C0,$C0,$C0,
                                                 $C0,$C0,$C0,$C0,
                                                 $C0,$C0,$C0,$C0);

      mask4       :  array[0..15] of byte  = (   $80,$80,$80,$80,
                                                 $80,$80,$80,$80,
                                                 $80,$80,$80,$80,
                                                 $80,$80,$80,$80);

      mask2       :  array[0..15] of byte  = (   $1,$1,$1,$1,
						 $1,$1,$1,$1,
                                                 $1,$1,$1,$1,
                                                 $1,$1,$1,$1);

// Integer arguments are passed in registers RCX, RDX, R8, and R9.
// Floating point arguments are passed in XMM0L, XMM1L, XMM2L, and XMM3L.
// volatile: RAX, RCX, RDX, R8, R9, R10, R11
// nonvolatile RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 are considered nonvolatile
// volatile xmm0-xmm3 (params) en xmm4,5
// https://msdn.microsoft.com/en-us/library/ms235286.aspx

{$ifdef asmdebug}
function asmutf8length(const s : pchar;res:pbyte;len:integer):int64;
{$else}
function asmutf8length(const s : pchar;len:integer):int64;
{$endif}

begin
 asm
  {$ifndef asmdebug}
    mov r8,rdx
  {$endif}

  // using broadcast etc raises requirements?
  mov rax,r8
  mov r9,r8
  // tuning for short strings:
  // ------
  // test rax,rax
  // je @theend
  // cmp r9,128 // difference between long and short
  // jl @restbytes

  and r9,15
  shr r8,4
  pxor xmm5,xmm5   // always zero
  pxor xmm6,xmm6   // dword counts

  movdqu xmm1,[rip+mask3]
  movdqu xmm2,[rip+mask4]
  movdqu xmm3,[rip+mask2]

  test r8,r8
  je @restbytes

@outer:
  mov r10,127            // max iterations
  cmp r10,r8            // more or less left?
  jl @last               // more
  mov r10,r8             // less
  @last:
  sub r8,r10             // iterations left - iterations to do

  pxor xmm4,xmm4

@inner:
  movdqu xmm0, [rcx]
  pand  xmm0,xmm1      // mask out top 2 bits   
  pcmpeqb xmm0,xmm2    // compare with $80. 
  pand  xmm0,xmm3      // change to $1 per byte.
  paddb  xmm4,xmm0     // add to cumulative

  add rcx,16
  dec r10
  jne @inner

  // process 127 iterations

  movdqa xmm0,xmm4

  PUNPCKLBW xmm0,xmm5           // zero extend to words
  PUNPCKHBW xmm4,xmm5
  paddw xmm0,xmm4               // add, now 8 16-bit words.

  movdqa xmm4,xmm0
  PUNPCKLWD xmm0,xmm5           // zero extend to dwords
  paddd xmm6,xmm0
  PUNPCKHWD  xmm4,xmm5
  paddd xmm6,xmm4               // add both to cumulative
  test r8,r8
  jne @outer

  MOVHLPS  xmm4,xmm6            // move high 8 bytes to low (float->int penalty?)

  paddd xmm6,xmm4               // add both 2*dwords (high doesn't matter)

  pshufd xmm4,xmm6,1           // mov 2nd dword in xmm6 to first in xmm4
  paddd xmm6,xmm4              // add

  movd r8d,xmm6                // to int alu reg
  sub  rax,r8                  // subtract from length in bytes.
@restbytes:
  test r9,r9
  je @theend                   // Done!
@restloop:
  movzx r8d, byte [rcx]        // unaligned bytes after sse loop
  mov r10,r8
  shr r10,7
  not r8
  shr r8,6
  and r10,r8
  sub rax,r10
  inc rcx
  dec r9
  jne @restloop

@theend:
end['xmm5','xmm6']; // volatile registers used.
end;

function countmask(nx:int64):integer;

begin
   nx := (nx and $00FF00FF00FF00FF) + ((nx >>  8) and $00FF00FF00FF00FF);
   nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
   result := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF); 
end;

function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
  ONEMASK   =$01010101;
  EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
  ONEMASK   =$0101010101010101;
  EIGHTYMASK=$8080808080808080;
{$endif}
var
  pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
  pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
  ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
  nx: PtrInt;              // values processed in block loop
  i,cnt,e: PtrInt;
begin
  Result := 0;
  e := ix+ByteCount; // End marker
  // Handle any initial misaligned bytes.
  cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
  if cnt>ByteCount then
    cnt := ByteCount;
  for i := 1 to cnt do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  // Handle complete blocks
  for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
  begin
    // Count bytes which are NOT the first byte of a character.
    nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
    {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow.
    Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
    {$pop}
    inc(pnx);
  end;
  // Take care of any left-over bytes.
  while ix<e do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  Result := ByteCount - Result;
end;

function UTF8LengthPop(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
  ONEMASK   =$01010101;
  EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
  ONEMASK   =$0101010101010101;
  EIGHTYMASK=$8080808080808080;
{$endif}
var
  pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
  pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
  ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
  nx: PtrInt;              // values processed in block loop
  i,cnt,e: PtrInt;
begin
  Result := 0;
  e := ix+ByteCount; // End marker
  // Handle any initial misaligned bytes.
  cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
  if cnt>ByteCount then
    cnt := ByteCount;
  for i := 1 to cnt do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  // Handle complete blocks
  for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
  begin
    // Count bytes which are NOT the first byte of a character.
    nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
    {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow.
    //Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
    Result += PopCnt(qword(nx));
    {$pop}
    inc(pnx);
  end;
  // Take care of any left-over bytes.
  while ix<e do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  Result := ByteCount - Result;
end;

function UTF8LengthAdd(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
  ONEMASK   =$01010101;
  EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
  ONEMASK   =$0101010101010101;
  EIGHTYMASK=$8080808080808080;
{$endif}
var
  pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
  pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
  ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
  nx: PtrInt;              // values processed in block loop
  i,j,cnt,e, bc: PtrInt;
begin
  Result := 0;
  e := ix+ByteCount; // End marker
  // Handle any initial misaligned bytes.
  cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
  if cnt>ByteCount then
    cnt := ByteCount;
  for i := 1 to cnt do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  // Handle complete blocks

  bc := (ByteCount-cnt) div sizeof(PtrInt);
  for j := 1 to bc >> 7 do begin
    nx := 0;
    for i := 0 to 127 do
    begin
      // Count bytes which are NOT the first byte of a character.
      nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
      inc(pnx);
    end;
    nx := (nx and $00FF00FF00FF00FF) + ((nx >>  8) and $00FF00FF00FF00FF);
    nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
    nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
    Result := Result + nx;
  end;

  if (bc and 127) > 0 then begin
  nx := 0;
  for i := 1 to bc and 127 do
  begin
    // Count bytes which are NOT the first byte of a character.
    nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
    inc(pnx);
  end;
    nx := (nx and $00FF00FF00FF00FF) + ((nx >>  8) and $00FF00FF00FF00FF);
    nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
    nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
    Result := Result + nx;
  end;

  // Take care of any left-over bytes.
  while ix<e do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  Result := ByteCount - Result;
end;

// one of each pattern.
const pattern : array[0..3] of char = (chr(%11001001),chr(%10001001),
                                       chr(%00001001),chr(%01001001));

function pseudorandomutf8string(len:integer;var cnt:integer):string;
// random string but keep a count of bytes with high value %10
var lcnt:integer;
    i,j:integer;
begin
  setlength(result,len);

  lcnt:=0;
  for i:=1 to length(result) do
    begin
      j:=random(4);
      //j:=i and 3;
      if j=1 then inc(lcnt);
      result[i]:=pattern[j];
    end;
  cnt:=length(result)-lcnt;
end;

var   r : array[0..10000] of byte; // FPC "registers" dialog is poor, we use this for writeln like dumping

procedure testasmutf8length;

const testlen = 64*100;

var s : string;
    i,j,cnt : integer;    
    rx : int64;

begin
  randomize;

  s:=pseudorandomutf8string(testlen+Random(50),cnt);

  rx:=asmutf8length(pchar(s),{$ifdef asmdebug}@r[0],{$endif}length(s));

  writeln(inttohex(cnt,2),' = ',inttohex(rx,2),' ',inttohex(length(s)-cnt,2),' = ',inttohex(length(s)-rx,2));  // hex because most register dumps are easier in hex.
  {$ifdef asmdebug}
  for i:=0 to 6 do
      begin
        write(i:2,' ');
        for j:=0 to 3 do
           write(inttohex(pdword(@r[i*16+j*4])^,8), ' ');
        writeln;
      end;
  {$endif}
end;

  var
  a: ansistring;
  t: QWord;
  i, j, ii: Integer;
begin
  {$ifdef asmdebug}
     testasmutf8length;
  {$else}
  a := 'اربك تكست هو اول موقع يسمح لزواره الكرام بتحويل الكتابة العربي الى كتابة مفهومة من قبل اغلب برامج التصميم مثل الفوتوشوب و';
  a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
  a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
  a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
  a := a+a+A+a+a+A+a+a;
  writeln(Length(a));
  writeln('fst:',UTF8LengthFast(@a[1], Length(a)));
  writeln('pop:',UTF8LengthPop(@a[1], Length(a)));
  writeln('add:',UTF8LengthAdd(@a[1], Length(a)));
  writeln('asm:',asmUTF8Length(@a[1], {$ifdef asmdebug}@r[0],{$endif} Length(a)));
  WriteLn();
  writeln(Length(a) div 8);

  WriteLn();

  for ii := 0 to 1 do begin
  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      UTF8LengthFast(@a[1], Length(a));
    t := GetTickCount64 - t;
    writeln('fst ',t);
  end;

  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      UTF8LengthPop(@a[1], Length(a));
    t := GetTickCount64 - t;
    writeln('pop ',t);
  end;

  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      UTF8LengthAdd(@a[1], Length(a));
    t := GetTickCount64 - t;
    writeln('add ',t);
  end;

  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      asmUTF8Length(@a[1], {$ifdef asmdebug}@r[0],{$endif} Length(a));
    t := GetTickCount64 - t;
    writeln('asm ',t);
  end;

  end;
    {$endif}
 {$ifndef FPC}
 if debughook<>nil then // runtime debugger detection
 {$endif}
   readln;
end.