[Lazarus] Faster than popcnt [[Re: UTF8LengthFast returning incorrect results on AARCH64 (MacOS)]]

Tue Dec 28 23:35:12 CET 2021

On 28/12/2021 15:50, Bart via lazarus wrote:
> On Tue, Dec 28, 2021 at 3:39 PM Marco van de Voort via lazarus
> <lazarus at lists.lazarus-ide.org> wrote:
>
>> On what machine did you test? The settings if for the generated code,
>> but the actual processor determines the effective speed.
> I have a Intel i5 7th generation on my Win10-64 laptop from approx.
> 2017 (so, it's really old for more modern folks than me).
>
> Compiled for 32-bit:
> With -CpCOREI
> Unsigned version with multiplication: 1359
> Unsigned version with PopCnt        : 1282
>

I have a core I7-8600
The diff between the old code and popcnt is less significant.

old: 715
pop: 695

But there is a 3rd way, that is faster.
add: 610

"nx" has a single "1" in each of the 8 bytes in a Qword (based on 64bit).
If we regard each of this bytes as an entity of its own, then we can 
keep adding those "1".

We could add the 1 of up to 255 iteration, before an overflow can happen.
The example only does 128, as this avoids the "div" and "mod" operations.

The full routine / incl benchmark for all 3 versions is attached

For 64 bit:

   bc := (ByteCount-cnt) div sizeof(PtrInt);
   for j := 1 to bc >> 7 do begin
     nx := 0;
     for i := 0 to 127 do
     begin
       // Count bytes which are NOT the first byte of a character.
       nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
       inc(pnx);
     end;
     nx := (nx and $00FF00FF00FF00FF) + ((nx >>  8) and $00FF00FF00FF00FF);
     nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
     nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
     Result := Result + nx;
   end;

   if (bc and 127) > 0 then begin
   nx := 0;
   for i := 1 to bc and 127 do
   begin
     // Count bytes which are NOT the first byte of a character.
     nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
     inc(pnx);
   end;
     nx := (nx and $00FF00FF00FF00FF) + ((nx >>  8) and $00FF00FF00FF00FF);
     nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
     nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
     Result := Result + nx;
   end;

-------------- next part --------------
program Project1;
{$mode objfpc}{$H+}

uses SysUtils;

function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
  ONEMASK   =$01010101;
  EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
  ONEMASK   =$0101010101010101;
  EIGHTYMASK=$8080808080808080;
{$endif}
var
  pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
  pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
  ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
  nx: PtrInt;              // values processed in block loop
  i,cnt,e: PtrInt;
begin
  Result := 0;
  e := ix+ByteCount; // End marker
  // Handle any initial misaligned bytes.
  cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
  if cnt>ByteCount then
    cnt := ByteCount;
  for i := 1 to cnt do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  // Handle complete blocks
  for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
  begin
    // Count bytes which are NOT the first byte of a character.
    nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
    {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow.
    Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
    {$pop}
    inc(pnx);
  end;
  // Take care of any left-over bytes.
  while ix<e do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  Result := ByteCount - Result;
end;

function UTF8LengthPop(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
  ONEMASK   =$01010101;
  EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
  ONEMASK   =$0101010101010101;
  EIGHTYMASK=$8080808080808080;
{$endif}
var
  pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
  pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
  ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
  nx: PtrInt;              // values processed in block loop
  i,cnt,e: PtrInt;
begin
  Result := 0;
  e := ix+ByteCount; // End marker
  // Handle any initial misaligned bytes.
  cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
  if cnt>ByteCount then
    cnt := ByteCount;
  for i := 1 to cnt do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  // Handle complete blocks
  for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
  begin
    // Count bytes which are NOT the first byte of a character.
    nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
    {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow.
    //Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
    Result += PopCnt(qword(nx));
    {$pop}
    inc(pnx);
  end;
  // Take care of any left-over bytes.
  while ix<e do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  Result := ByteCount - Result;
end;

function UTF8LengthAdd(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
  ONEMASK   =$01010101;
  EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
  ONEMASK   =$0101010101010101;
  EIGHTYMASK=$8080808080808080;
{$endif}
var
  pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
  pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
  ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
  nx: PtrInt;              // values processed in block loop
  i,j,cnt,e, bc: PtrInt;
begin
  Result := 0;
  e := ix+ByteCount; // End marker
  // Handle any initial misaligned bytes.
  cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
  if cnt>ByteCount then
    cnt := ByteCount;
  for i := 1 to cnt do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  // Handle complete blocks

  bc := (ByteCount-cnt) div sizeof(PtrInt);
  for j := 1 to bc >> 7 do begin
    nx := 0;
    for i := 0 to 127 do
    begin
      // Count bytes which are NOT the first byte of a character.
      nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
      inc(pnx);
    end;
    nx := (nx and $00FF00FF00FF00FF) + ((nx >>  8) and $00FF00FF00FF00FF);
    nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
    nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
    Result := Result + nx;
  end;

  if (bc and 127) > 0 then begin
  nx := 0;
  for i := 1 to bc and 127 do
  begin
    // Count bytes which are NOT the first byte of a character.
    nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
    inc(pnx);
  end;
    nx := (nx and $00FF00FF00FF00FF) + ((nx >>  8) and $00FF00FF00FF00FF);
    nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
    nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
    Result := Result + nx;
  end;

  // Take care of any left-over bytes.
  while ix<e do
  begin
    // Is this byte NOT the first byte of a character?
    Result += (pn8^ shr 7) and ((not pn8^) shr 6);
    inc(pn8);
  end;
  Result := ByteCount - Result;
end;

var
  a: ansistring;
  t: QWord;
  i, j, ii: Integer;
begin
  a := 'اربك تكست هو اول موقع يسمح لزواره الكرام بتحويل الكتابة العربي الى كتابة مفهومة من قبل اغلب برامج التصميم مثل الفوتوشوب و';
  a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
  a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
  a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
  a := a+a+A+a+a+A+a+a;
  writeln(Length(a));
  writeln(UTF8LengthFast(@a[1], Length(a)));
  writeln(UTF8LengthPop(@a[1], Length(a)));
  writeln(UTF8LengthAdd(@a[1], Length(a)));
  WriteLn();
  writeln(Length(a) div 8);

  WriteLn();

  for ii := 0 to 1 do begin
  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      UTF8LengthFast(@a[1], Length(a));
    t := GetTickCount64 - t;
    writeln('fst ',t);
  end;

  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      UTF8LengthPop(@a[1], Length(a));
    t := GetTickCount64 - t;
    writeln('pop ',t);
  end;

  for i := 0 to 3 do begin
    t := GetTickCount64;
    for j := 0 to 19 do
      UTF8LengthAdd(@a[1], Length(a));
    t := GetTickCount64 - t;
    writeln('add ',t);
  end;

  end;

  readln;

end.