[Lazarus] Faster than popcnt [[Re: UTF8LengthFast returning incorrect results on AARCH64 (MacOS)]]
Martin Frb
lazarus at mfriebe.de
Tue Dec 28 23:35:12 CET 2021
On 28/12/2021 15:50, Bart via lazarus wrote:
> On Tue, Dec 28, 2021 at 3:39 PM Marco van de Voort via lazarus
> <lazarus at lists.lazarus-ide.org> wrote:
>
>> On what machine did you test? The settings if for the generated code,
>> but the actual processor determines the effective speed.
> I have a Intel i5 7th generation on my Win10-64 laptop from approx.
> 2017 (so, it's really old for more modern folks than me).
>
> Compiled for 32-bit:
> With -CpCOREI
> Unsigned version with multiplication: 1359
> Unsigned version with PopCnt : 1282
>
I have a core I7-8600
The diff between the old code and popcnt is less significant.
old: 715
pop: 695
But there is a 3rd way, that is faster.
add: 610
"nx" has a single "1" in each of the 8 bytes in a Qword (based on 64bit).
If we regard each of this bytes as an entity of its own, then we can
keep adding those "1".
We could add the 1 of up to 255 iteration, before an overflow can happen.
The example only does 128, as this avoids the "div" and "mod" operations.
The full routine / incl benchmark for all 3 versions is attached
For 64 bit:
bc := (ByteCount-cnt) div sizeof(PtrInt);
for j := 1 to bc >> 7 do begin
nx := 0;
for i := 0 to 127 do
begin
// Count bytes which are NOT the first byte of a character.
nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
inc(pnx);
end;
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
Result := Result + nx;
end;
if (bc and 127) > 0 then begin
nx := 0;
for i := 1 to bc and 127 do
begin
// Count bytes which are NOT the first byte of a character.
nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
inc(pnx);
end;
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
Result := Result + nx;
end;
-------------- next part --------------
program Project1;
{$mode objfpc}{$H+}
uses SysUtils;
function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,cnt,e: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
begin
// Count bytes which are NOT the first byte of a character.
nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
{$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow.
Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
{$pop}
inc(pnx);
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
function UTF8LengthPop(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,cnt,e: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
begin
// Count bytes which are NOT the first byte of a character.
nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
{$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow.
//Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
Result += PopCnt(qword(nx));
{$pop}
inc(pnx);
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
function UTF8LengthAdd(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,j,cnt,e, bc: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
bc := (ByteCount-cnt) div sizeof(PtrInt);
for j := 1 to bc >> 7 do begin
nx := 0;
for i := 0 to 127 do
begin
// Count bytes which are NOT the first byte of a character.
nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
inc(pnx);
end;
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
Result := Result + nx;
end;
if (bc and 127) > 0 then begin
nx := 0;
for i := 1 to bc and 127 do
begin
// Count bytes which are NOT the first byte of a character.
nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
inc(pnx);
end;
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
Result := Result + nx;
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
var
a: ansistring;
t: QWord;
i, j, ii: Integer;
begin
a := 'اربك تكست هو اول موقع يسمح لزواره الكرام بتحويل الكتابة العربي الى كتابة مفهومة من قبل اغلب برامج التصميم مثل الفوتوشوب و';
a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a := a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a := a+a+A+a+a+A+a+a;
writeln(Length(a));
writeln(UTF8LengthFast(@a[1], Length(a)));
writeln(UTF8LengthPop(@a[1], Length(a)));
writeln(UTF8LengthAdd(@a[1], Length(a)));
WriteLn();
writeln(Length(a) div 8);
WriteLn();
for ii := 0 to 1 do begin
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthFast(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('fst ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthPop(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('pop ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthAdd(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('add ',t);
end;
end;
readln;
end.
More information about the lazarus
mailing list