[Lazarus] UTF-8 string recognition
Robin Hoo
robin.hoo.cn at gmail.com
Wed Mar 3 00:24:35 CET 2010
Hi, Antonio
Please check the function I use to test whether a string is UTF-8. I hope it is helpful.
{ Returns true when UnknownStr consists solely of well-formed UTF-8
  sequences (no overlong encodings, no UTF-16 surrogates, nothing above
  U+10FFFF) and, within the ASCII range, only TAB, LF, CR and the
  printable characters #$20..#$7E. Any other byte, including other control
  characters and #$7F, makes the result false. An empty string is valid.

  Every byte up to and including the last one is validated, and a
  multi-byte sequence that is cut off by the end of the string is
  rejected instead of being read past the end of the string. }
function IsUTF8(UnknownStr:string):boolean;
var
  i, len, seqLen: Integer;

  { True when position idx lies inside the string and the byte there is
    within lo..hi. The bounds test is a separate statement so that the
    string is never indexed out of range, even with full boolean
    evaluation enabled. }
  function ByteIn(idx: Integer; lo, hi: Char): Boolean;
  begin
    ByteIn := false;
    if idx <= len then
      ByteIn := (UnknownStr[idx] >= lo) and (UnknownStr[idx] <= hi);
  end;

  { True when position idx holds a UTF-8 continuation byte (#$80..#$BF). }
  function IsCont(idx: Integer): Boolean;
  begin
    IsCont := ByteIn(idx, #$80, #$BF);
  end;

begin
  len := length(UnknownStr);
  i := 1;
  // "<=" so that the final byte of the string is validated as well
  while i <= len do
  begin
    // seqLen stays 0 unless a complete, valid sequence starts at i
    seqLen := 0;
    case UnknownStr[i] of
      // ASCII: TAB, LF, CR and printable characters only
      #$09, #$0A, #$0D, #$20..#$7E:
        seqLen := 1;
      // non-overlong 2-byte
      #$C2..#$DF:
        if IsCont(i+1) then
          seqLen := 2;
      // 3-byte, excluding overlongs
      #$E0:
        if ByteIn(i+1, #$A0, #$BF) and IsCont(i+2) then
          seqLen := 3;
      // straight 3-byte
      #$E1..#$EC, #$EE, #$EF:
        if IsCont(i+1) and IsCont(i+2) then
          seqLen := 3;
      // 3-byte, excluding surrogates
      #$ED:
        if ByteIn(i+1, #$80, #$9F) and IsCont(i+2) then
          seqLen := 3;
      // planes 1-3
      #$F0:
        if ByteIn(i+1, #$90, #$BF) and IsCont(i+2) and IsCont(i+3) then
          seqLen := 4;
      // planes 4-15
      #$F1..#$F3:
        if IsCont(i+1) and IsCont(i+2) and IsCont(i+3) then
          seqLen := 4;
      // plane 16
      #$F4:
        if ByteIn(i+1, #$80, #$8F) and IsCont(i+2) and IsCont(i+3) then
          seqLen := 4;
    end;
    // invalid lead byte, bad continuation byte, or truncated sequence
    if seqLen = 0 then
      exit(false);
    inc(i, seqLen);
  end;
  exit(true);
end;
2010/2/27 Antônio <antoniog12345 at gmail.com>
> How to determine whether a string is UTF-8 or not?
>
> --
> _______________________________________________
> Lazarus mailing list
> Lazarus at lists.lazarus.freepascal.org
> http://lists.lazarus.freepascal.org/mailman/listinfo/lazarus
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.lazarus-ide.org/pipermail/lazarus/attachments/20100303/2baa419a/attachment-0004.html>
More information about the Lazarus
mailing list