608 lines
13 KiB
Plaintext
Executable File
608 lines
13 KiB
Plaintext
Executable File
{
|
|
This file is translated to Delphi from the file referenced below
|
|
by Jan Martin Pettersen (hdalis@users.sourceforge.net)
|
|
23/07/2005.
|
|
|
|
Some code in this file is also taken from the SciTE (Neil Hodgson)
|
|
}
|
|
// Utf8_16.cxx
|
|
// Copyright (C) 2002 Scott Kirkwood
|
|
//
|
|
// Permission to use, copy, modify, distribute and sell this code
|
|
// and its documentation for any purpose is hereby granted without fee,
|
|
// provided that the above copyright notice appear in all copies or
|
|
// any derived copies. Scott Kirkwood makes no representations
|
|
// about the suitability of this software for any purpose.
|
|
// It is provided "as is" without express or implied warranty.
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
unit UtfFunct;
|
|
interface
|
|
|
|
uses Windows,SysUtils,Classes,Math;
|
|
const
|
|
UniBufSize=32000;
|
|
|
|
type
|
|
Utf16=Word;
|
|
Utf8=Byte;
|
|
TUtf8Array=array[0..1] of Utf8;
|
|
PUtf8=^TUtf8Array;
|
|
TUtf16Array=array[0..1] of Utf16;
|
|
PUtf16=^TUtf16Array;
|
|
|
|
//uniCookie isn't used yet..
|
|
UniMode=(uni8Bit, uni16BE, uni16LE, uniUTF8,uniCookie);
|
|
//States for the unicode Next functions..
|
|
eState=(eStart,e2Bytes2,e3Bytes2,e3Bytes3);
|
|
|
|
// Reads UTF-16 and outputs UTF-8
|
|
Utf16_Iter=class(TObject)
|
|
private
|
|
m_eEncoding : UniMode;
|
|
m_eState : eState;
|
|
m_pBuf : PByte;
|
|
m_pRead : PByte;
|
|
m_pEnd : PByte;
|
|
m_nCur : Utf8;
|
|
m_nCur16 : Utf16;
|
|
public
|
|
constructor Create;
|
|
procedure Reset;
|
|
procedure Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
|
|
function More : Boolean;
|
|
procedure Next;
|
|
function Get : Utf8;
|
|
end;
|
|
|
|
|
|
// Reads UTF-8 and outputs UTF-16
|
|
Utf8_Iter =class(TObject)
|
|
private
|
|
m_eEncoding : UniMode;
|
|
m_eState : eState;
|
|
m_pBuf : PByte;
|
|
m_pRead : PByte;
|
|
m_pEnd : PByte;
|
|
m_nCur16 : Utf16;
|
|
procedure toStart;
|
|
procedure Swap;
|
|
public
|
|
constructor Create;
|
|
procedure Reset;
|
|
procedure Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
|
|
function More : Boolean; //bool
|
|
procedure Next;
|
|
function Get : Utf16;
|
|
function canGet : Boolean;
|
|
end;
|
|
|
|
// Reads UTF16 and outputs UTF8
|
|
UtfRead=class(TObject)
|
|
private
|
|
m_eEncoding : UniMode;
|
|
m_pBuf : PByte;
|
|
m_nBufSize : Cardinal;
|
|
m_bFirstRead : Boolean;
|
|
m_pNewBuf : PByte;
|
|
m_nLen : Cardinal;
|
|
m_Iter16 : Utf16_Iter;
|
|
public
|
|
constructor Create;
|
|
destructor Destroy;override;
|
|
function getEncoding : UniMode;
|
|
function getNewBuf : PChar;
|
|
function Convert(buf : PChar; len : Cardinal) : Cardinal;
|
|
procedure Reset;
|
|
property Encoding : UniMode read m_eEncoding;
|
|
end;
|
|
|
|
// Read in a UTF-8 buffer and write out to UTF-16 or UTF-8
|
|
UtfWrite=class(TObject)
|
|
private
|
|
m_eEncoding : UniMode;
|
|
m_pBuf : PUtf16;
|
|
m_nBufSize : Cardinal;
|
|
m_bFirstWrite : Boolean;
|
|
m_pFile : TStream;
|
|
procedure SetDestStream(Value : TStream);
|
|
procedure SetEncoding(eType : UniMode);
|
|
public
|
|
constructor Create;
|
|
function Write(const Buffer; Count : Cardinal) : LongInt;
|
|
property DestStream : TStream read m_pFile write SetDestStream;
|
|
property Encoding : UniMode read m_eEncoding write SetEncoding;
|
|
end;
|
|
|
|
//Returns the UTF8 length of the buffer 'uptr'.
|
|
function UTF8Length(const wideSrc : PWideChar; wideLen : Cardinal) : Cardinal;
|
|
//Transforms UCS2 to UTF8.
|
|
procedure UTF8FromUCS2(const wideSrc : PWideChar; wideLen : Cardinal; utfDestBuf : PChar; utfDestLen : Cardinal);
|
|
function UTF8ToAnsiP(const srcBuffer : PChar;len : Integer;destBuffer : PChar) : Integer;
|
|
function DetectEncoding(buf : PByte;len : Integer;var Encoding : UniMode) : Integer;
|
|
|
|
implementation
|
|
|
|
const
|
|
k_boms : array[uni8bit..uniUTF8,0..2] of Utf8=(
|
|
($00,$00,$00),
|
|
($FE,$FF,$00),
|
|
($FF,$FE,$00),
|
|
($EF,$BB,$BF));
|
|
|
|
function UTF8ToAnsiP(const srcBuffer : PChar;len : Integer;destBuffer : PChar) : Integer;
|
|
var
|
|
tmpbuffer : String;
|
|
srcLen,i,destLen : Integer;
|
|
begin
|
|
Result:=0;
|
|
if (not assigned(srcBuffer)) or (not assigned(destBuffer)) then Exit;
|
|
if len=-1 then
|
|
srcLen:=Length(srcBuffer)
|
|
else
|
|
srcLen:=len;
|
|
tmpbuffer:=UTF8ToAnsi(Copy(srcBuffer,1,srcLen));
|
|
destLen:=Length(tmpbuffer);
|
|
for i:=1 to destLen do
|
|
destBuffer[i-1]:=tmpbuffer[i];
|
|
destBuffer[destLen]:=#0;
|
|
Result:=destLen;
|
|
end;
|
|
|
|
function UTF8Length(const wideSrc : PWideChar; wideLen : Cardinal) : Cardinal;
|
|
var
|
|
i,len : Cardinal;
|
|
uch : Cardinal;
|
|
begin
|
|
len := 0;
|
|
i:=0;
|
|
while((i<wideLen) and (Cardinal(wideSrc[i])<>0)) do
|
|
begin
|
|
uch:=Cardinal(wideSrc[i]);
|
|
if (uch < $80) then
|
|
Inc(len)
|
|
else if (uch < $800) then
|
|
Inc(len,2)
|
|
else
|
|
Inc(len,3);
|
|
Inc(i);
|
|
end;
|
|
Result:=len;
|
|
end;
|
|
|
|
procedure UTF8FromUCS2(const wideSrc : PWideChar; wideLen : Cardinal; utfDestBuf : PChar; utfDestLen : Cardinal);
|
|
var
|
|
k : Integer;
|
|
i : Cardinal;
|
|
uch : Cardinal;
|
|
begin
|
|
k:= 0;
|
|
i:=0;
|
|
while((i<wideLen)and(Cardinal(wideSrc[i])<>0)) do
|
|
begin
|
|
uch:=Cardinal(wideSrc[i]);
|
|
if uch<$80 then
|
|
begin
|
|
utfDestBuf[k] := Char(uch);
|
|
Inc(k);
|
|
end else
|
|
if (uch<$800) then
|
|
begin
|
|
utfDestBuf[k]:=Char($C0 or (uch shr 6));
|
|
Inc(k);
|
|
utfDestBuf[k] := Char($80 or (uch and $3f));
|
|
Inc(k);
|
|
end else
|
|
begin
|
|
utfDestBuf[k] := Char($E0 or (uch shr 12));
|
|
Inc(k);
|
|
utfDestBuf[k] := Char($80 or ((uch shr 6) and $3f));
|
|
Inc(k);
|
|
utfDestBuf[k] := Char($80 or (uch and $3f));
|
|
Inc(k);
|
|
end;
|
|
end;
|
|
utfDestBuf[utfDestLen]:=#0;
|
|
end;
|
|
|
|
function DetectEncoding(buf : PByte;len : Integer;var Encoding : UniMode) : Integer;
|
|
var
|
|
nRet : Integer;
|
|
pbTmp : PByteArray;
|
|
begin
|
|
Encoding := uni8bit;
|
|
pbTmp:=PByteArray(buf);
|
|
nRet := 0;
|
|
if (len > 1) then
|
|
begin
|
|
if ((pbTmp[0]=k_Boms[uni16BE][0]) and (pbTmp[1]=k_Boms[uni16BE][1])) then
|
|
begin
|
|
Encoding := uni16BE;
|
|
nRet := 2;
|
|
end else
|
|
if ((pbTmp[0]=k_Boms[uni16LE][0]) and (pbTmp[1]=k_Boms[uni16LE][1])) then
|
|
begin
|
|
Encoding := uni16LE;
|
|
nRet := 2;
|
|
end else
|
|
if ((len>2) and (pbTmp[0]=k_Boms[uniUTF8][0]) and (pbTmp[1]=k_Boms[uniUTF8][1]) and (pbTmp[2]=k_Boms[uniUTF8][2])) then
|
|
begin
|
|
Encoding := uniUTF8;
|
|
nRet := 3;
|
|
end;
|
|
end;
|
|
Result:=nRet;
|
|
end;
|
|
|
|
|
|
constructor Utf16_Iter.Create;
|
|
begin
|
|
Reset;
|
|
end;
|
|
procedure Utf16_Iter.Reset;
|
|
begin
|
|
m_pBuf := nil;
|
|
m_pRead := nil;
|
|
m_pEnd := nil;
|
|
m_eState := eStart;
|
|
m_nCur := 0;
|
|
m_nCur16 := 0;
|
|
m_eEncoding := uni8bit;
|
|
end;
|
|
procedure Utf16_Iter.Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
|
|
begin
|
|
m_pBuf := pBuf;
|
|
m_pRead := pBuf;
|
|
m_pEnd := pBuf;
|
|
Inc(m_pEnd,nLen);
|
|
m_eEncoding := eEncoding;
|
|
Next;
|
|
end;
|
|
|
|
procedure Utf16_Iter.Next;
|
|
begin
|
|
case m_eState of
|
|
eStart:
|
|
begin
|
|
if (m_eEncoding = uni16LE) then
|
|
begin
|
|
m_nCur16 := Utf16(m_pRead^);
|
|
Inc(m_pRead);
|
|
m_nCur16 := m_nCur16 or Utf16((m_pRead^ shl 8));
|
|
end else
|
|
begin
|
|
m_nCur16 := Utf16(m_pRead^ shl 8);
|
|
Inc(m_pRead);
|
|
m_nCur16 := m_nCur16 or m_pRead^;
|
|
end;
|
|
Inc(m_pRead);
|
|
|
|
if (m_nCur16 < $80) then
|
|
begin
|
|
m_nCur := Byte(m_nCur16 and $FF);
|
|
m_eState := eStart;
|
|
end else
|
|
if (m_nCur16 < $800) then
|
|
begin
|
|
m_nCur := Byte($C0 or (m_nCur16 shr 6));
|
|
m_eState := e2Bytes2;
|
|
end else
|
|
begin
|
|
m_nCur := Byte($E0 or (m_nCur16 shr 12));
|
|
m_eState := e3Bytes2;
|
|
end;
|
|
|
|
end;
|
|
e2Bytes2:
|
|
begin
|
|
m_nCur := Byte($80 or (m_nCur16 and $3F));
|
|
m_eState := eStart;
|
|
end;
|
|
e3Bytes2:
|
|
begin
|
|
m_nCur := Byte($80 or ((m_nCur16 shr 6) and $3F));
|
|
m_eState := e3Bytes3;
|
|
end;
|
|
e3Bytes3:
|
|
begin
|
|
m_nCur := Byte($80 or (m_nCur16 and $3F));
|
|
m_eState := eStart;
|
|
end;
|
|
end;
|
|
end;
|
|
|
|
function Utf16_Iter.More : Boolean;
|
|
begin
|
|
Result:=Cardinal(m_pRead) <= Cardinal(m_pEnd);
|
|
end;
|
|
|
|
function Utf16_Iter.Get : Utf8;
|
|
begin
|
|
Result:=m_nCur;
|
|
end;
|
|
|
|
constructor Utf8_Iter.Create;
|
|
begin
|
|
Reset;
|
|
end;
|
|
|
|
procedure Utf8_Iter.Reset;
|
|
begin
|
|
m_pBuf := nil;
|
|
m_pRead := nil;
|
|
m_pEnd := nil;
|
|
m_eState := eStart;
|
|
m_nCur16 := 0;
|
|
m_eEncoding := uni8bit;
|
|
end;
|
|
|
|
procedure Utf8_Iter.Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
|
|
begin
|
|
m_pBuf := pBuf;
|
|
m_pRead := pBuf;
|
|
m_pEnd := pBuf;
|
|
Inc(m_pEnd,nLen);
|
|
m_eEncoding := eEncoding;
|
|
Next;
|
|
end;
|
|
|
|
procedure Utf8_Iter.Next;
|
|
begin
|
|
case (m_eState) of
|
|
eStart:
|
|
begin
|
|
if (($E0 and m_pRead^) = $E0) then
|
|
begin
|
|
m_nCur16 := Utf16(((not $E0) and m_pRead^) shl 12);
|
|
m_eState := e3Bytes2;
|
|
end else if (($C0 and m_pRead^) = $C0) then
|
|
begin
|
|
m_nCur16 := Utf16((not $C0 and m_pRead^) shl 6);
|
|
m_eState := e2Bytes2;
|
|
end else
|
|
begin
|
|
m_nCur16 := m_pRead^;
|
|
toStart;
|
|
end;
|
|
end;
|
|
e2Bytes2:
|
|
begin
|
|
m_nCur16 :=m_nCur16 or utf8($3F and m_pRead^);
|
|
toStart;
|
|
end;
|
|
e3Bytes2:
|
|
begin
|
|
m_nCur16 :=m_nCur16 or utf16(($3F and m_pRead^) shl 6);
|
|
m_eState := e3Bytes3;
|
|
end;
|
|
e3Bytes3:
|
|
begin
|
|
m_nCur16 :=m_nCur16 or utf8($3F and m_pRead^);
|
|
toStart;
|
|
end;
|
|
end;
|
|
Inc(m_pRead);
|
|
end;
|
|
|
|
function Utf8_Iter.More : Boolean;
|
|
begin
|
|
Result:=Cardinal(m_pRead) <= Cardinal(m_pEnd);
|
|
end;
|
|
|
|
function Utf8_Iter.Get : Utf16;
|
|
begin
|
|
Result:=m_nCur16;
|
|
end;
|
|
function Utf8_Iter.canGet : Boolean;
|
|
begin
|
|
Result:=m_eState = eStart;
|
|
end;
|
|
procedure Utf8_Iter.toStart;
|
|
begin
|
|
m_eState := eStart;
|
|
if (m_eEncoding = uni16BE) then
|
|
Swap;
|
|
end;
|
|
procedure Utf8_Iter.Swap;
|
|
var
|
|
p : PUtf8;
|
|
swapbyte : Utf8;
|
|
begin
|
|
p := PUtf8(@m_nCur16);
|
|
swapbyte := p[0];
|
|
p[0]:= p[1];
|
|
p[1]:=swapbyte;
|
|
end;
|
|
|
|
constructor UtfRead.Create;
|
|
begin
|
|
m_eEncoding := uni8bit;
|
|
m_nBufSize := 0;
|
|
m_pNewBuf := nil;
|
|
m_bFirstRead := True;
|
|
end;
|
|
|
|
destructor UtfRead.Destroy;
|
|
begin
|
|
if ((m_eEncoding <> uni8bit) and (m_eEncoding <> uniUTF8)) then
|
|
begin
|
|
if assigned(m_pNewBuf) then FreeMem(m_pNewBuf);
|
|
end;
|
|
inherited;
|
|
end;
|
|
|
|
function UtfRead.getEncoding : UniMode;
|
|
begin
|
|
Result:=m_eEncoding;
|
|
end;
|
|
function UtfRead.getNewBuf : PChar;
|
|
begin
|
|
Result:=PChar(m_pNewBuf);
|
|
end;
|
|
|
|
procedure UtfRead.Reset;
|
|
begin
|
|
m_bFirstRead:=True;
|
|
m_nBufSize:=0;
|
|
m_eEncoding :=uni8Bit;
|
|
end;
|
|
|
|
function UtfRead.Convert(buf : PChar; len : Cardinal) : Cardinal;
|
|
var
|
|
nSkip : Cardinal;
|
|
newSize : Cardinal;
|
|
pCur,pTemp : PByte;
|
|
begin
|
|
m_Iter16:=Utf16_Iter.Create;
|
|
try
|
|
m_pBuf := PByte(buf);
|
|
m_nLen := len;
|
|
|
|
nSkip := 0;
|
|
if (m_bFirstRead) then
|
|
begin
|
|
nSkip := DetectEncoding(m_pBuf,m_nLen,m_eEncoding);
|
|
m_bFirstRead := False;
|
|
end;
|
|
|
|
if (m_eEncoding = uni8bit) then
|
|
begin
|
|
// Do nothing, pass through
|
|
m_nBufSize := 0;
|
|
m_pNewBuf := m_pBuf;
|
|
Result:=len;
|
|
Exit;
|
|
end;
|
|
|
|
if (m_eEncoding = uniUTF8) then
|
|
begin
|
|
// Pass through after BOM
|
|
m_nBufSize := 0;
|
|
m_pNewBuf := m_pBuf;
|
|
Inc(m_pNewBuf,nSkip);
|
|
Result:=len - nSkip;
|
|
Exit;
|
|
end;
|
|
|
|
// Else...
|
|
//newSize := len + len div 2 + 1;
|
|
newSize:=len*2+1;
|
|
if (m_nBufSize <> newSize) then
|
|
begin
|
|
FreeMem(m_pNewBuf);
|
|
m_pNewBuf:=nil;
|
|
GetMem(m_pNewBuf,newSize);
|
|
m_nBufSize := newSize;
|
|
end;
|
|
|
|
pCur := m_pNewBuf;
|
|
pTemp:=m_pBuf;
|
|
Inc(pTemp,nSkip);
|
|
m_Iter16.Set_(pTemp, len - nSkip, m_eEncoding);
|
|
while(m_Iter16.More) do
|
|
begin
|
|
pCur^:=m_Iter16.Get;
|
|
Inc(PCur);
|
|
m_Iter16.Next;
|
|
end;
|
|
// Return number of bytes writen out
|
|
finally
|
|
FreeAndNil(m_Iter16);
|
|
end;
|
|
Result:=Cardinal(pCur) - Cardinal(m_pNewBuf);
|
|
end;
|
|
|
|
constructor UtfWrite.Create;
|
|
begin
|
|
m_eEncoding := uni8bit;
|
|
m_pFile := nil;
|
|
m_pBuf := nil;
|
|
m_bFirstWrite := true;
|
|
m_nBufSize := 0;
|
|
end;
|
|
|
|
procedure UtfWrite.SetEncoding(eType : UniMode);
|
|
begin
|
|
m_eEncoding := eType;
|
|
end;
|
|
procedure UtfWrite.SetDestStream(Value : TStream);
|
|
begin
|
|
m_pFile:=Value;
|
|
m_bFirstWrite:=True;
|
|
end;
|
|
|
|
function UtfWrite.Write(const Buffer; Count : Cardinal) : LongInt;
|
|
var
|
|
iter8 : Utf8_Iter;
|
|
pCur : ^Utf16;
|
|
ret : LongInt;
|
|
pTemp : PChar;
|
|
begin
|
|
if Count=0 then
|
|
begin
|
|
Result:=0;
|
|
Exit;
|
|
end;
|
|
iter8:=Utf8_Iter.Create;
|
|
try
|
|
if (not assigned(m_pFile)) then
|
|
begin
|
|
Result:=0;
|
|
Exit;
|
|
end;
|
|
if (m_eEncoding = uni8bit) then
|
|
begin
|
|
// Normal write
|
|
m_bFirstWrite:=False;
|
|
Result:=m_pFile.Write(PChar(Buffer)^, Count);
|
|
Exit;
|
|
end;
|
|
|
|
if (m_eEncoding = uniUTF8) then
|
|
begin
|
|
pTemp:=PChar(Buffer);
|
|
if (m_bFirstWrite) then
|
|
begin
|
|
m_pFile.Write(k_Boms[m_eEncoding], 3);
|
|
m_bFirstWrite := false;
|
|
end;
|
|
Result:=m_pFile.Write(pTemp^, Count);
|
|
Exit;
|
|
end;
|
|
|
|
if (Count > m_nBufSize) then
|
|
begin
|
|
m_nBufSize := Count;
|
|
if assigned(m_pBuf) then FreeMem(m_pBuf);
|
|
m_pBuf := nil;
|
|
GetMem(m_pBuf,SizeOf(Utf16)*(Count+1));
|
|
end;
|
|
|
|
if (m_bFirstWrite) then
|
|
begin
|
|
if ((m_eEncoding = uni16BE) or (m_eEncoding = uni16LE)) then
|
|
begin
|
|
// Write the BOM
|
|
m_pFile.Write(k_Boms[m_eEncoding],2);
|
|
end;
|
|
m_bFirstWrite := false;
|
|
end;
|
|
|
|
iter8.set_(PByte(Buffer), Count, m_eEncoding);
|
|
pCur := @m_pBuf[0];
|
|
while(iter8.More) do
|
|
begin
|
|
if (iter8.canGet) then
|
|
begin
|
|
pCur^ := iter8.Get;
|
|
Inc(pCur);
|
|
end;
|
|
iter8.Next;
|
|
end;
|
|
ret := m_pFile.Write(m_pBuf^,Cardinal(pCur)-Cardinal(m_pBuf));
|
|
finally
|
|
if assigned(iter8) then FreeAndNil(iter8);
|
|
end;
|
|
Result:=ret;
|
|
end;
|
|
|
|
end. |