amxmodx/editor/studio/UtfFunct.PAS

608 lines
13 KiB
Plaintext
Raw Normal View History

2005-10-30 10:33:16 +00:00
{
This file is translated to Delphi from the file referenced below
by Jan Martin Pettersen (hdalis@users.sourceforge.net)
23/07/2005.
Some code in this file is also taken from the SciTE (Neil Hodgson)
}
// Utf8_16.cxx
// Copyright (C) 2002 Scott Kirkwood
//
// Permission to use, copy, modify, distribute and sell this code
// and its documentation for any purpose is hereby granted without fee,
// provided that the above copyright notice appear in all copies or
// any derived copies. Scott Kirkwood makes no representations
// about the suitability of this software for any purpose.
// It is provided "as is" without express or implied warranty.
////////////////////////////////////////////////////////////////////////////////
unit UtfFunct;
interface
uses Windows,SysUtils,Classes,Math;
const
UniBufSize=32000;
type
Utf16=Word;
Utf8=Byte;
TUtf8Array=array[0..1] of Utf8;
PUtf8=^TUtf8Array;
TUtf16Array=array[0..1] of Utf16;
PUtf16=^TUtf16Array;
//uniCookie isn't used yet..
UniMode=(uni8Bit, uni16BE, uni16LE, uniUTF8,uniCookie);
//States for the unicode Next functions..
eState=(eStart,e2Bytes2,e3Bytes2,e3Bytes3);
// Reads UTF-16 and outputs UTF-8
Utf16_Iter=class(TObject)
private
m_eEncoding : UniMode;
m_eState : eState;
m_pBuf : PByte;
m_pRead : PByte;
m_pEnd : PByte;
m_nCur : Utf8;
m_nCur16 : Utf16;
public
constructor Create;
procedure Reset;
procedure Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
function More : Boolean;
procedure Next;
function Get : Utf8;
end;
// Reads UTF-8 and outputs UTF-16
Utf8_Iter =class(TObject)
private
m_eEncoding : UniMode;
m_eState : eState;
m_pBuf : PByte;
m_pRead : PByte;
m_pEnd : PByte;
m_nCur16 : Utf16;
procedure toStart;
procedure Swap;
public
constructor Create;
procedure Reset;
procedure Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
function More : Boolean; //bool
procedure Next;
function Get : Utf16;
function canGet : Boolean;
end;
// Reads UTF16 and outputs UTF8
UtfRead=class(TObject)
private
m_eEncoding : UniMode;
m_pBuf : PByte;
m_nBufSize : Cardinal;
m_bFirstRead : Boolean;
m_pNewBuf : PByte;
m_nLen : Cardinal;
m_Iter16 : Utf16_Iter;
public
constructor Create;
destructor Destroy;override;
function getEncoding : UniMode;
function getNewBuf : PChar;
function Convert(buf : PChar; len : Cardinal) : Cardinal;
procedure Reset;
property Encoding : UniMode read m_eEncoding;
end;
// Read in a UTF-8 buffer and write out to UTF-16 or UTF-8
UtfWrite=class(TObject)
private
m_eEncoding : UniMode;
m_pBuf : PUtf16;
m_nBufSize : Cardinal;
m_bFirstWrite : Boolean;
m_pFile : TStream;
procedure SetDestStream(Value : TStream);
procedure SetEncoding(eType : UniMode);
public
constructor Create;
function Write(const Buffer; Count : Cardinal) : LongInt;
property DestStream : TStream read m_pFile write SetDestStream;
property Encoding : UniMode read m_eEncoding write SetEncoding;
end;
//Returns the UTF8 length of the buffer 'uptr'.
function UTF8Length(const wideSrc : PWideChar; wideLen : Cardinal) : Cardinal;
//Transforms UCS2 to UTF8.
procedure UTF8FromUCS2(const wideSrc : PWideChar; wideLen : Cardinal; utfDestBuf : PChar; utfDestLen : Cardinal);
function UTF8ToAnsiP(const srcBuffer : PChar;len : Integer;destBuffer : PChar) : Integer;
function DetectEncoding(buf : PByte;len : Integer;var Encoding : UniMode) : Integer;
implementation
const
k_boms : array[uni8bit..uniUTF8,0..2] of Utf8=(
($00,$00,$00),
($FE,$FF,$00),
($FF,$FE,$00),
($EF,$BB,$BF));
function UTF8ToAnsiP(const srcBuffer : PChar;len : Integer;destBuffer : PChar) : Integer;
var
tmpbuffer : String;
srcLen,i,destLen : Integer;
begin
Result:=0;
if (not assigned(srcBuffer)) or (not assigned(destBuffer)) then Exit;
if len=-1 then
srcLen:=Length(srcBuffer)
else
srcLen:=len;
tmpbuffer:=UTF8ToAnsi(Copy(srcBuffer,1,srcLen));
destLen:=Length(tmpbuffer);
for i:=1 to destLen do
destBuffer[i-1]:=tmpbuffer[i];
destBuffer[destLen]:=#0;
Result:=destLen;
end;
function UTF8Length(const wideSrc : PWideChar; wideLen : Cardinal) : Cardinal;
var
i,len : Cardinal;
uch : Cardinal;
begin
len := 0;
i:=0;
while((i<wideLen) and (Cardinal(wideSrc[i])<>0)) do
begin
uch:=Cardinal(wideSrc[i]);
if (uch < $80) then
Inc(len)
else if (uch < $800) then
Inc(len,2)
else
Inc(len,3);
Inc(i);
end;
Result:=len;
end;
procedure UTF8FromUCS2(const wideSrc : PWideChar; wideLen : Cardinal; utfDestBuf : PChar; utfDestLen : Cardinal);
var
k : Integer;
i : Cardinal;
uch : Cardinal;
begin
k:= 0;
i:=0;
while((i<wideLen)and(Cardinal(wideSrc[i])<>0)) do
begin
uch:=Cardinal(wideSrc[i]);
if uch<$80 then
begin
utfDestBuf[k] := Char(uch);
Inc(k);
end else
if (uch<$800) then
begin
utfDestBuf[k]:=Char($C0 or (uch shr 6));
Inc(k);
utfDestBuf[k] := Char($80 or (uch and $3f));
Inc(k);
end else
begin
utfDestBuf[k] := Char($E0 or (uch shr 12));
Inc(k);
utfDestBuf[k] := Char($80 or ((uch shr 6) and $3f));
Inc(k);
utfDestBuf[k] := Char($80 or (uch and $3f));
Inc(k);
end;
end;
utfDestBuf[utfDestLen]:=#0;
end;
function DetectEncoding(buf : PByte;len : Integer;var Encoding : UniMode) : Integer;
var
nRet : Integer;
pbTmp : PByteArray;
begin
Encoding := uni8bit;
pbTmp:=PByteArray(buf);
nRet := 0;
if (len > 1) then
begin
if ((pbTmp[0]=k_Boms[uni16BE][0]) and (pbTmp[1]=k_Boms[uni16BE][1])) then
begin
Encoding := uni16BE;
nRet := 2;
end else
if ((pbTmp[0]=k_Boms[uni16LE][0]) and (pbTmp[1]=k_Boms[uni16LE][1])) then
begin
Encoding := uni16LE;
nRet := 2;
end else
if ((len>2) and (pbTmp[0]=k_Boms[uniUTF8][0]) and (pbTmp[1]=k_Boms[uniUTF8][1]) and (pbTmp[2]=k_Boms[uniUTF8][2])) then
begin
Encoding := uniUTF8;
nRet := 3;
end;
end;
Result:=nRet;
end;
constructor Utf16_Iter.Create;
begin
Reset;
end;
procedure Utf16_Iter.Reset;
begin
m_pBuf := nil;
m_pRead := nil;
m_pEnd := nil;
m_eState := eStart;
m_nCur := 0;
m_nCur16 := 0;
m_eEncoding := uni8bit;
end;
procedure Utf16_Iter.Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
begin
m_pBuf := pBuf;
m_pRead := pBuf;
m_pEnd := pBuf;
Inc(m_pEnd,nLen);
m_eEncoding := eEncoding;
Next;
end;
procedure Utf16_Iter.Next;
begin
case m_eState of
eStart:
begin
if (m_eEncoding = uni16LE) then
begin
m_nCur16 := Utf16(m_pRead^);
Inc(m_pRead);
m_nCur16 := m_nCur16 or Utf16((m_pRead^ shl 8));
end else
begin
m_nCur16 := Utf16(m_pRead^ shl 8);
Inc(m_pRead);
m_nCur16 := m_nCur16 or m_pRead^;
end;
Inc(m_pRead);
if (m_nCur16 < $80) then
begin
m_nCur := Byte(m_nCur16 and $FF);
m_eState := eStart;
end else
if (m_nCur16 < $800) then
begin
m_nCur := Byte($C0 or (m_nCur16 shr 6));
m_eState := e2Bytes2;
end else
begin
m_nCur := Byte($E0 or (m_nCur16 shr 12));
m_eState := e3Bytes2;
end;
end;
e2Bytes2:
begin
m_nCur := Byte($80 or (m_nCur16 and $3F));
m_eState := eStart;
end;
e3Bytes2:
begin
m_nCur := Byte($80 or ((m_nCur16 shr 6) and $3F));
m_eState := e3Bytes3;
end;
e3Bytes3:
begin
m_nCur := Byte($80 or (m_nCur16 and $3F));
m_eState := eStart;
end;
end;
end;
function Utf16_Iter.More : Boolean;
begin
Result:=Cardinal(m_pRead) <= Cardinal(m_pEnd);
end;
function Utf16_Iter.Get : Utf8;
begin
Result:=m_nCur;
end;
constructor Utf8_Iter.Create;
begin
Reset;
end;
procedure Utf8_Iter.Reset;
begin
m_pBuf := nil;
m_pRead := nil;
m_pEnd := nil;
m_eState := eStart;
m_nCur16 := 0;
m_eEncoding := uni8bit;
end;
procedure Utf8_Iter.Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
begin
m_pBuf := pBuf;
m_pRead := pBuf;
m_pEnd := pBuf;
Inc(m_pEnd,nLen);
m_eEncoding := eEncoding;
Next;
end;
procedure Utf8_Iter.Next;
begin
case (m_eState) of
eStart:
begin
if (($E0 and m_pRead^) = $E0) then
begin
m_nCur16 := Utf16(((not $E0) and m_pRead^) shl 12);
m_eState := e3Bytes2;
end else if (($C0 and m_pRead^) = $C0) then
begin
m_nCur16 := Utf16((not $C0 and m_pRead^) shl 6);
m_eState := e2Bytes2;
end else
begin
m_nCur16 := m_pRead^;
toStart;
end;
end;
e2Bytes2:
begin
m_nCur16 :=m_nCur16 or utf8($3F and m_pRead^);
toStart;
end;
e3Bytes2:
begin
m_nCur16 :=m_nCur16 or utf16(($3F and m_pRead^) shl 6);
m_eState := e3Bytes3;
end;
e3Bytes3:
begin
m_nCur16 :=m_nCur16 or utf8($3F and m_pRead^);
toStart;
end;
end;
Inc(m_pRead);
end;
function Utf8_Iter.More : Boolean;
begin
Result:=Cardinal(m_pRead) <= Cardinal(m_pEnd);
end;
function Utf8_Iter.Get : Utf16;
begin
Result:=m_nCur16;
end;
function Utf8_Iter.canGet : Boolean;
begin
Result:=m_eState = eStart;
end;
procedure Utf8_Iter.toStart;
begin
m_eState := eStart;
if (m_eEncoding = uni16BE) then
Swap;
end;
procedure Utf8_Iter.Swap;
var
p : PUtf8;
swapbyte : Utf8;
begin
p := PUtf8(@m_nCur16);
swapbyte := p[0];
p[0]:= p[1];
p[1]:=swapbyte;
end;
constructor UtfRead.Create;
begin
m_eEncoding := uni8bit;
m_nBufSize := 0;
m_pNewBuf := nil;
m_bFirstRead := True;
end;
destructor UtfRead.Destroy;
begin
if ((m_eEncoding <> uni8bit) and (m_eEncoding <> uniUTF8)) then
begin
if assigned(m_pNewBuf) then FreeMem(m_pNewBuf);
end;
inherited;
end;
function UtfRead.getEncoding : UniMode;
begin
Result:=m_eEncoding;
end;
function UtfRead.getNewBuf : PChar;
begin
Result:=PChar(m_pNewBuf);
end;
procedure UtfRead.Reset;
begin
m_bFirstRead:=True;
m_nBufSize:=0;
m_eEncoding :=uni8Bit;
end;
function UtfRead.Convert(buf : PChar; len : Cardinal) : Cardinal;
var
nSkip : Cardinal;
newSize : Cardinal;
pCur,pTemp : PByte;
begin
m_Iter16:=Utf16_Iter.Create;
try
m_pBuf := PByte(buf);
m_nLen := len;
nSkip := 0;
if (m_bFirstRead) then
begin
nSkip := DetectEncoding(m_pBuf,m_nLen,m_eEncoding);
m_bFirstRead := False;
end;
if (m_eEncoding = uni8bit) then
begin
// Do nothing, pass through
m_nBufSize := 0;
m_pNewBuf := m_pBuf;
Result:=len;
Exit;
end;
if (m_eEncoding = uniUTF8) then
begin
// Pass through after BOM
m_nBufSize := 0;
m_pNewBuf := m_pBuf;
Inc(m_pNewBuf,nSkip);
Result:=len - nSkip;
Exit;
end;
// Else...
//newSize := len + len div 2 + 1;
newSize:=len*2+1;
if (m_nBufSize <> newSize) then
begin
FreeMem(m_pNewBuf);
m_pNewBuf:=nil;
GetMem(m_pNewBuf,newSize);
m_nBufSize := newSize;
end;
pCur := m_pNewBuf;
pTemp:=m_pBuf;
Inc(pTemp,nSkip);
m_Iter16.Set_(pTemp, len - nSkip, m_eEncoding);
while(m_Iter16.More) do
begin
pCur^:=m_Iter16.Get;
Inc(PCur);
m_Iter16.Next;
end;
// Return number of bytes writen out
finally
FreeAndNil(m_Iter16);
end;
Result:=Cardinal(pCur) - Cardinal(m_pNewBuf);
end;
constructor UtfWrite.Create;
begin
m_eEncoding := uni8bit;
m_pFile := nil;
m_pBuf := nil;
m_bFirstWrite := true;
m_nBufSize := 0;
end;
procedure UtfWrite.SetEncoding(eType : UniMode);
begin
m_eEncoding := eType;
end;
procedure UtfWrite.SetDestStream(Value : TStream);
begin
m_pFile:=Value;
m_bFirstWrite:=True;
end;
function UtfWrite.Write(const Buffer; Count : Cardinal) : LongInt;
var
iter8 : Utf8_Iter;
pCur : ^Utf16;
ret : LongInt;
pTemp : PChar;
begin
if Count=0 then
begin
Result:=0;
Exit;
end;
iter8:=Utf8_Iter.Create;
try
if (not assigned(m_pFile)) then
begin
Result:=0;
Exit;
end;
if (m_eEncoding = uni8bit) then
begin
// Normal write
m_bFirstWrite:=False;
Result:=m_pFile.Write(PChar(Buffer)^, Count);
Exit;
end;
if (m_eEncoding = uniUTF8) then
begin
pTemp:=PChar(Buffer);
if (m_bFirstWrite) then
begin
m_pFile.Write(k_Boms[m_eEncoding], 3);
m_bFirstWrite := false;
end;
Result:=m_pFile.Write(pTemp^, Count);
Exit;
end;
if (Count > m_nBufSize) then
begin
m_nBufSize := Count;
if assigned(m_pBuf) then FreeMem(m_pBuf);
m_pBuf := nil;
GetMem(m_pBuf,SizeOf(Utf16)*(Count+1));
end;
if (m_bFirstWrite) then
begin
if ((m_eEncoding = uni16BE) or (m_eEncoding = uni16LE)) then
begin
// Write the BOM
m_pFile.Write(k_Boms[m_eEncoding],2);
end;
m_bFirstWrite := false;
end;
iter8.set_(PByte(Buffer), Count, m_eEncoding);
pCur := @m_pBuf[0];
while(iter8.More) do
begin
if (iter8.canGet) then
begin
pCur^ := iter8.Get;
Inc(pCur);
end;
iter8.Next;
end;
ret := m_pFile.Write(m_pBuf^,Cardinal(pCur)-Cardinal(m_pBuf));
finally
if assigned(iter8) then FreeAndNil(iter8);
end;
Result:=ret;
end;
end.