Updated components, added 2 new units
This commit is contained in:
608
editor/studio/UtfFunct.PAS
Executable file
608
editor/studio/UtfFunct.PAS
Executable file
@ -0,0 +1,608 @@
|
||||
{
|
||||
This file is translated to Delphi from the file referenced below
|
||||
by Jan Martin Pettersen (hdalis@users.sourceforge.net)
|
||||
23/07/2005.
|
||||
|
||||
Some code in this file is also taken from the SciTE (Neil Hodgson)
|
||||
}
|
||||
// Utf8_16.cxx
|
||||
// Copyright (C) 2002 Scott Kirkwood
|
||||
//
|
||||
// Permission to use, copy, modify, distribute and sell this code
|
||||
// and its documentation for any purpose is hereby granted without fee,
|
||||
// provided that the above copyright notice appear in all copies or
|
||||
// any derived copies. Scott Kirkwood makes no representations
|
||||
// about the suitability of this software for any purpose.
|
||||
// It is provided "as is" without express or implied warranty.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
unit UtfFunct;
|
||||
interface
|
||||
|
||||
uses Windows,SysUtils,Classes,Math;
|
||||
const
|
||||
UniBufSize=32000;
|
||||
|
||||
type
|
||||
Utf16=Word;
|
||||
Utf8=Byte;
|
||||
TUtf8Array=array[0..1] of Utf8;
|
||||
PUtf8=^TUtf8Array;
|
||||
TUtf16Array=array[0..1] of Utf16;
|
||||
PUtf16=^TUtf16Array;
|
||||
|
||||
//uniCookie isn't used yet..
|
||||
UniMode=(uni8Bit, uni16BE, uni16LE, uniUTF8,uniCookie);
|
||||
//States for the unicode Next functions..
|
||||
eState=(eStart,e2Bytes2,e3Bytes2,e3Bytes3);
|
||||
|
||||
// Reads UTF-16 and outputs UTF-8
|
||||
Utf16_Iter=class(TObject)
|
||||
private
|
||||
m_eEncoding : UniMode;
|
||||
m_eState : eState;
|
||||
m_pBuf : PByte;
|
||||
m_pRead : PByte;
|
||||
m_pEnd : PByte;
|
||||
m_nCur : Utf8;
|
||||
m_nCur16 : Utf16;
|
||||
public
|
||||
constructor Create;
|
||||
procedure Reset;
|
||||
procedure Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
|
||||
function More : Boolean;
|
||||
procedure Next;
|
||||
function Get : Utf8;
|
||||
end;
|
||||
|
||||
|
||||
// Reads UTF-8 and outputs UTF-16
|
||||
Utf8_Iter =class(TObject)
|
||||
private
|
||||
m_eEncoding : UniMode;
|
||||
m_eState : eState;
|
||||
m_pBuf : PByte;
|
||||
m_pRead : PByte;
|
||||
m_pEnd : PByte;
|
||||
m_nCur16 : Utf16;
|
||||
procedure toStart;
|
||||
procedure Swap;
|
||||
public
|
||||
constructor Create;
|
||||
procedure Reset;
|
||||
procedure Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
|
||||
function More : Boolean; //bool
|
||||
procedure Next;
|
||||
function Get : Utf16;
|
||||
function canGet : Boolean;
|
||||
end;
|
||||
|
||||
// Reads UTF16 and outputs UTF8
|
||||
UtfRead=class(TObject)
|
||||
private
|
||||
m_eEncoding : UniMode;
|
||||
m_pBuf : PByte;
|
||||
m_nBufSize : Cardinal;
|
||||
m_bFirstRead : Boolean;
|
||||
m_pNewBuf : PByte;
|
||||
m_nLen : Cardinal;
|
||||
m_Iter16 : Utf16_Iter;
|
||||
public
|
||||
constructor Create;
|
||||
destructor Destroy;override;
|
||||
function getEncoding : UniMode;
|
||||
function getNewBuf : PChar;
|
||||
function Convert(buf : PChar; len : Cardinal) : Cardinal;
|
||||
procedure Reset;
|
||||
property Encoding : UniMode read m_eEncoding;
|
||||
end;
|
||||
|
||||
// Read in a UTF-8 buffer and write out to UTF-16 or UTF-8
|
||||
UtfWrite=class(TObject)
|
||||
private
|
||||
m_eEncoding : UniMode;
|
||||
m_pBuf : PUtf16;
|
||||
m_nBufSize : Cardinal;
|
||||
m_bFirstWrite : Boolean;
|
||||
m_pFile : TStream;
|
||||
procedure SetDestStream(Value : TStream);
|
||||
procedure SetEncoding(eType : UniMode);
|
||||
public
|
||||
constructor Create;
|
||||
function Write(const Buffer; Count : Cardinal) : LongInt;
|
||||
property DestStream : TStream read m_pFile write SetDestStream;
|
||||
property Encoding : UniMode read m_eEncoding write SetEncoding;
|
||||
end;
|
||||
|
||||
//Returns the UTF8 length of the buffer 'uptr'.
|
||||
function UTF8Length(const wideSrc : PWideChar; wideLen : Cardinal) : Cardinal;
|
||||
//Transforms UCS2 to UTF8.
|
||||
procedure UTF8FromUCS2(const wideSrc : PWideChar; wideLen : Cardinal; utfDestBuf : PChar; utfDestLen : Cardinal);
|
||||
function UTF8ToAnsiP(const srcBuffer : PChar;len : Integer;destBuffer : PChar) : Integer;
|
||||
function DetectEncoding(buf : PByte;len : Integer;var Encoding : UniMode) : Integer;
|
||||
|
||||
implementation
|
||||
|
||||
const
|
||||
k_boms : array[uni8bit..uniUTF8,0..2] of Utf8=(
|
||||
($00,$00,$00),
|
||||
($FE,$FF,$00),
|
||||
($FF,$FE,$00),
|
||||
($EF,$BB,$BF));
|
||||
|
||||
function UTF8ToAnsiP(const srcBuffer : PChar;len : Integer;destBuffer : PChar) : Integer;
|
||||
var
|
||||
tmpbuffer : String;
|
||||
srcLen,i,destLen : Integer;
|
||||
begin
|
||||
Result:=0;
|
||||
if (not assigned(srcBuffer)) or (not assigned(destBuffer)) then Exit;
|
||||
if len=-1 then
|
||||
srcLen:=Length(srcBuffer)
|
||||
else
|
||||
srcLen:=len;
|
||||
tmpbuffer:=UTF8ToAnsi(Copy(srcBuffer,1,srcLen));
|
||||
destLen:=Length(tmpbuffer);
|
||||
for i:=1 to destLen do
|
||||
destBuffer[i-1]:=tmpbuffer[i];
|
||||
destBuffer[destLen]:=#0;
|
||||
Result:=destLen;
|
||||
end;
|
||||
|
||||
function UTF8Length(const wideSrc : PWideChar; wideLen : Cardinal) : Cardinal;
|
||||
var
|
||||
i,len : Cardinal;
|
||||
uch : Cardinal;
|
||||
begin
|
||||
len := 0;
|
||||
i:=0;
|
||||
while((i<wideLen) and (Cardinal(wideSrc[i])<>0)) do
|
||||
begin
|
||||
uch:=Cardinal(wideSrc[i]);
|
||||
if (uch < $80) then
|
||||
Inc(len)
|
||||
else if (uch < $800) then
|
||||
Inc(len,2)
|
||||
else
|
||||
Inc(len,3);
|
||||
Inc(i);
|
||||
end;
|
||||
Result:=len;
|
||||
end;
|
||||
|
||||
procedure UTF8FromUCS2(const wideSrc : PWideChar; wideLen : Cardinal; utfDestBuf : PChar; utfDestLen : Cardinal);
|
||||
var
|
||||
k : Integer;
|
||||
i : Cardinal;
|
||||
uch : Cardinal;
|
||||
begin
|
||||
k:= 0;
|
||||
i:=0;
|
||||
while((i<wideLen)and(Cardinal(wideSrc[i])<>0)) do
|
||||
begin
|
||||
uch:=Cardinal(wideSrc[i]);
|
||||
if uch<$80 then
|
||||
begin
|
||||
utfDestBuf[k] := Char(uch);
|
||||
Inc(k);
|
||||
end else
|
||||
if (uch<$800) then
|
||||
begin
|
||||
utfDestBuf[k]:=Char($C0 or (uch shr 6));
|
||||
Inc(k);
|
||||
utfDestBuf[k] := Char($80 or (uch and $3f));
|
||||
Inc(k);
|
||||
end else
|
||||
begin
|
||||
utfDestBuf[k] := Char($E0 or (uch shr 12));
|
||||
Inc(k);
|
||||
utfDestBuf[k] := Char($80 or ((uch shr 6) and $3f));
|
||||
Inc(k);
|
||||
utfDestBuf[k] := Char($80 or (uch and $3f));
|
||||
Inc(k);
|
||||
end;
|
||||
end;
|
||||
utfDestBuf[utfDestLen]:=#0;
|
||||
end;
|
||||
|
||||
function DetectEncoding(buf : PByte;len : Integer;var Encoding : UniMode) : Integer;
|
||||
var
|
||||
nRet : Integer;
|
||||
pbTmp : PByteArray;
|
||||
begin
|
||||
Encoding := uni8bit;
|
||||
pbTmp:=PByteArray(buf);
|
||||
nRet := 0;
|
||||
if (len > 1) then
|
||||
begin
|
||||
if ((pbTmp[0]=k_Boms[uni16BE][0]) and (pbTmp[1]=k_Boms[uni16BE][1])) then
|
||||
begin
|
||||
Encoding := uni16BE;
|
||||
nRet := 2;
|
||||
end else
|
||||
if ((pbTmp[0]=k_Boms[uni16LE][0]) and (pbTmp[1]=k_Boms[uni16LE][1])) then
|
||||
begin
|
||||
Encoding := uni16LE;
|
||||
nRet := 2;
|
||||
end else
|
||||
if ((len>2) and (pbTmp[0]=k_Boms[uniUTF8][0]) and (pbTmp[1]=k_Boms[uniUTF8][1]) and (pbTmp[2]=k_Boms[uniUTF8][2])) then
|
||||
begin
|
||||
Encoding := uniUTF8;
|
||||
nRet := 3;
|
||||
end;
|
||||
end;
|
||||
Result:=nRet;
|
||||
end;
|
||||
|
||||
|
||||
constructor Utf16_Iter.Create;
|
||||
begin
|
||||
Reset;
|
||||
end;
|
||||
procedure Utf16_Iter.Reset;
|
||||
begin
|
||||
m_pBuf := nil;
|
||||
m_pRead := nil;
|
||||
m_pEnd := nil;
|
||||
m_eState := eStart;
|
||||
m_nCur := 0;
|
||||
m_nCur16 := 0;
|
||||
m_eEncoding := uni8bit;
|
||||
end;
|
||||
procedure Utf16_Iter.Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
|
||||
begin
|
||||
m_pBuf := pBuf;
|
||||
m_pRead := pBuf;
|
||||
m_pEnd := pBuf;
|
||||
Inc(m_pEnd,nLen);
|
||||
m_eEncoding := eEncoding;
|
||||
Next;
|
||||
end;
|
||||
|
||||
procedure Utf16_Iter.Next;
|
||||
begin
|
||||
case m_eState of
|
||||
eStart:
|
||||
begin
|
||||
if (m_eEncoding = uni16LE) then
|
||||
begin
|
||||
m_nCur16 := Utf16(m_pRead^);
|
||||
Inc(m_pRead);
|
||||
m_nCur16 := m_nCur16 or Utf16((m_pRead^ shl 8));
|
||||
end else
|
||||
begin
|
||||
m_nCur16 := Utf16(m_pRead^ shl 8);
|
||||
Inc(m_pRead);
|
||||
m_nCur16 := m_nCur16 or m_pRead^;
|
||||
end;
|
||||
Inc(m_pRead);
|
||||
|
||||
if (m_nCur16 < $80) then
|
||||
begin
|
||||
m_nCur := Byte(m_nCur16 and $FF);
|
||||
m_eState := eStart;
|
||||
end else
|
||||
if (m_nCur16 < $800) then
|
||||
begin
|
||||
m_nCur := Byte($C0 or (m_nCur16 shr 6));
|
||||
m_eState := e2Bytes2;
|
||||
end else
|
||||
begin
|
||||
m_nCur := Byte($E0 or (m_nCur16 shr 12));
|
||||
m_eState := e3Bytes2;
|
||||
end;
|
||||
|
||||
end;
|
||||
e2Bytes2:
|
||||
begin
|
||||
m_nCur := Byte($80 or (m_nCur16 and $3F));
|
||||
m_eState := eStart;
|
||||
end;
|
||||
e3Bytes2:
|
||||
begin
|
||||
m_nCur := Byte($80 or ((m_nCur16 shr 6) and $3F));
|
||||
m_eState := e3Bytes3;
|
||||
end;
|
||||
e3Bytes3:
|
||||
begin
|
||||
m_nCur := Byte($80 or (m_nCur16 and $3F));
|
||||
m_eState := eStart;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
|
||||
function Utf16_Iter.More : Boolean;
|
||||
begin
|
||||
Result:=Cardinal(m_pRead) <= Cardinal(m_pEnd);
|
||||
end;
|
||||
|
||||
function Utf16_Iter.Get : Utf8;
|
||||
begin
|
||||
Result:=m_nCur;
|
||||
end;
|
||||
|
||||
constructor Utf8_Iter.Create;
|
||||
begin
|
||||
Reset;
|
||||
end;
|
||||
|
||||
procedure Utf8_Iter.Reset;
|
||||
begin
|
||||
m_pBuf := nil;
|
||||
m_pRead := nil;
|
||||
m_pEnd := nil;
|
||||
m_eState := eStart;
|
||||
m_nCur16 := 0;
|
||||
m_eEncoding := uni8bit;
|
||||
end;
|
||||
|
||||
procedure Utf8_Iter.Set_(const pbuf : PByte;nLen : Cardinal;eEncoding : UniMode);
|
||||
begin
|
||||
m_pBuf := pBuf;
|
||||
m_pRead := pBuf;
|
||||
m_pEnd := pBuf;
|
||||
Inc(m_pEnd,nLen);
|
||||
m_eEncoding := eEncoding;
|
||||
Next;
|
||||
end;
|
||||
|
||||
procedure Utf8_Iter.Next;
|
||||
begin
|
||||
case (m_eState) of
|
||||
eStart:
|
||||
begin
|
||||
if (($E0 and m_pRead^) = $E0) then
|
||||
begin
|
||||
m_nCur16 := Utf16(((not $E0) and m_pRead^) shl 12);
|
||||
m_eState := e3Bytes2;
|
||||
end else if (($C0 and m_pRead^) = $C0) then
|
||||
begin
|
||||
m_nCur16 := Utf16((not $C0 and m_pRead^) shl 6);
|
||||
m_eState := e2Bytes2;
|
||||
end else
|
||||
begin
|
||||
m_nCur16 := m_pRead^;
|
||||
toStart;
|
||||
end;
|
||||
end;
|
||||
e2Bytes2:
|
||||
begin
|
||||
m_nCur16 :=m_nCur16 or utf8($3F and m_pRead^);
|
||||
toStart;
|
||||
end;
|
||||
e3Bytes2:
|
||||
begin
|
||||
m_nCur16 :=m_nCur16 or utf16(($3F and m_pRead^) shl 6);
|
||||
m_eState := e3Bytes3;
|
||||
end;
|
||||
e3Bytes3:
|
||||
begin
|
||||
m_nCur16 :=m_nCur16 or utf8($3F and m_pRead^);
|
||||
toStart;
|
||||
end;
|
||||
end;
|
||||
Inc(m_pRead);
|
||||
end;
|
||||
|
||||
function Utf8_Iter.More : Boolean;
|
||||
begin
|
||||
Result:=Cardinal(m_pRead) <= Cardinal(m_pEnd);
|
||||
end;
|
||||
|
||||
function Utf8_Iter.Get : Utf16;
|
||||
begin
|
||||
Result:=m_nCur16;
|
||||
end;
|
||||
function Utf8_Iter.canGet : Boolean;
|
||||
begin
|
||||
Result:=m_eState = eStart;
|
||||
end;
|
||||
procedure Utf8_Iter.toStart;
|
||||
begin
|
||||
m_eState := eStart;
|
||||
if (m_eEncoding = uni16BE) then
|
||||
Swap;
|
||||
end;
|
||||
procedure Utf8_Iter.Swap;
|
||||
var
|
||||
p : PUtf8;
|
||||
swapbyte : Utf8;
|
||||
begin
|
||||
p := PUtf8(@m_nCur16);
|
||||
swapbyte := p[0];
|
||||
p[0]:= p[1];
|
||||
p[1]:=swapbyte;
|
||||
end;
|
||||
|
||||
constructor UtfRead.Create;
|
||||
begin
|
||||
m_eEncoding := uni8bit;
|
||||
m_nBufSize := 0;
|
||||
m_pNewBuf := nil;
|
||||
m_bFirstRead := True;
|
||||
end;
|
||||
|
||||
destructor UtfRead.Destroy;
|
||||
begin
|
||||
if ((m_eEncoding <> uni8bit) and (m_eEncoding <> uniUTF8)) then
|
||||
begin
|
||||
if assigned(m_pNewBuf) then FreeMem(m_pNewBuf);
|
||||
end;
|
||||
inherited;
|
||||
end;
|
||||
|
||||
function UtfRead.getEncoding : UniMode;
|
||||
begin
|
||||
Result:=m_eEncoding;
|
||||
end;
|
||||
function UtfRead.getNewBuf : PChar;
|
||||
begin
|
||||
Result:=PChar(m_pNewBuf);
|
||||
end;
|
||||
|
||||
procedure UtfRead.Reset;
|
||||
begin
|
||||
m_bFirstRead:=True;
|
||||
m_nBufSize:=0;
|
||||
m_eEncoding :=uni8Bit;
|
||||
end;
|
||||
|
||||
function UtfRead.Convert(buf : PChar; len : Cardinal) : Cardinal;
|
||||
var
|
||||
nSkip : Cardinal;
|
||||
newSize : Cardinal;
|
||||
pCur,pTemp : PByte;
|
||||
begin
|
||||
m_Iter16:=Utf16_Iter.Create;
|
||||
try
|
||||
m_pBuf := PByte(buf);
|
||||
m_nLen := len;
|
||||
|
||||
nSkip := 0;
|
||||
if (m_bFirstRead) then
|
||||
begin
|
||||
nSkip := DetectEncoding(m_pBuf,m_nLen,m_eEncoding);
|
||||
m_bFirstRead := False;
|
||||
end;
|
||||
|
||||
if (m_eEncoding = uni8bit) then
|
||||
begin
|
||||
// Do nothing, pass through
|
||||
m_nBufSize := 0;
|
||||
m_pNewBuf := m_pBuf;
|
||||
Result:=len;
|
||||
Exit;
|
||||
end;
|
||||
|
||||
if (m_eEncoding = uniUTF8) then
|
||||
begin
|
||||
// Pass through after BOM
|
||||
m_nBufSize := 0;
|
||||
m_pNewBuf := m_pBuf;
|
||||
Inc(m_pNewBuf,nSkip);
|
||||
Result:=len - nSkip;
|
||||
Exit;
|
||||
end;
|
||||
|
||||
// Else...
|
||||
//newSize := len + len div 2 + 1;
|
||||
newSize:=len*2+1;
|
||||
if (m_nBufSize <> newSize) then
|
||||
begin
|
||||
FreeMem(m_pNewBuf);
|
||||
m_pNewBuf:=nil;
|
||||
GetMem(m_pNewBuf,newSize);
|
||||
m_nBufSize := newSize;
|
||||
end;
|
||||
|
||||
pCur := m_pNewBuf;
|
||||
pTemp:=m_pBuf;
|
||||
Inc(pTemp,nSkip);
|
||||
m_Iter16.Set_(pTemp, len - nSkip, m_eEncoding);
|
||||
while(m_Iter16.More) do
|
||||
begin
|
||||
pCur^:=m_Iter16.Get;
|
||||
Inc(PCur);
|
||||
m_Iter16.Next;
|
||||
end;
|
||||
// Return number of bytes writen out
|
||||
finally
|
||||
FreeAndNil(m_Iter16);
|
||||
end;
|
||||
Result:=Cardinal(pCur) - Cardinal(m_pNewBuf);
|
||||
end;
|
||||
|
||||
constructor UtfWrite.Create;
|
||||
begin
|
||||
m_eEncoding := uni8bit;
|
||||
m_pFile := nil;
|
||||
m_pBuf := nil;
|
||||
m_bFirstWrite := true;
|
||||
m_nBufSize := 0;
|
||||
end;
|
||||
|
||||
procedure UtfWrite.SetEncoding(eType : UniMode);
|
||||
begin
|
||||
m_eEncoding := eType;
|
||||
end;
|
||||
procedure UtfWrite.SetDestStream(Value : TStream);
|
||||
begin
|
||||
m_pFile:=Value;
|
||||
m_bFirstWrite:=True;
|
||||
end;
|
||||
|
||||
function UtfWrite.Write(const Buffer; Count : Cardinal) : LongInt;
|
||||
var
|
||||
iter8 : Utf8_Iter;
|
||||
pCur : ^Utf16;
|
||||
ret : LongInt;
|
||||
pTemp : PChar;
|
||||
begin
|
||||
if Count=0 then
|
||||
begin
|
||||
Result:=0;
|
||||
Exit;
|
||||
end;
|
||||
iter8:=Utf8_Iter.Create;
|
||||
try
|
||||
if (not assigned(m_pFile)) then
|
||||
begin
|
||||
Result:=0;
|
||||
Exit;
|
||||
end;
|
||||
if (m_eEncoding = uni8bit) then
|
||||
begin
|
||||
// Normal write
|
||||
m_bFirstWrite:=False;
|
||||
Result:=m_pFile.Write(PChar(Buffer)^, Count);
|
||||
Exit;
|
||||
end;
|
||||
|
||||
if (m_eEncoding = uniUTF8) then
|
||||
begin
|
||||
pTemp:=PChar(Buffer);
|
||||
if (m_bFirstWrite) then
|
||||
begin
|
||||
m_pFile.Write(k_Boms[m_eEncoding], 3);
|
||||
m_bFirstWrite := false;
|
||||
end;
|
||||
Result:=m_pFile.Write(pTemp^, Count);
|
||||
Exit;
|
||||
end;
|
||||
|
||||
if (Count > m_nBufSize) then
|
||||
begin
|
||||
m_nBufSize := Count;
|
||||
if assigned(m_pBuf) then FreeMem(m_pBuf);
|
||||
m_pBuf := nil;
|
||||
GetMem(m_pBuf,SizeOf(Utf16)*(Count+1));
|
||||
end;
|
||||
|
||||
if (m_bFirstWrite) then
|
||||
begin
|
||||
if ((m_eEncoding = uni16BE) or (m_eEncoding = uni16LE)) then
|
||||
begin
|
||||
// Write the BOM
|
||||
m_pFile.Write(k_Boms[m_eEncoding],2);
|
||||
end;
|
||||
m_bFirstWrite := false;
|
||||
end;
|
||||
|
||||
iter8.set_(PByte(Buffer), Count, m_eEncoding);
|
||||
pCur := @m_pBuf[0];
|
||||
while(iter8.More) do
|
||||
begin
|
||||
if (iter8.canGet) then
|
||||
begin
|
||||
pCur^ := iter8.Get;
|
||||
Inc(pCur);
|
||||
end;
|
||||
iter8.Next;
|
||||
end;
|
||||
ret := m_pFile.Write(m_pBuf^,Cardinal(pCur)-Cardinal(m_pBuf));
|
||||
finally
|
||||
if assigned(iter8) then FreeAndNil(iter8);
|
||||
end;
|
||||
Result:=ret;
|
||||
end;
|
||||
|
||||
end.
|
Reference in New Issue
Block a user