Screenshot of CodeCentral item 28673
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE constants[
<!ENTITY notice "The XML-csv format was developed by Sean B. Durkin�www.seanbdurkin.id.au">
]>
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:fn="http://www.w3.org/2005/xpath-functions"
xmlns:local="http://seanbdurkin.id.au/xslt/csv2xml.xsl"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:csv="http://seanbdurkin.id.au/xslt/csv.xsd"
version="2.0" exclude-result-prefixes="xsl xs fn local">
<xsl:output indent="yes" encoding="UTF-8" />
<xsl:import-schema schema-location="http://seanbdurkin.id.au/xslt/csv.xsd"
use-when="system-property('xsl:is-schema-aware')='yes'"/>
<xsl:param name="url-of-csv" />
<!-- Constants -->
<xsl:variable name="quote" as="xs:string">"</xsl:variable>
<xsl:variable name="fail-open-message-xlations"> <!-- Localise here, as required. -->
<en>Could not open CSV resource </en>
<!-- Insert other language translations here, as needed. -->
</xsl:variable>
<xsl:variable name="lang" as="xs:string">en</xsl:variable> <!-- Language for $fail-open-message. -->
<xsl:variable name="fail-open-message">
<xsl:value-of select="$fail-open-message-xlations/en" />
</xsl:variable>
<xsl:function name="local:unparsed-text-lines" as="xs:string+">
<xsl:param name="href" as="xs:string" />
<xsl:sequence use-when="function-available('unparsed-text-lines')" select="fn:unparsed-text-lines($href)" />
<xsl:sequence use-when="not(function-available('unparsed-text-lines'))" select="tokenize(unparsed-text($href), '\r\n|\r|\n')[not(position()=last() and .='')]" />
</xsl:function>
<xsl:function name="local:error-node" as="node()">
<xsl:param name="message" as="xs:string" />
<xsl:param name="do-localise" as="xs:boolean" />
<csv:error>
<xsl:if test="$do-localise">
<xsl:attribute name="xml:lang"><xsl:value-of select="$lang"/></xsl:attribute>
</xsl:if>
<xsl:value-of select="$message" />
</csv:error>
</xsl:function>
<xsl:function name="local:csv-to-xml" as="node()+">
<xsl:param name="href" as="xs:string" />
<csv:comma-separated-values>
<csv:notice xml:lang="en">¬ice;</csv:notice> <!-- Notice language is fixed as per Schema. -->
<xsl:copy-of select="local:error-node($fail-open-message, fn:true())"/>
<xsl:choose>
<xsl:when test="fn:unparsed-text-available($href)">
<xsl:for-each select="local:unparsed-text-lines($href)">
<csv:row>
<xsl:analyze-string select="fn:concat(., ',')" regex='((("[^"]*")+[^,])*|[^,"]*),'>
<xsl:matching-substring>
<csv:cell>
<xsl:if test="fn:starts-with( fn:regex-group(1), $quote)">
<xsl:attribute name="quoted">
<xsl:value-of select="fn:true()" />
</xsl:attribute>
</xsl:if>
<xsl:value-of select='fn:replace(fn:regex-group(1), "^""|""$|("")""", "$1" )' />
</csv:cell>
</xsl:matching-substring>
<xsl:non-matching-substring>
<xsl:copy-of select="local:error-node(., fn:false())"/>
</xsl:non-matching-substring>
</xsl:analyze-string>
</csv:row>
</xsl:for-each>
</xsl:when>
<xsl:otherwise>
<xsl:copy-of select="local:error-node(fn:concat( $fail-open-message, $href), fn:true())"/>
</xsl:otherwise>
</xsl:choose>
</csv:comma-separated-values>
</xsl:function>
<xsl:template match="/" name="main" mode="csv-to-xml">
<xsl:copy-of select="local:csv-to-xml($url-of-csv)" />
</xsl:template>
<xsl:template match="csv" mode="xml-to-csv">
<xsl:apply-templates select="row" />
</xsl:template>
<xsl:template match="row" mode="xml-to-csv">
<xsl:apply-templates select="cell" />
<xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>
unit uCharacters;
interface
uses Generics.Collections, SysUtils, Classes;
type
UniChar = UCS4Char;
GB_CodePoint = LongWord;
IEncodingFamily = interface;
IEncoding = interface;
ICharCodec = interface;
ICharacterServices = interface
['{D4C7BFF8-CBE3-411A-AE29-6C281CBDE296}']
function FindEncodingFamily( const FamilyName: string; out Family: IEncodingFamily): boolean;
function FindEncoding( const EncodingName: string; out Encoding: IEncoding): boolean;
function EncodingFamilies: TEnumerable<IEncodingFamily>;
function Encodings: TEnumerable<IEncoding>;
function DefaultEncoding: IEncoding;
function NativeEncoding: IEncoding;
function UTF8: IEncoding;
function UTF16LE: IEncoding;
function ReadBOM( Document: TStream): IEncodingFamily;
procedure RegisterCustomEncoding( const Addend: IEncoding; const Family: IEncodingFamily);
end;
IEncodingFamily = interface
['{767A1283-319B-430D-B1C4-CE320DD9B1DA}']
function Name: string;
function BaseEncoding: IEncoding;
function Encodings: TEnumerable<IEncoding>;
function BytesPerCodeUnit: integer;
function BOM: TBytes;
end;
//Need to support:
// ISO-8859-1
// UTF-8
// UTF-16LE
// UTF-16BE
// GB 18030-2000 Could not find 2005
IEncoding = interface
['{F445A9B6-B975-4BBC-BFCE-3C318FC6F84A}']
function Name: string;
function Family: IEncodingFamily;
function Aliases: string; // '|' separated list of aliase's to Name.
function CharCodec: ICharCodec;
function isNative: boolean;
function CodePage: integer;
function ToString( const Encoded: RawByteString): string;
function FromString( const Native: string): RawByteString;
function ReadCharactersFromStream(
Document: TStream; RequestCount: integer;
out ReadCount, ReadBytes: Integer): string;
// Reads RequestCount whole unicode characters from the stream
// encodes the characters in the native encoding (UTF16-LE)
// and returns this read string.
// ReadCount the count of succesfully read unicode characters,
// and ReadBytes is the count of bytes read.
// If the encoding is UTF-16, orphaned lead or trail surrogates
// are silently dropped.
function ReadCharacterFromStream(
Document: TStream; out ReadBytes: Integer): UniChar;
// Similar to ReadCharactersFromStream, but only reads one
// character and that character is returned directly instead
// of being UTF-16LE encoded.
end;
ICharCodec = interface
['{BC0DA141-0A0A-48EA-83E3-99C7435D9F43}']
procedure SetSynced;
function isInSync: boolean;
function ReadUniChar( b: byte; var ch: UniChar): boolean;
function WriteUniChar( ch: UniChar; var b: byte): boolean;
end;
function UniChar_to_Char( Ch: UniChar; var Lead, Trail: Char): boolean;
// Converts unicode code-point Ch into UTF-16 code units.
// Returns True iff 2 code units are returned instead of 1.
// Invalid code-points are not checked for.
function Char_to_UniChar1( Lead: Char; var ch: UniChar): boolean;
// Converts a lead/solo UTF-16 code unit into a nicode code-point Ch.
// Returns True iff a trail code unit is required. If this is the case,
// invoke Char_to_UniChar2.
procedure Char_to_UniChar2( Trail: Char; var ch: UniChar);
// Converts a trail UTF-16 code unit into a nicode code-point Ch,
// following a call to Char_to_UniChar1.
function CharacterServices: ICharacterServices;
// Get the CharacterServices. This is a singleton object.
function NewCharacterServices: ICharacterServices;
// Makes a fresh instance of CharacterServices, perhaps for the purpose
// of temporary registration of a custom IEncoding.
function GBBytes_to_GB_CodePoint( GBBytes: LongRec; CodePointRangeCase: Integer): GB_CodePoint;
function GB_CodePoint_to_GBBytes( CodePoint: GB_CodePoint; out CodePointRangeCase: Integer): LongRec;
function ByteLengthOfRangeCase( CodePointRangeCase: integer): integer;
function EstimateRangeCase( ByteValue: byte; ByteIndex: Integer{0 or 1 only}): integer;
function IsValidByte( ByteValue: byte; ByteIndex: Integer; CodePointRangeCase: integer): boolean;
implementation
uses Generics.Defaults;
type
TEnumerable_Family = class( TEnumerable<IEncodingFamily>)
protected
FMembers: IInterfaceList; // of IEncodingFamily
end;
TEnumerable_Encoding = class( TEnumerable<IEncoding>)
protected
FMembers: IInterfaceList; // of IEncoding
end;
TCharacterServices = class( TInterfacedObject, ICharacterServices)
protected
FFamilies: IInterfaceList; // of IEncodingFamily
FFamilyEnc: TEnumerable_Family;
FEncodiEnc: TEnumerable_Encoding;
FDefaultEncoding: IEncoding;
FNativeEncoding: IEncoding;
FUTF8: IEncoding;
FUTF16LE: IEncoding;
public
function FindEncodingFamily( const FamilyName: string; out Family: IEncodingFamily): boolean;
function FindEncoding( const EncodingName: string; out Encoding: IEncoding): boolean;
function EncodingFamilies: TEnumerable<IEncodingFamily>;
function Encodings: TEnumerable<IEncoding>;
function DefaultEncoding: IEncoding;
function NativeEncoding: IEncoding;
function UTF8: IEncoding;
function UTF16LE: IEncoding;
function ReadBOM( Document: TStream): IEncodingFamily;
procedure RegisterCustomEncoding( const Addend: IEncoding; const Family: IEncodingFamily);
constructor Create;
destructor Destroy; override;
end;
TEncodingFamily = class abstract( TInterfacedObject, IEncodingFamily)
protected
FBaseEncoding: IEncoding;
FEncodings: TEnumerable_Encoding;
FMembers: IInterfaceList; // of IEncoding
FBytesPerCodeUnit: integer;
FBOM: TBytes;
function Name: string; virtual; abstract;
function BaseEncoding: IEncoding; virtual;
function Encodings: TEnumerable<IEncoding>;
function BytesPerCodeUnit: integer;
function BOM: TBytes;
public
constructor Create;
destructor Destroy; override;
end;
TEncoding = class abstract( TInterfacedObject, IEncoding)
protected
FFamily: pointer;{ cast as IEncodingFamily}
FAliases: string;
FCodePage: integer;
function Name: string; virtual; abstract;
function Family: IEncodingFamily;
function Aliases: string;
function CharCodec: ICharCodec; virtual; abstract;
function isNative: boolean; virtual;
function CodePage: integer;
function ToString( const Encoded: RawByteString): string; virtual;
function FromString( const Native: string): RawByteString; virtual;
function ReadCharactersFromStream(
Document: TStream; RequestCount: integer;
out ReadCount, ReadBytes: Integer): string;
function ReadCharacterFromStream(
Document: TStream; out ReadBytes: Integer): UniChar;
public
constructor Create;
destructor Destroy; override;
end;
TCharCodec = class abstract( TInterfacedObject, ICharCodec)
protected
procedure SetSynced; virtual;
function isInSync: boolean; virtual;
function ReadUniChar( b: byte; var ch: UniChar): boolean; virtual; abstract;
function WriteUniChar( ch: UniChar; var b: byte): boolean; virtual; abstract;
end;
TUTF16LECodec = class( TCharCodec)
protected
FByteIdx: integer;
FBuff: LongRec;
FNeedBytes: integer;
procedure SetSynced; override;
function isInSync: boolean; override;
function ReadUniChar( b: byte; var ch: UniChar): boolean; override;
function WriteUniChar( ch: UniChar; var b: byte): boolean; override;
end;
TUTF8Codec = class( TCharCodec)
protected
FByteIdx: integer;
FBuff: LongRec;
FNeedBytes: integer;
procedure SetSynced; override;
function isInSync: boolean; override;
function ReadUniChar( b: byte; var ch: UniChar): boolean; override;
function WriteUniChar( ch: UniChar; var b: byte): boolean; override;
end;
TUTF8Family = class ( TEncodingFamily)
protected
function Name: string; override;
function BaseEncoding: IEncoding; override;
end;
TUTF16Family = class abstract( TUTF8Family)
protected
function Name: string; override;
function BaseEncoding: IEncoding; override;
end;
TUTF16LEFamily = class ( TUTF16Family)
protected
function BaseEncoding: IEncoding; override;
end;
TUTF16BEFamily = class ( TUTF16Family)
protected
function BaseEncoding: IEncoding; override;
end;
TUTF8Encoding = class ( TEncoding)
protected
function Name: string; override;
function CharCodec: ICharCodec; override;
function isNative: boolean; override;
function ToString( const Encoded: RawByteString): string; override;
function FromString( const Native: string): RawByteString; override;
end;
TUTF16LEEncoding = class ( TEncoding)
protected
function Name: string; override;
function CharCodec: ICharCodec; override;
function isNative: boolean; override;
function ToString( const Encoded: RawByteString): string; override;
function FromString( const Native: string): RawByteString; override;
end;
TUTF16BEEncoding = class ( TEncoding)
protected
function Name: string; override;
function CharCodec: ICharCodec; override;
function isNative: boolean; override;
function ToString( const Encoded: RawByteString): string; override;
function FromString( const Native: string): RawByteString; override;
end;
ISO_8859_1 = class sealed( TEncoding)
protected
function Name: string; override;
function CharCodec: ICharCodec; override;
function isNative: boolean; override;
function ToString( const Encoded: RawByteString): string; override;
function FromString( const Native: string): RawByteString; override;
end;
// Static data for stock families.
RStockFamilyStaticDatum = record
Family : TClass;
Name : string;
BOM : RawByteString;
nCodeUnit : integer; // Bytes per code unit.
nMaxCodeUnits: integer; // Maximum code unit per unicode character.
end;
const
StockFamilyData: array[0..0] of RStockFamilyStaticDatum = (
(Family: nil; Name: ''; BOM: ''; nCodeUnit: 0; nMaxCodeUnits: 0)
);
type
// Static data for stock Encodings.
RStockEncodingStaticDatum = record
Family : TClass;
Encoding : TClass;
Name : string;
nMaxCodeUnits: integer; // Maximum code unit per unicode character.
Aliases : string;
CodePage : integer; // 0 = no code page applicable.
CanAutoSync : Boolean;
end;
const
StockEncodingData: array[0..0] of RStockEncodingStaticDatum = (
(Family: nil; Encoding: nil; Name: ''; nMaxCodeUnits: 0; Aliases: ''; CodePage: 0; CanAutoSync: False)
);
ReplacementCharacter: UniChar = $00FFFD;
ReplacementChar : Char = #$FFFD;
function IsTailSurrogate( ch: Char): boolean; inline;
begin
result := (ch >= #$D800) and (ch <= #$DBFF)
end;
function IsLeadSurrogate( ch: Char): boolean; inline;
begin
result := (ch >= #$DC00) and (ch <= #$DFFF)
end;
function UniChar_to_Char( Ch: UniChar; var Lead, Trail: Char): boolean;
begin
end;
function Char_to_UniChar1( Lead: Char; var ch: UniChar): boolean;
begin
end;
procedure Char_to_UniChar2( Trail: Char; var ch: UniChar);
begin
end;
function CharacterServices: ICharacterServices;
begin
end;
function NewCharacterServices: ICharacterServices;
begin
end;
//http://www.iana.org/assignments/character-setswindows-1251 http://tlt.its.psu.edu/suggestions/international/bylanguage/index.html
//http://www.sagehill.net/docbookxsl/CharEncoding.html
//
//u = the unicode code point. Can be stored in 32 bits.
//l = lead word
//t = trail word
// u: [$000000..$10FFFF]-[$00D800..$00DFFF];
//
//if u >= $010000:
// l := ((u - $010000) shr 10 ) + $D800
// l == [$D800..$DBFF]
// t := ((u - $010000) and $0003FF) + $DC00
// t == [$DC00..0xDFFF]
//
//if u < $010000:
// l := u
// l == [$0000..$D7FF]+[$E000..$FFFF];
// (no t)
//
//if (l <= $D7FF) or (l >= $E000):
// (no t)
// u := l
//
//if (l >= $D800) and (l <= $DBFF):
// u := ((l - $D800) shl 10) + (t - $DC00) + $010000
{ TCharacterServices }
constructor TCharacterServices.Create;
begin
end;
function TCharacterServices.DefaultEncoding: IEncoding;
begin
end;
destructor TCharacterServices.Destroy;
begin
inherited;
end;
function TCharacterServices.EncodingFamilies: TEnumerable<IEncodingFamily>;
begin
end;
function TCharacterServices.Encodings: TEnumerable<IEncoding>;
begin
end;
function TCharacterServices.FindEncoding(const EncodingName: string;
out Encoding: IEncoding): boolean;
begin
end;
function TCharacterServices.FindEncodingFamily(const FamilyName: string;
out Family: IEncodingFamily): boolean;
begin
end;
function TCharacterServices.NativeEncoding: IEncoding;
begin
end;
function TCharacterServices.ReadBOM( Document: TStream): IEncodingFamily;
begin
end;
procedure TCharacterServices.RegisterCustomEncoding(
const Addend: IEncoding; const Family: IEncodingFamily);
begin
end;
function TCharacterServices.UTF16LE: IEncoding;
begin
end;
function TCharacterServices.UTF8: IEncoding;
begin
end;
{ TEncodingFamily }
function TEncodingFamily.BaseEncoding: IEncoding;
begin
end;
function TEncodingFamily.BOM: TBytes;
begin
end;
function TEncodingFamily.BytesPerCodeUnit: integer;
begin
end;
constructor TEncodingFamily.Create;
begin
end;
destructor TEncodingFamily.Destroy;
begin
inherited;
end;
function TEncodingFamily.Encodings: TEnumerable<IEncoding>;
begin
end;
{ TEncoding }
function TEncoding.Aliases: string;
begin
end;
function TEncoding.CodePage: integer;
begin
end;
constructor TEncoding.Create;
begin
end;
destructor TEncoding.Destroy;
begin
inherited;
end;
function TEncoding.Family: IEncodingFamily;
begin
end;
function TEncoding.FromString( const Native: string): RawByteString;
begin
end;
function TEncoding.isNative: boolean;
begin
result := False
end;
function TEncoding.ReadCharacterFromStream(
Document: TStream;
out ReadBytes: Integer): UniChar;
var
Codec: ICharCodec;
b: byte;
Ok: boolean;
begin
Codec := CharCodec;
ReadBytes := 0;
repeat
Ok := Document.Read( b, 1) = 1;
if Ok then
Inc( ReadBytes)
until (not Ok) or (not Codec.ReadUniChar( b, result));
if Ok then exit;
if ReadBytes <> 0 then
result := ReplacementCharacter
else
result := 0
end;
function TEncoding.ReadCharactersFromStream(
Document: TStream;
RequestCount: integer; out ReadCount, ReadBytes: Integer): string;
var
Codec: ICharCodec;
b: byte;
Ok: boolean;
ch: unichar;
Lead, Trail: Char;
Idx: integer;
RequestIndex: integer;
BytesReadThisChar: integer;
begin
Codec := CharCodec;
ReadBytes := 0;
Idx := 1;
ReadCount := 0;
SetLength( result, RequestCount*2); // Estimate average 2 bytes per character.
for RequestIndex := 1 to RequestCount do
begin
BytesReadThisChar := 0;
repeat
Ok := Document.Read( b, 1) = 1;
if Ok then
Inc( BytesReadThisChar)
until (not Ok) or (not Codec.ReadUniChar( b, ch));
Inc( ReadBytes, BytesReadThisChar);
if BytesReadThisChar = 0 then break;
if not Ok then
ch := ReplacementCharacter; // Partial character at the end of the stream.
Inc( ReadCount);
if UniChar_to_Char( Ch, Lead, Trail) then
begin
if Idx >= Length( result) then
SetLength( result, Idx+1);
result[Idx] := Lead;
Inc( Idx);
result[Idx] := Trail;
Inc( Idx)
end
else
begin
if Idx > Length( result) then
SetLength( result, Idx);
result[Idx] := Lead;
Inc( Idx)
end
end;
SetLength( result, Idx - 1)
end;
function TEncoding.ToString( const Encoded: RawByteString): string;
begin
end;
{ TCharCodec }
function TCharCodec.isInSync: boolean;
begin
result := True
end;
procedure TCharCodec.SetSynced;
begin
end;
{ TUTF8Family }
function TUTF8Family.BaseEncoding: IEncoding;
begin
end;
function TUTF8Family.Name: string;
begin
end;
{ TUTF16Family }
function TUTF16Family.BaseEncoding: IEncoding;
begin
end;
function TUTF16Family.Name: string;
begin
end;
{ TUTF16LEFamily }
function TUTF16LEFamily.BaseEncoding: IEncoding;
begin
end;
{ TUTF16BEFamily }
function TUTF16BEFamily.BaseEncoding: IEncoding;
begin
end;
{ TUTF8Encoding }
function TUTF8Encoding.CharCodec: ICharCodec;
begin
result := TUTF8Codec.Create
end;
function TUTF8Encoding.FromString( const Native: string): RawByteString;
begin
end;
function TUTF8Encoding.isNative: boolean;
begin
result := False
end;
function TUTF8Encoding.Name: string;
begin
result := 'utf-8'
end;
function TUTF8Encoding.ToString( const Encoded: RawByteString): string;
begin
end;
{ TUTF16LEEncoding }
function TUTF16LEEncoding.CharCodec: ICharCodec;
begin
end;
function TUTF16LEEncoding.FromString(const Native: string): RawByteString;
begin
end;
function TUTF16LEEncoding.isNative: boolean;
begin
result := True
end;
function TUTF16LEEncoding.Name: string;
begin
result := 'utf-16le'
end;
function TUTF16LEEncoding.ToString(const Encoded: RawByteString): string;
begin
end;
{ TUTF16BEEncoding }
function TUTF16BEEncoding.CharCodec: ICharCodec;
begin
end;
function TUTF16BEEncoding.FromString(const Native: string): RawByteString;
begin
end;
function TUTF16BEEncoding.isNative: boolean;
begin
end;
function TUTF16BEEncoding.Name: string;
begin
end;
function TUTF16BEEncoding.ToString(const Encoded: RawByteString): string;
begin
end;
{ ISO_8859_1 }
function ISO_8859_1.CharCodec: ICharCodec;
begin
end;
function ISO_8859_1.FromString(const Native: string): RawByteString;
begin
end;
function ISO_8859_1.isNative: boolean;
begin
end;
function ISO_8859_1.Name: string;
begin
end;
function ISO_8859_1.ToString(const Encoded: RawByteString): string;
begin
end;
{ TUTF16LECodec }
function TUTF16LECodec.ReadUniChar( b: byte; var ch: UniChar): boolean;
begin
FBuff.Bytes[FByteIdx] := b;
Inc( FByteIdx);
if FByteIdx = 2 then
begin
if IsTailSurrogate( Char( FBuff.Words[0])) then
begin
ch := ReplacementCharacter;
result := False;
Exit
end;
if Char_to_UniChar1( Char( FBuff.Words[0]), ch) then
FNeedBytes := 2
else
FNeedBytes := 0;
end;
result := FByteIdx < (FNeedBytes + 2);
if result then exit;
if FNeedBytes = 2 then
begin
if not IsTailSurrogate( Char( FBuff.Words[0])) then
begin
ch := ReplacementCharacter;
result := False;
Exit
end;
Char_to_UniChar2( Char( FBuff.Words[1]), ch);
end;
FByteIdx := 0;
FNeedBytes := 0;
Cardinal( FBuff) := 0
end;
function TUTF16LECodec.WriteUniChar( ch: UniChar; var b: byte): boolean;
begin
if FByteIdx = 0 then
if UniChar_to_Char( Ch, Char( FBuff.Words[0]), Char( FBuff.Words[0])) then
FNeedBytes := 2
else
FNeedBytes := 0;
b := LongRec(ch).Bytes[FByteIdx];
Inc( FByteIdx);
result := FByteIdx < (FNeedBytes + 2);
if result then exit;
FByteIdx := 0;
FNeedBytes := 0;
Cardinal( FBuff) := 0
end;
function TUTF16LECodec.isInSync: boolean;
begin
result := FByteIdx = 0
end;
procedure TUTF16LECodec.SetSynced;
begin
FByteIdx := 0;
FNeedBytes := 0;
Cardinal( FBuff) := 0
end;
{ TUTF8Codec }
function TUTF8Codec.ReadUniChar( b: byte; var ch: UniChar): boolean;
begin // he replacement character "�" (U+FFFD)
FBuff.Bytes[FByteIdx] := b;
Inc( FByteIdx);
if FByteIdx = 1 then
begin
if b <= $7F then
FNeedBytes := 0 // single byte
else if b <= $BF then
begin
// continuation
ch := ReplacementCharacter;
result := False;
Exit
end
else if b <= $DF then
FNeedBytes := 1
else if b <= $EF then
FNeedBytes := 2
else if b <= $F7 then
FNeedBytes := 3
else
begin
// Invalid encoding.
ch := ReplacementCharacter;
result := False;
Exit
end
end
else if (b > $BF) or (b <= $DF) then
begin
ch := ReplacementCharacter;
Result := False;
exit
end;
result := FByteIdx < (FNeedBytes + 1);
if result then exit;
if not result then
begin
ch := 0;
case FNeedBytes of
0: ch := FBuff.Bytes[0];
1: ch := (((FBuff.Bytes[0] and $1F) shl 6) or (FBuff.Bytes[1] and $3F));
2: ch := ((FBuff.Bytes[0] and $0F) shl 12) or ((FBuff.Bytes[1] and $3F) shl 6) or (FBuff.Bytes[2] and $3F);
3: ch := ((FBuff.Bytes[0] and $07) shl 18) or ((FBuff.Bytes[1] and $3F) shl 12) or ((FBuff.Bytes[2] and $3F) shl 6) or (FBuff.Bytes[3] and $3F);
end
end;
FByteIdx := 0;
FNeedBytes := 0;
Cardinal( FBuff) := 0
end;
function TUTF8Codec.WriteUniChar( ch: UniChar; var b: byte): boolean;
begin
Cardinal( FBuff) := 0;
if ch <= $00007F then
begin
FNeedBytes := 0; // single byte
FBuff.Bytes[0] := ch
end
else if ch <= $0007FF then
begin
FNeedBytes := 1; // 2 bytes
FBuff.Bytes[0] := (ch shr 6 ) or $C0;
FBuff.Bytes[1] := ((ch and $3F) and $C0) or $80
end
else if ch <= $00FFFF then
begin
FNeedBytes := 2; // 3 bytes
FBuff.Bytes[0] := ( ch shr 12 ) or $E0;
FBuff.Bytes[1] := ((ch shr 6 ) and $C0) or $80;
FBuff.Bytes[2] := ((ch and $3F) and $C0) or $80
end
else
begin
FNeedBytes := 3; // 4 bytes
FBuff.Bytes[0] := ((ch shr 18 ) and $F8) or $F0;
FBuff.Bytes[1] := ((ch shr 12 ) and $C0) or $80;
FBuff.Bytes[2] := ((ch shr 6 ) and $C0) or $80;
FBuff.Bytes[3] := ((ch and $3F) and $C0) or $80
end;
//http://www.herongyang.com/Unicode/index.html
// and http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
// table:
// range:
// unicode code-point
// gb code point (mapped to/from 4 byte)
// range length
//1 0x00 - 0x7F : 128 MSB = 128 cum: 128
//2 0x81 - 0xFE 0x40 - 0x7E : 126 * 63 = 7938 cum: 8066
//2 0x81 - 0xFE 0x80 - 0xFE : 126 * 127 = 16002 cum: 24068
//4 0x81 - 0xFE 0x30 - 0x39 0x81 - 0xFE 0x30 - 0x39 : 126 * 10 * 126 * 10 = 1587600 cum: 1611668
end;
// Also read http://www.unicode.org/reports/tr22/
// and http://site.icu-project.org/charts/charset
// http://icu-project.org/docs/papers/gb18030.html
//
//xmlns:http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000
//url=http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
//file=C:\TEMP\Projects\Third Party\PascalRegex\gb-18030-2000.xml
//
///characterMapping/assignments/a/@u : string regex=[0-9A-F]{4,6}
///characterMapping/assignments/a/@b : string* regex=[0-9A-F]{2}
///characterMapping/assignments/a/range/@uFirst: string regex=[0-9A-F]{4,6}
///characterMapping/assignments/a/range/@uLast : string regex=[0-9A-F]{4,6}
///characterMapping/assignments/a/range/@bFirst: string* regex=[0-9A-F]{2}
///characterMapping/assignments/a/range/@bLast : string* regex=[0-9A-F]{2}
function GBBytes_to_GB_CodePoint( GBBytes: LongRec; CodePointRangeCase: Integer): GB_CodePoint;
begin
result := 0;
case CodePointRangeCase of
0: result := GBBytes.Bytes[0];
1: result := ((GBBytes.Bytes[0] - $81) * 126) + (GBBytes.Bytes[1] - $40) + 128;
2: result := ((GBBytes.Bytes[0] - $81) * 126) + (GBBytes.Bytes[1] - $80) + 8066;
3: result := ((GBBytes.Bytes[0] - $81) * 158760) + ((GBBytes.Bytes[1] - $30) * 1260) + ((GBBytes.Bytes[2] - $81) * 126) + (GBBytes.Bytes[3] - $30) + 24068;
end;
end;
function GB_CodePoint_to_GBBytes( CodePoint: GB_CodePoint; out CodePointRangeCase: Integer): LongRec;
begin
cardinal( result) := 0;
if CodePoint < 128 then
begin
CodePointRangeCase := 0;
Result.Bytes[0] := CodePoint
end
else if CodePoint < 8066 then
begin
CodePointRangeCase := 1;
Dec( CodePoint, 128);
Result.Bytes[0] := (CodePoint div 126) + $81;
Result.Bytes[1] := (CodePoint mod 126) + $40
end
else if CodePoint < 24068 then
begin
CodePointRangeCase := 2;
Dec( CodePoint, 8066);
Result.Bytes[0] := (CodePoint div 126) + $81;
Result.Bytes[1] := (CodePoint mod 126) + $80
end
else
begin
CodePointRangeCase := 3;
Dec( CodePoint, 24068);
result.Bytes[2] := (CodePoint div 10) + $81;
result.Bytes[3] := (CodePoint mod 10) + $30;
CodePoint := result.Bytes[2];
result.Bytes[0] := (CodePoint div 126) + $81;
result.Bytes[1] := (CodePoint mod 126) + $30
end
end;
function ByteLengthOfRangeCase( CodePointRangeCase: integer): integer;
begin
case CodePointRangeCase of
0: result := 1;
1,2: result := 2;
3: result := 4;
end;
end;
function EstimateRangeCase( ByteValue: byte; ByteIndex: Integer{0 or 1 only}): integer;
begin
if ByteIndex = 0 then
begin
if ByteValue < 128 then
result := 0
else
result := 1
end
else if ByteIndex = 1 then
begin
if ByteValue <= $30 then
result := 3
else if ByteValue <= $7E then
result := 1
else if ByteValue <= $FE then
result := 2
end;
end;
function IsValidByte( ByteValue: byte; ByteIndex: Integer; CodePointRangeCase: integer): boolean;
begin
case CodePointRangeCase of
0: result := ByteValue < 128;
1,2: if ByteIndex = 0 then
result := (ByteValue >= $81) and (ByteValue <= $FE)
else
result := (ByteValue >= $40) and (ByteValue <= $FE) and (ByteValue <> $7F);
3: case ByteIndex of
0: result := (ByteValue >= $81) and (ByteValue <= $FE);
1: result := (ByteValue >= $30) and (ByteValue <= $39);
2: result := (ByteValue >= $81) and (ByteValue <= $FE);
3: result := (ByteValue >= $30) and (ByteValue <= $39);
end;
end;
end;
function TUTF8Codec.isInSync: boolean;
begin
result := FByteIdx = 0
end;
procedure TUTF8Codec.SetSynced;
begin
FByteIdx := 0;
FNeedBytes := 0;
Cardinal( FBuff) := 0
end;
end.