unit wopruniloader;
{$ifdef fpc}
{$mode objfpc}{$H+}
{$endif}

interface

uses Classes, Types;

type

  TUniFileTypes = (ufUndefined, ufUtf8, ufUtf16be, ufUtf16le, ufUtf32be, ufUtf32le);

  TUniLoader = class
  private
    fStream: TStream;
    fUniFileType: TUniFileTypes;
    fHaveType: Boolean;
    function GetOffset: integer;
    function GetFileType: TUniFileTypes;
  protected
    procedure CheckFileType;
  public
    constructor Create(AStream: TStream);
    destructor Destroy; override;
    function GetFullString: string;
    function GetFileTypeStr: string;
    property FileType: TUniFileTypes read GetFileType;
    property Offset: integer read GetOffset;
  published

  end;
function LoadUTF8FromFile(FileName:UTF8String): UTF8String; overload;
function LoadUTF8FromFile(FileName:UTF8String; var FileType:String): UTF8String; overload;
function SwapEndian(const AValue: Word): Word; overload;
function SwapEndian(const AValue: DWord): DWord; overload;
procedure WideSwapEndian(PWC: PWideChar);
procedure UCSwapEndian(var UC: UCS4String);

const
  UTF8BOM: string = #$EF#$BB#$BF;
  UTF16BEBOM: string = #$FE#$FF;
  UTF16LEBOM: string = #$FF#$FE;
  UTF32BEBOM: string = #$00#$00#$FE#$FF;
  UTF32LEBOM: string = #$FF#$FE#$00#$00;


implementation

uses Sysutils, LConvEncoding, FileUtil;

Function LoadUTF8FromFile(FileName:UTF8String; var FileType:String): UTF8String;
Var Strm: TMemoryStream;
  Ul: TUniLoader;
  Enc: String;
Begin
  Strm := TMemoryStream.Create;
  Try
    Strm.LoadFromFile(UTF8ToSys(FileName));
    Ul := TUniLoader.Create(Strm);
    Result := Ul.GetFullString;
    //if the file has no BOM:
    If Ul.FileType=ufUnDefined Then
      Begin
        Enc := LConvencoding.GuessEncoding(Result);
        If Enc<>EncodingUTF8 Then Result := ConvertEncoding(Result,Enc,EncodingUTF8);
        FileType:=Enc;
      End else FileType:=Ul.GetFileTypeStr;
    Ul.free;
  Finally
    Strm.free;
  End;
End;

function LoadUTF8FromFile(FileName:UTF8String): UTF8String; overload;
var Aft:String;
begin
 Result:=LoadUTF8FromFile(FileName, Aft);
end;

function SwapEndian(const AValue: Word): Word;
begin
  Result := Word((AValue shr 8) or (AValue shl 8));
end;


function SwapEndian(const AValue: DWord): DWord;
begin
  Result := (AValue shl 24)
    or ((AValue and $0000FF00) shl 8)
    or ((AValue and $00FF0000) shr 8)
    or (AValue shr 24);
end;


procedure WideSwapEndian(PWC: PWideChar);
begin
  while PWC^ <> #0 do
  begin
    PWC^ := WideChar(SwapEndian(Word(PWC^)));
    inc(PWC);
  end;
end;

procedure UCSwapEndian(var UC: UCS4String);
var i: integer;
begin
  for i := 0 to Length(UC) do UC[i] := SwapEndian(DWord(UC[i]));
end;

{ TUniLoader }

procedure TUniLoader.CheckFileType;
var ASt: string[5];
begin
  Ast := #0#0#0#0#0;
  fUniFileType := ufUndefined;
  fStream.Position := 0;
  if fStream.Read(ASt[1], 4) = 4 then
  begin
    if Copy(Ast, 1, 3) = UTF8BOM then fUniFileType := ufUtf8 else
      if Copy(Ast, 1, 4) = UTF32LEBOM then fUniFileType := ufUtf32le else
        if Copy(Ast, 1, 4) = UTF32BEBOM then fUniFileType := ufUtf32be else
          if Copy(Ast, 1, 2) = UTF16LEBOM then fUniFileType := ufUtf16le else
            if Copy(Ast, 1, 2) = UTF16BEBOM then fUniFileType := ufUtf16be;
    fStream.Position := 0;
    fHaveType := True;
  end;
end;

constructor TUniLoader.Create(AStream: TStream);
begin
  inherited Create;
  fStream := AStream;
  fHaveType := False;
end;

destructor TUniLoader.Destroy;
begin
  inherited;
end;

function TUniLoader.GetFileType: TUniFileTypes;
begin
  if not fHaveType then CheckFileType;
  Result := fUniFileType;
end;

function TUniLoader.GetFullString: string;
var Strm: TMemoryStream;
  PWC: PWideChar;
  PC: PChar;
  UC: UCS4String;
  aPtr: PChar;
begin
  if not fHaveType then CheckFileType;
  if fStream is TMemoryStream then Strm := TMemoryStream(fStream) else
  begin
    Strm := TMemoryStream.Create;
    Strm.LoadFromStream(fStream);
  end;
  Strm.Position := 0;
  case fUniFileType of
    ufUndefined:
      begin
        PC := Strm.Memory;
        Result := Copy(PC, 1, (Strm.size));
      end;
    ufUtf8:
      begin
        PC := Strm.Memory;
        Result := Copy(PC, 4, (Strm.size - 3));
      end;
    ufUtf16be, ufUtf16le:
      begin
        PWC := Strm.Memory;
        if fUniFileType = ufUtf16be then WideSwapEndian(PWC);
        Result := UTF8Encode(Copy(PWC, 2, (Strm.size - 1) div 2));
      end;
    ufUtf32be, ufUtf32le:
      begin
        aPtr := Strm.Memory;
        inc(aPtr, 4);
        SetLength(UC, Strm.size div 4);
        Move(aPtr^, UC[0], Strm.Size - 4);
        if fUniFileType = ufUtf32be then UCSwapEndian(UC);
        Result := UTF8Encode(UCS4StringToWideString(UC));
        SetLength(UC,0);
      end;
  end;
  if not (fStream is TMemoryStream) then Strm.free;
end;

function TUniLoader.GetOffset: integer;
begin
  if not fHaveType then CheckFileType;
  case fUniFileType of
    ufUndefined: Result := 0;
    ufUtf8: Result := 3;
    ufUtf16be, ufUtf16le: Result := 2;
    ufUtf32be, ufUtf32le: Result := 4;
  end;
end;

function TUniLoader.GetFileTypeStr: string;
begin
  if not fHaveType then CheckFileType;
  case fUniFileType of
    ufUndefined: Result := 'Undefined';
    ufUtf8:      Result := 'Utf8';
    ufUtf16be:   Result := 'Utf16be';
    ufUtf16le:   Result := 'Utf16le';
    ufUtf32be:   Result := 'Utf32be';
    ufUtf32le:   Result := 'Utf32le';
  end;
end;

end.

