MaxLogicFoundation/MaxLogic.TextFileEncodingHelper.pas at main · MaxLogic/MaxLogicFoundation

309 lines (256 loc) · 7.15 KB
﻿unit MaxLogic.TextFileEncodingHelper;
  // Files are often encoded with several encodings at once: one line is UTF-8, another is ANSI
  // (e.g. Windows-1250), and yet another line may use something else again.
  // This unit helps to load and process such files.
  // Remember that UTF-8 has these properties:
  //   - If the code point is < 128, it is represented by the corresponding byte value.
  //   - If the code point is >= 128, it is turned into a sequence of two, three, or four bytes,
  //     where each byte of the sequence is between 128 and 255.
  MaxLogic.BufferedFile,
  classes, sysUtils, RTTI, generics.collections;
  // Loads a text stream/file as raw bytes, splits it into lines on the first
  // line break style found in the data, and converts each line to a Unicode
  // string individually (UTF-8 or ANSI is detected per line).
  TTextFileEncodingHelper = Class
  private
    // Set to True by LoadAllText when the stream starts with a UTF-8 BOM.
    fUtf8BomFoundInFile: Boolean;
    // The line break sequence (1 or 2 bytes of #10/#13) found by DetectUsedLineBreak.
    fLineBreakChars: RawByteString;
    // The raw (not yet decoded) lines, without their line breaks.
    fLines: TArray<RawByteString>;
    // Finds the first #10/#13 in aBody and stores the break sequence in fLineBreakChars.
    // Returns False when aBody contains no line break at all.
    Function DetectUsedLineBreak(Const aBody: RawByteString): Boolean;
    // Reads the whole stream (minus a leading UTF-8 BOM) into Body; BodyLen is the byte count.
    Procedure LoadAllText(aStream: TStream; Out Body: RawByteString; Out BodyLen: Integer);
    // Getter for the count property: number of entries in fLines.
    Function GetCount: Integer;
    // Fills fLines from aBody using fLineBreakChars as separator.
    Procedure SplitLines(Const aBody: RawByteString);
    // Splitter used when fLineBreakChars is a single byte.
    Procedure SplitUsing1ByteLineBreak(Const aBody: RawByteString; Var aCount, aCapacity: Integer);
    // Splitter used when fLineBreakChars is two bytes.
    Procedure SplitUsing2ByteLineBreak(Const aBody: RawByteString; Var aCount, aCapacity: Integer);
    // Appends copy(aBody, index, len) to fLines, growing the array when aCount reaches aCapacity.
    Procedure AddLine(Const aBody: RawByteString; index, len: Integer; Var aCount, aCapacity: Integer);
  public
    Constructor Create;
    Destructor destroy; Override;
    // Resets the BOM flag, the detected line break and the loaded lines.
    Procedure Clear;
    Procedure LoadFromFile(Const aFileName: String);
    procedure loadFromStream(aStream: TStream);
    // Appends every loaded line, decoded to Unicode, to l.
    Procedure ConvertTo(l: TStringList); Overload;
    // saves the lines using utf8 to a stream
    Procedure ConvertTo(Utf8Stream: TStream; Const aLineBreak: String); Overload;
    // Decodes one raw line: UTF-8 when it is detected as such, otherwise a plain ANSI cast.
    Class Function RawToUnicode(Const s: RawByteString): String;
    Property Utf8BomFoundInFile: Boolean Read fUtf8BomFoundInFile;
    Property count: Integer Read GetCount;
    Property LineBreakChars: RawByteString Read fLineBreakChars;
  end;
Implementation
  {$IFDEF MSWINDOWS}
  maxInMemoryFile,
  ioUtils, ansiStrings, System.WideStrUtils;
{ TTextFileEncodingHelper }
// Creates an empty helper; no file is loaded yet.
Constructor TTextFileEncodingHelper.Create;
begin
  inherited Create;
end;
// Nothing to release explicitly here; simply chains to the inherited destructor.
Destructor TTextFileEncodingHelper.destroy;
begin
  inherited Destroy;
end;
// Loads aFileName and splits it into raw lines.
// Any previously loaded content is cleared first. If the file does not exist,
// nothing is loaded and no exception is raised, so the helper is left empty.
Procedure TTextFileEncodingHelper.LoadFromFile(Const aFileName: String);
var
  fs: TFileStream;
begin
  // start from a clean state so a missing file cannot leave stale lines behind
  Clear;
  if not TFile.Exists(aFileName) then
    Exit;

  // fmShareDenyWrite: we only read, so let other readers open the file too.
  // fmOpenRead on its own requests exclusive sharing on Windows.
  fs := TFileStream.Create(aFileName, fmOpenRead or fmShareDenyWrite);
  try
    loadFromStream(fs);
  finally
    fs.Free;
  end;
end;
// Reads the whole stream and splits it into raw lines.
// When the data contains no line break at all, the complete body becomes the
// single line. An empty stream (or one holding only a BOM) yields zero lines.
procedure TTextFileEncodingHelper.loadFromStream(aStream: TStream);
var
  Body: RawByteString;
  BodyLen: Integer;
begin
  // reset first: LoadAllText only ever sets the BOM flag, it never clears it,
  // and an empty stream would otherwise keep the lines of a previous load
  Clear;

  LoadAllText(aStream, Body, BodyLen);
  if BodyLen = 0 then
    Exit;

  if DetectUsedLineBreak(Body) then
    SplitLines(Body)
  else
    fLines := [Body];
end;
// Decodes every loaded line to a Unicode string and appends it to l.
// Existing entries of l are kept.
Procedure TTextFileEncodingHelper.ConvertTo(l: TStringList);
var
  lRawLine: RawByteString;
begin
  for lRawLine in fLines do
    l.Add(RawToUnicode(lRawLine));
end;
// Converts one raw line to a Unicode string.
// Lines detected as UTF-8 are decoded as UTF-8; pure ASCII and ANSI lines are
// converted with a plain String() cast.
Class Function TTextFileEncodingHelper.RawToUnicode(Const s: RawByteString): String;
begin
  if System.WideStrUtils.DetectUTF8Encoding(s) = etUTF8 then
    Result := Utf8ToString(s)
  else
    // etANSI and etUSASCII are handled the same way
    Result := String(s);
end;
// Stores copy(aBody, index, len) as the next line in fLines.
//   index     - 1-based start of the line inside aBody
//   len       - number of bytes; 0 (or less) stores an empty line
//   aCount    - number of lines used so far; incremented by one
//   aCapacity - allocated length of fLines; grown here when exhausted
Procedure TTextFileEncodingHelper.AddLine(Const aBody: RawByteString; index, len: Integer; Var aCount, aCapacity: Integer);
begin
  // inc the capacity?
  if aCount >= aCapacity then
  begin
    // doubling a capacity of 0 would stay 0 and the write below would be out of bounds
    if aCapacity <= 0 then
      aCapacity := 16
    else
      aCapacity := aCapacity * 2;
    SetLength(fLines, aCapacity);
  end;

  if len > 0 then
    fLines[aCount] := Copy(aBody, index, len)
  else
    fLines[aCount] := '';
  Inc(aCount);
end;
// Reads the complete stream into Body as raw bytes.
// A leading UTF-8 BOM is skipped and recorded in fUtf8BomFoundInFile.
//   Body    - receives the bytes after the optional BOM ('' when there are none)
//   BodyLen - receives length(Body)
// Raises EStreamError when the content does not fit into a single string.
Procedure TTextFileEncodingHelper.LoadAllText(aStream: TStream; Out Body: RawByteString; Out BodyLen: Integer);
var
  lSize: Int64;
begin
  Body := '';
  BodyLen := 0;

  aStream.Position := 0;
  lSize := aStream.Size;
  if lSize <= 0 then
    Exit;

  if System.WideStrUtils.HasUTF8BOM(aStream) then
  begin
    fUtf8BomFoundInFile := True;
    Dec(lSize, 3);
    aStream.Position := 3; // skip bom
  end
  else
    // do not depend on where the BOM probe left the stream position
    aStream.Position := 0;

  // TStream.Size is Int64; assigning it straight to an Integer would silently truncate
  if lSize > MaxInt then
    raise EStreamError.CreateFmt('Stream too large to load into memory (%d bytes)', [lSize]);

  BodyLen := Integer(lSize);
  // a stream that holds nothing but the BOM leaves no bytes; Body[1] must not be touched then
  if BodyLen > 0 then
  begin
    SetLength(Body, BodyLen);
    aStream.ReadBuffer(Body[1], BodyLen);
  end;
end;
// Returns the helper to its empty state: no lines, no detected line break, no BOM seen.
Procedure TTextFileEncodingHelper.Clear;
begin
  SetLength(fLines, 0);
  fLineBreakChars := '';
  fUtf8BomFoundInFile := False;
end;
// Looks for the first #10 or #13 in aBody and stores the line break that starts
// there in fLineBreakChars. When the following byte is the *other* line break
// character, the break is treated as a 2-byte sequence.
// Returns True when a line break was found; otherwise fLineBreakChars is left untouched.
Function TTextFileEncodingHelper.DetectUsedLineBreak(Const aBody: RawByteString): Boolean;
var
  lPos: Integer;
  lCurrent, lNext: AnsiChar;
begin
  for lPos := 1 to Length(aBody) do
  begin
    lCurrent := aBody[lPos];
    if not (lCurrent in [#10, #13]) then
      Continue;

    fLineBreakChars := lCurrent;
    // is it a 2 char line break?
    if lPos + 1 <= Length(aBody) then
    begin
      lNext := aBody[lPos + 1];
      if (lNext <> lCurrent) and (lNext in [#10, #13]) then
        fLineBreakChars := fLineBreakChars + lNext;
    end;
    Exit(True);
  end;
  Result := False;
end;
// Getter for the count property: the number of loaded lines.
Function TTextFileEncodingHelper.GetCount: Integer;
begin
  Exit(Length(fLines));
end;
// Splits aBody into fLines using the line break stored in fLineBreakChars.
// fLines is pre-allocated with a rough estimate, filled by the matching
// splitter, and finally shrunk to the number of lines actually found.
Procedure TTextFileEncodingHelper.SplitLines(Const aBody: RawByteString);
var
  lUsed, lAllocated: Integer;
begin
  lUsed := 0;
  // just a starting point; AddLine grows the array on demand
  lAllocated := (Length(aBody) div 10) + 1;
  SetLength(fLines, lAllocated);

  // single-byte breaks get their own routine because comparing one byte is cheaper
  case Length(fLineBreakChars) of
    1:
      SplitUsing1ByteLineBreak(aBody, lUsed, lAllocated);
  else
    SplitUsing2ByteLineBreak(aBody, lUsed, lAllocated);
  end;

  // truncate
  SetLength(fLines, lUsed);
end;
// Splits aBody on the single-byte line break fLineBreakChars[1] and appends
// every line through AddLine.
// Text after the last line break is added as a final line; a body that ends
// with a line break does not produce an extra empty line.
// Expects fLineBreakChars to hold at least one byte.
Procedure TTextFileEncodingHelper.SplitUsing1ByteLineBreak(Const aBody: RawByteString; Var aCount, aCapacity: Integer);
var
  i1: Integer; // 1-based start of the line currently being collected
  x: Integer;
  lBodyLen: Integer;
  b: Byte;
  pb1: PByte;
begin
  lBodyLen := Length(aBody);
  // nothing to split, and @aBody[1] must not be taken on an empty string
  if lBodyLen = 0 then
    Exit;

  b := Byte(fLineBreakChars[1]);
  i1 := 1;
  pb1 := @aBody[1];
  for x := 1 to lBodyLen do
  begin
    if pb1^ = b then
    begin
      AddLine(aBody, i1, x - i1, aCount, aCapacity);
      i1 := x + 1;
    end;
    Inc(pb1);
  end;

  // add the tail
  if i1 <= lBodyLen then
    AddLine(aBody, i1, (lBodyLen - i1) + 1, aCount, aCapacity);
end;
// Splits aBody on the two-byte line break stored in fLineBreakChars and appends
// every line through AddLine.
// Text after the last line break is added as a final line; a body that ends
// with a line break does not produce an extra empty line.
// Expects fLineBreakChars to hold two bytes.
Procedure TTextFileEncodingHelper.SplitUsing2ByteLineBreak(Const aBody: RawByteString; Var aCount, aCapacity: Integer);
var
  i1: Integer; // 1-based start of the line currently being collected
  x: Integer;
  lBodyLen: Integer;
  w: Word;
  pb1: PByte;
begin
  lBodyLen := Length(aBody);
  // nothing to split, and @aBody[1] must not be taken on an empty string
  if lBodyLen = 0 then
    Exit;

  // both bytes of the line break as one word, so a single compare per position is enough
  Move(fLineBreakChars[1], w, 2);
  i1 := 1;
  x := 1;
  pb1 := @aBody[1];
  // stop one byte before the end: a 2-byte break cannot start on the last byte,
  // and the word compare must not read past the body
  while x < lBodyLen do
  begin
    if PWord(pb1)^ = w then
    begin
      AddLine(aBody, i1, x - i1, aCount, aCapacity);
      // continue right behind the complete line break
      Inc(x, 2);
      Inc(pb1, 2);
      i1 := x;
    end
    else
    begin
      Inc(x);
      Inc(pb1);
    end;
  end;

  // add the tail
  if i1 <= lBodyLen then
    AddLine(aBody, i1, (lBodyLen - i1) + 1, aCount, aCapacity);
end;
// Writes all loaded lines to Utf8Stream, each one re-encoded as UTF-8.
// aLineBreak (also UTF-8 encoded) is written after every line, including the
// last one. No BOM is written.
Procedure TTextFileEncodingHelper.ConvertTo(Utf8Stream: TStream; Const aLineBreak: String);
var
  lRawLine: RawByteString;
  lBreakUtf8, lLineUtf8: TBytes;
begin
  // encode the separator once, it is the same for every line
  lBreakUtf8 := TEncoding.UTF8.GetBytes(aLineBreak);

  for lRawLine in fLines do
  begin
    lLineUtf8 := TEncoding.UTF8.GetBytes(RawToUnicode(lRawLine));
    // index [0] is only valid on non-empty arrays
    if Length(lLineUtf8) > 0 then
      Utf8Stream.WriteBuffer(lLineUtf8[0], Length(lLineUtf8));
    if Length(lBreakUtf8) > 0 then
      Utf8Stream.WriteBuffer(lBreakUtf8[0], Length(lBreakUtf8));
  end;
end;
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

MaxLogic.TextFileEncodingHelper.pas

Latest commit

History

MaxLogic.TextFileEncodingHelper.pas

File metadata and controls