-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathMaxLogic.TextFileEncodingHelper.pas
More file actions
309 lines (256 loc) · 7.15 KB
/
MaxLogic.TextFileEncodingHelper.pas
File metadata and controls
309 lines (256 loc) · 7.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
unit MaxLogic.TextFileEncodingHelper;
{
Files are often encoded with more than one encoding: one line is UTF-8, another line is ANSI (e.g. Windows-1250), and yet another line may use something else.
This unit helps to load and process such files.
Remember that UTF-8 has these properties:
If the code point is < 128, it’s represented by the corresponding byte value.
If the code point is >= 128, it’s turned into a sequence of two, three, or four bytes, where each byte of the sequence is between 128 and 255.
}
Interface
Uses
MaxLogic.BufferedFile,
classes, sysUtils, RTTI, generics.collections;
Type
// Loads text as raw bytes, splits it into lines and converts every line to a
// Unicode string on its own, so each line can use a different encoding
// (UTF-8 or ANSI, see RawToUnicode).
TTextFileEncodingHelper = Class
Private
// True when the last load found a UTF-8 BOM at the start of the data
fUtf8BomFoundInFile: Boolean;
// the first line break sequence found in the loaded data: one or two bytes out of #10/#13
fLineBreakChars: RawByteString;
// the undecoded bytes of every line, without the line break
fLines: TArray<RawByteString>;
// finds the first #10/#13 in aBody and stores the break sequence in fLineBreakChars;
// returns False when aBody contains no line break at all
Function DetectUsedLineBreak(Const aBody: RawByteString): Boolean;
// reads the whole stream (minus a leading UTF-8 BOM) into Body; BodyLen is the number of bytes read
Procedure LoadAllText(aStream: TStream; Out Body: RawByteString; Out BodyLen: Integer);
// getter of the count property: the number of entries in fLines
Function GetCount: Integer;
// fills fLines by splitting aBody at every occurrence of fLineBreakChars
Procedure SplitLines(Const aBody: RawByteString);
// split workers for a 1-byte and a 2-byte fLineBreakChars;
// aCount/aCapacity track the used and allocated entries of fLines
Procedure SplitUsing1ByteLineBreak(Const aBody: RawByteString; Var aCount, aCapacity: Integer);
Procedure SplitUsing2ByteLineBreak(Const aBody: RawByteString; Var aCount, aCapacity: Integer);
// stores copy(aBody, index, len) as the next line, growing fLines when it is full
Procedure AddLine(Const aBody: RawByteString; index, len: Integer; Var aCount, aCapacity: Integer);
Public
Constructor Create;
Destructor destroy; Override;
// drops all loaded lines and resets Utf8BomFoundInFile and LineBreakChars
Procedure Clear;
// loads aFileName; when the file does not exist the helper is just cleared
Procedure LoadFromFile(Const aFileName: String);
// replaces the current content with the content of aStream, read from position 0
procedure loadFromStream(aStream: TStream);
// clears l and adds every line, converted with RawToUnicode
Procedure ConvertTo(l: TStringList); Overload;
// writes every line to Utf8Stream as UTF-8 bytes, each line (including the last) followed by aLineBreak
Procedure ConvertTo(Utf8Stream: TStream; Const aLineBreak: String); Overload;
// converts one raw line: decoded as UTF-8 when DetectUTF8Encoding reports UTF-8, otherwise via a String() cast
Class Function RawToUnicode(Const s: RawByteString): String;
Property Utf8BomFoundInFile: Boolean Read fUtf8BomFoundInFile;
// number of loaded lines
Property count: Integer Read GetCount;
Property LineBreakChars: RawByteString Read fLineBreakChars;
End;
Implementation
Uses
{$IFDEF MSWINDOWS}
maxInMemoryFile,
{$ENDIF}
ioUtils, ansiStrings, System.WideStrUtils;
{ TTextFileEncodingHelper }
// Creates an empty helper; all fields start with their default (empty) values.
Constructor TTextFileEncodingHelper.Create;
Begin
  Inherited Create;
End;
// Nothing of its own to release here; only the ancestor's cleanup runs.
Destructor TTextFileEncodingHelper.destroy;
Begin
  Inherited destroy;
End;
// Loads the file aFileName and splits it into raw lines.
// The previous content is always dropped. A file that does not exist is not
// treated as an error: the helper simply stays empty.
// Raises the usual stream exceptions (e.g. EFOpenError) when the file exists
// but cannot be opened or read.
Procedure TTextFileEncodingHelper.LoadFromFile(Const aFileName: String);
Var
  lFile: TFileStream;
Begin
  Clear;
  If Not TFile.Exists(aFileName) Then
    Exit;

  // Open with an explicit share mode. fmOpenRead on its own carries no share
  // flag, which makes the open fail as soon as another process holds the file.
  // fmShareDenyWrite still allows other readers while we read.
  lFile := TFileStream.Create(aFileName, fmOpenRead Or fmShareDenyWrite);
  Try
    loadFromStream(lFile);
  Finally
    lFile.Free;
  End;
End;
// Replaces the current content with the content of aStream.
// The stream is read completely; an empty stream leaves the helper empty.
// Data without any line break ends up as one single line.
procedure TTextFileEncodingHelper.loadFromStream(aStream: TStream);
Var
  lRaw: RawByteString;
  lRawLen: Integer;
Begin
  Clear;
  LoadAllText(aStream, lRaw, lRawLen);
  If lRawLen <> 0 Then
  Begin
    If Not DetectUsedLineBreak(lRaw) Then
      fLines := [lRaw]
    Else
      SplitLines(lRaw);
  End;
end;
// Empties l and then appends every loaded line, each one converted to a
// Unicode string on its own by RawToUnicode.
Procedure TTextFileEncodingHelper.ConvertTo(l: TStringList);
Var
  lRawLine: RawByteString;
Begin
  l.Clear;
  For lRawLine In fLines Do
    l.Add(RawToUnicode(lRawLine));
End;
// Converts the raw bytes of one line into a Unicode string.
// The bytes are classified with DetectUTF8Encoding: UTF-8 data is decoded as
// UTF-8, pure ASCII and ANSI data goes through a plain String() cast.
Class Function TTextFileEncodingHelper.RawToUnicode(Const s: RawByteString): String;
Begin
  Case System.WideStrUtils.DetectUTF8Encoding(s) Of
    etUTF8:
      Result := Utf8ToString(s);
    etANSI, etUSASCII:
      Result := String(s);
  End;
End;
// Stores one line in fLines[aCount] and increments aCount.
//   aBody      - the complete raw text
//   index, len - 1-based start and byte length of the line inside aBody;
//                a len of 0 (or less) stores an empty line
//   aCount     - number of used entries in fLines, incremented here
//   aCapacity  - number of allocated entries in fLines, enlarged here when full
Procedure TTextFileEncodingHelper.AddLine(Const aBody: RawByteString; index, len: Integer; Var aCount, aCapacity: Integer);
Const
  cMinGrowCapacity = 16;
Begin
  // Grow by doubling when the array is full. Doubling alone never gets away
  // from a capacity of 0, so start from a small minimum in that case.
  If aCount >= aCapacity Then
  Begin
    If aCapacity > 0 Then
      aCapacity := aCapacity * 2
    Else
      aCapacity := cMinGrowCapacity;
    SetLength(fLines, aCapacity);
  End;

  If len > 0 Then
    fLines[aCount] := Copy(aBody, index, len)
  Else
    fLines[aCount] := '';
  Inc(aCount);
End;
// Reads the complete stream into Body, skipping a leading UTF-8 BOM.
//   aStream - source; always read from its beginning, not from the current position
//   Body    - receives the raw bytes (empty when there is nothing to read)
//   BodyLen - receives the number of bytes stored in Body
// Sets fUtf8BomFoundInFile when a BOM was found.
// Raises EStreamError when the stream does not fit into a string
// (more than High(Integer) bytes), and the usual read errors of ReadBuffer.
Procedure TTextFileEncodingHelper.LoadAllText(aStream: TStream; Out Body: RawByteString; Out BodyLen: Integer);
Const
  cUtf8BomLen = 3;
Var
  lSize: Int64;
  lStartPos: Int64;
Begin
  Body := '';
  BodyLen := 0;
  aStream.Position := 0;

  lSize := aStream.Size;
  If lSize <= 0 Then
    Exit;
  // BodyLen is an Integer; refuse data that would silently wrap around
  If lSize > High(Integer) Then
    Raise EStreamError.CreateFmt('Stream is too large to be loaded (%d bytes)', [lSize]);

  lStartPos := 0;
  If System.WideStrUtils.HasUTF8BOM(aStream) Then
  Begin
    fUtf8BomFoundInFile := True;
    lStartPos := cUtf8BomLen;
    Dec(lSize, cUtf8BomLen);
  End;

  // nothing but a BOM: Body[1] must not be touched on an empty string
  If lSize <= 0 Then
    Exit;

  BodyLen := Integer(lSize);
  SetLength(Body, BodyLen);
  // set the position explicitly instead of relying on where the BOM check left the stream
  aStream.Position := lStartPos;
  aStream.ReadBuffer(Body[1], BodyLen);
End;
// Resets the helper to its empty state: no lines, no detected line break
// and no BOM flag.
Procedure TTextFileEncodingHelper.Clear;
Begin
  SetLength(fLines, 0);
  fLineBreakChars := '';
  fUtf8BomFoundInFile := False;
End;
// Looks for the first line break in aBody and remembers it in fLineBreakChars.
// The break is the first #10 or #13 found; when the byte right after it is the
// other one of the two, both bytes together form the break (#13#10 or #10#13).
// Returns True when a break was found, False when aBody has none
// (fLineBreakChars is left untouched in that case).
Function TTextFileEncodingHelper.DetectUsedLineBreak(Const aBody: RawByteString): Boolean;
Var
  lPos, lLast: Integer;
  lFirst, lNext: AnsiChar;
Begin
  Result := False;
  lLast := Length(aBody);
  For lPos := 1 To lLast Do
  Begin
    lFirst := aBody[lPos];
    If Not (lFirst In [#10, #13]) Then
      Continue;

    fLineBreakChars := lFirst;
    // a second, different break byte directly behind it makes this a 2 byte line break
    If lPos < lLast Then
    Begin
      lNext := aBody[lPos + 1];
      If (lNext <> lFirst) And (lNext In [#10, #13]) Then
        fLineBreakChars := fLineBreakChars + lNext;
    End;
    Exit(True);
  End;
End;
// Getter of the count property: the number of lines currently held in fLines.
Function TTextFileEncodingHelper.GetCount: Integer;
Begin
  Exit(Length(fLines));
End;
// Splits aBody at every occurrence of fLineBreakChars and stores the pieces in fLines.
// fLineBreakChars must already be set (see DetectUsedLineBreak).
Procedure TTextFileEncodingHelper.SplitLines(Const aBody: RawByteString);
Var
  lUsed, lReserved: Integer;
Begin
  // Pre-allocate roughly one entry per 10 bytes; this is only a first guess,
  // the array grows on demand while lines are added.
  lUsed := 0;
  lReserved := 1 + (Length(aBody) Div 10);
  SetLength(fLines, lReserved);

  // a single break byte has its own worker because comparing one byte is cheaper
  Case Length(fLineBreakChars) Of
    1:
      SplitUsing1ByteLineBreak(aBody, lUsed, lReserved);
  Else
    SplitUsing2ByteLineBreak(aBody, lUsed, lReserved);
  End;

  // cut off the entries that were reserved but never used
  SetLength(fLines, lUsed);
End;
// Splits aBody at every byte equal to fLineBreakChars[1] and hands each piece to AddLine.
// Bytes after the last break form the final line; a break at the very end
// of aBody does not produce an extra empty line.
//   aCount, aCapacity - used/allocated entries of fLines, maintained by AddLine
Procedure TTextFileEncodingHelper.SplitUsing1ByteLineBreak(Const aBody: RawByteString; Var aCount, aCapacity: Integer);
Var
  lLineStart, lPos, lBodyLen: Integer;
  lBreak: AnsiChar;
Begin
  lBreak := fLineBreakChars[1];
  lBodyLen := Length(aBody);
  lLineStart := 1;

  For lPos := 1 To lBodyLen Do
    If aBody[lPos] = lBreak Then
    Begin
      // the line runs from lLineStart up to (not including) the break
      AddLine(aBody, lLineStart, lPos - lLineStart, aCount, aCapacity);
      lLineStart := lPos + 1;
    End;

  // whatever is left behind the last break
  If lLineStart <= lBodyLen Then
    AddLine(aBody, lLineStart, lBodyLen - lLineStart + 1, aCount, aCapacity);
End;
// Splits aBody at every occurrence of the 2 byte sequence in fLineBreakChars
// and hands each piece to AddLine.
// Bytes after the last break form the final line; a break at the very end
// of aBody does not produce an extra empty line.
//   aCount, aCapacity - used/allocated entries of fLines, maintained by AddLine
Procedure TTextFileEncodingHelper.SplitUsing2ByteLineBreak(Const aBody: RawByteString; Var aCount, aCapacity: Integer);
Var
  lLineStart, lPos, lBodyLen: Integer;
  lBreak1, lBreak2: AnsiChar;
Begin
  lBreak1 := fLineBreakChars[1];
  lBreak2 := fLineBreakChars[2];
  lBodyLen := Length(aBody);
  lLineStart := 1;

  // a 2 byte break needs two bytes, so the last position can never start one
  For lPos := 1 To lBodyLen - 1 Do
    If (aBody[lPos] = lBreak1) And (aBody[lPos + 1] = lBreak2) Then
    Begin
      // the line runs from lLineStart up to (not including) the break
      AddLine(aBody, lLineStart, lPos - lLineStart, aCount, aCapacity);
      lLineStart := lPos + 2;
    End;

  // whatever is left behind the last break
  If lLineStart <= lBodyLen Then
    AddLine(aBody, lLineStart, lBodyLen - lLineStart + 1, aCount, aCapacity);
End;
// Writes all lines to Utf8Stream encoded as UTF-8.
// Every line is converted with RawToUnicode first and is followed by
// aLineBreak (also UTF-8 encoded), the last line included.
// No BOM is written and the stream position is not changed beforehand.
Procedure TTextFileEncodingHelper.ConvertTo(Utf8Stream: TStream; Const aLineBreak: String);
Var
  lRawLine: RawByteString;
  lBreakBytes, lLineBytes: TBytes;
  lBreakLen, lLineLen: Integer;
Begin
  // the line break never changes, so encode it only once
  lBreakBytes := TEncoding.UTF8.GetBytes(aLineBreak);
  lBreakLen := Length(lBreakBytes);

  For lRawLine In fLines Do
  Begin
    lLineBytes := TEncoding.UTF8.GetBytes(RawToUnicode(lRawLine));
    lLineLen := Length(lLineBytes);
    // element [0] of an empty array must not be touched
    If lLineLen > 0 Then
      Utf8Stream.WriteBuffer(lLineBytes[0], lLineLen);
    If lBreakLen > 0 Then
      Utf8Stream.WriteBuffer(lBreakBytes[0], lBreakLen);
  End;
End;
End.