forked from livecode/livecode
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfoundation-text.h
More file actions
231 lines (163 loc) · 6.11 KB
/
foundation-text.h
File metadata and controls
231 lines (163 loc) · 6.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/* Copyright (C) 2015 LiveCode Ltd.
This file is part of LiveCode.
LiveCode is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License v3 as published by the Free
Software Foundation.
LiveCode is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with LiveCode. If not see <http://www.gnu.org/licenses/>. */
#ifndef __MC_FOUNDATION_TEXT_
#define __MC_FOUNDATION_TEXT_
#include "foundation.h"
#include "foundation-unicode.h"
#include "foundation-locale.h"
// Abstract base class for one stage in a chain of text filters. Concrete
// filters implement the three pure-virtual reading operations below; the
// base class keeps the links to the neighbouring filters in the chain.
class MCTextFilter
{
public:
    // Returns the next codepoint
    virtual codepoint_t GetNextCodepoint() = 0;

    // Advances the read cursor or returns false if no more data remains
    virtual bool AdvanceCursor() = 0;

    // Returns true if there is still data to be read
    virtual bool HasData() const = 0;

    // Marks the codepoints read up to this point as being accepted (this is
    // used by comparisons to find the range of indices that match). Filters
    // that have multiple codepoints of state should only mark on boundaries.
    virtual void MarkText();

    // Returns the index into the underlying data that has been accepted
    virtual uindex_t GetMarkedLength() const;

    // Destructor also destroys all connected filters (i.e. the entire chain is
    // destroyed at once) - this is to simplify filter chain management.
    virtual ~MCTextFilter();

    // Filter chaining. Not all filters stack fully: encoding filters can't
    // have anything added after them and decoding filters can't have anything
    // added before them. Subclasses override these to refuse a placement;
    // NOTE(review): a false return presumably means the chain was left
    // unchanged - confirm against the implementation.
    virtual bool PlaceBefore(MCTextFilter* p_filter);
    virtual bool PlaceAfter(MCTextFilter* p_filter);

    // Chain information
    MCTextFilter *NextFilter() const;
    MCTextFilter *PrevFilter() const;

protected:
    // Only subclasses may construct a filter
    MCTextFilter();

private:
    // Copying is disabled (declared but intentionally never defined). A
    // member-wise copy would duplicate the raw chain pointers, and since the
    // destructor destroys the entire chain, the original and the copy would
    // both try to destroy the same filters.
    MCTextFilter(const MCTextFilter&);
    MCTextFilter& operator=(const MCTextFilter&);

    // Filter chain
    MCTextFilter *m_Next, *m_Prev;
};
// Common base for encoding filters, which occupy the final position in a
// filter chain.
class MCTextFilter_Encoder : public MCTextFilter
{
public:
    // An encoder has to be the last filter in the chain, so a request to
    // place it before another filter always fails.
    virtual bool PlaceBefore(MCTextFilter *p_filter);
};
// Common base for decoding filters, which occupy the first position in a
// filter chain.
class MCTextFilter_Decoder : public MCTextFilter
{
public:
    // A decoder has to be the first filter in the chain, so a request to
    // place it after another filter always fails.
    virtual bool PlaceAfter(MCTextFilter *p_filter);
};
// Filter built around a break iterator. As declared here it overrides none
// of MCTextFilter's pure-virtual methods, so it is still an abstract class.
class MCTextFilter_Breaker : public MCTextFilter
{
private:
    // The break iterator this filter works with
    MCBreakIteratorRef m_BreakIterator;
};
// Decoding filter that reads codepoints from a buffer of unichar_t code
// units.
class MCTextFilter_DecodeUTF16 : public MCTextFilter_Decoder
{
public:
    // Construction / destruction.
    // NOTE(review): the arguments presumably supply m_Data, m_DataLength and
    // m_Reverse respectively - confirm against the implementation.
    MCTextFilter_DecodeUTF16(const unichar_t *, uindex_t, bool);
    ~MCTextFilter_DecodeUTF16();

    // Overrides of the MCTextFilter interface
    virtual codepoint_t GetNextCodepoint();
    virtual bool AdvanceCursor();
    virtual bool HasData() const;
    virtual void MarkText();
    virtual uindex_t GetMarkedLength() const;

private:
    // Set when the cursor has to move on by 2 code units rather than 1
    bool m_surrogate;

    // Index of the accepted position within the code units
    uindex_t m_AcceptedIndex;
    // Index of the reading position within the code units
    uindex_t m_ReadIndex;

    // The text being decoded and its length
    const unichar_t *m_Data;
    uindex_t m_DataLength;

    // True when reading backwards (used for things like shared suffix)
    bool m_Reverse;
};
// Decoding filter that reads codepoints from a buffer of native char_t
// characters.
class MCTextFilter_DecodeNative : public MCTextFilter_Decoder
{
public:
    // Construction / destruction.
    // NOTE(review): the arguments presumably supply m_Data, m_DataLength and
    // m_Reverse respectively - confirm against the implementation.
    MCTextFilter_DecodeNative(const char_t *, uindex_t, bool);
    ~MCTextFilter_DecodeNative();

    // Overrides of the MCTextFilter interface
    virtual codepoint_t GetNextCodepoint();
    virtual bool AdvanceCursor();
    virtual bool HasData() const;
    virtual void MarkText();
    virtual uindex_t GetMarkedLength() const;

private:
    // The text being decoded and its length
    const char_t *m_Data;
    uindex_t m_DataLength;

    // Index of the accepted position within the code units
    uindex_t m_AcceptedIndex;
    // Index of the reading position within the code units
    uindex_t m_ReadIndex;

    // True when reading backwards (used for things like shared suffix)
    bool m_Reverse;
};
// Encoding filter with UTF-16 surrogate state. As declared here it overrides
// none of MCTextFilter's pure-virtual methods, so it is still an abstract
// class.
class MCTextFilter_EncodeUTF16 : public MCTextFilter_Encoder
{
private:
    // Pending trailing surrogate: when non-zero, it is what gets emitted next
    unichar_t m_TrailSurrogate;
};
// Filter that applies simple case folding. It overrides only the reading
// operations; MarkText and GetMarkedLength are inherited from MCTextFilter.
class MCTextFilter_SimpleCaseFold : public MCTextFilter
{
public:
    // Construction / destruction
    MCTextFilter_SimpleCaseFold();
    ~MCTextFilter_SimpleCaseFold();

    // Overrides of the MCTextFilter interface
    virtual codepoint_t GetNextCodepoint();
    virtual bool AdvanceCursor();
    virtual bool HasData() const;

private:
    // No state is kept here: the "simple" case folding rules map every
    // character to exactly one character (so there is no
    // sharp-s -> SS -> ss expansion to track).
};
// Filter that normalizes the text passing through it to NFC, buffering a
// bounded amount of state in order to do so.
class MCTextFilter_NormalizeNFC : public MCTextFilter
{
public:
    // Inherited from MCTextFilter
    virtual codepoint_t GetNextCodepoint();
    virtual bool AdvanceCursor();
    virtual bool HasData() const;
    virtual void MarkText();
    virtual uindex_t GetMarkedLength() const;

    // Marked explicit so that a bare bool can never be implicitly converted
    // into a filter object.
    // NOTE(review): the bool presumably initialises m_Reverse - confirm
    // against the implementation.
    explicit MCTextFilter_NormalizeNFC(bool);
    ~MCTextFilter_NormalizeNFC();

private:
    // The amount of context needed for normalisation is potentially unbounded.
    // To avoid problems with this, we implement the same fudge as libICU:
    // arbitrarily limit the length of normalisable sequences to 256 codepoints.
    // NOTE(review): m_State is an array of unichar_t, so the limit is
    // presumably counted in code units rather than codepoints - confirm.
    enum { kMCTextFilterMaxNormLength = 256 };
    unichar_t m_State[kMCTextFilterMaxNormLength];

    // The length of state currently stored
    uindex_t m_StateLength;

    // Cursor
    uindex_t m_ReadIndex;

    // Marked length
    uindex_t m_MarkedLength;
    uindex_t m_MarkPoint;

    // NOTE(review): presumably plays the same role as m_surrogate in
    // MCTextFilter_DecodeUTF16 (advance by 2 code units) - confirm.
    bool m_surrogate;

    // Going backwards, for things like shared suffix
    bool m_Reverse;

    // Reverse-direction counterpart of GetNextCodepoint
    // NOTE(review): presumably used when m_Reverse is set - confirm.
    codepoint_t GetNextCodepointReverse();
};
// Factory helpers that assemble filter chains. The caller is responsible for
// releasing each returned object by calling 'delete' on it.

// Overload taking a string reference
MCTextFilter *MCTextFilterCreate(MCStringRef, MCStringOptions);

// Overload taking a data reference together with its encoding
MCTextFilter *MCTextFilterCreate(MCDataRef, MCStringEncoding, MCStringOptions);

// Overload taking a raw buffer, a uindex_t and an encoding; from_end defaults
// to false.
// NOTE(review): the uindex_t is presumably the buffer length and from_end
// presumably selects reading from the end of the data - confirm.
MCTextFilter *MCTextFilterCreate(const void *,
                                 uindex_t,
                                 MCStringEncoding,
                                 MCStringOptions,
                                 bool from_end = false);
#endif