Skip to content

Commit 639098a

Browse files
committed
Improve html entity support
1 parent 27faec1 commit 639098a

1 file changed

Lines changed: 130 additions & 20 deletions

File tree

httpsget.c

Lines changed: 130 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -126,40 +126,145 @@ void parse_html(char *html_content) {
126126

127127
// タグ内でない場合は文字をコピー
128128
if (!in_tag && !in_script && !in_style) {
129-
// HTMLエンティティの簡単な変換
130-
if (strncmp((char*)src, "&lt;", 4) == 0) {
129+
if (isspace(*src)) {
130+
// 連続する空白文字を1つのスペースに変換
131+
if (!consecutive_spaces) {
132+
*dst++ = ' ';
133+
consecutive_spaces = 1;
134+
}
135+
src++;
136+
} else {
137+
*dst++ = *src++;
138+
consecutive_spaces = 0;
139+
}
140+
} else {
141+
src++;
142+
}
143+
}
144+
145+
*dst = '\0';
146+
}
147+
148+
/*
149+
* HTML実体参照を文字コードに変換
150+
*/
151+
void decode_html_entities(char *text) {
152+
char *src = text;
153+
char *dst = text;
154+
155+
while (*src) {
156+
if (*src == '&') {
157+
// よく使われる実体参照を変換
158+
if (strncmp(src, "&lt;", 4) == 0) {
131159
*dst++ = '<';
132160
src += 4;
133-
consecutive_spaces = 0;
134-
} else if (strncmp((char*)src, "&gt;", 4) == 0) {
161+
} else if (strncmp(src, "&gt;", 4) == 0) {
135162
*dst++ = '>';
136163
src += 4;
137-
consecutive_spaces = 0;
138-
} else if (strncmp((char*)src, "&amp;", 5) == 0) {
164+
} else if (strncmp(src, "&amp;", 5) == 0) {
139165
*dst++ = '&';
140166
src += 5;
141-
consecutive_spaces = 0;
142-
} else if (strncmp((char*)src, "&quot;", 6) == 0) {
167+
} else if (strncmp(src, "&quot;", 6) == 0) {
143168
*dst++ = '"';
144169
src += 6;
145-
consecutive_spaces = 0;
146-
} else if (strncmp((char*)src, "&nbsp;", 6) == 0) {
147-
*dst++ = ' ';
170+
} else if (strncmp(src, "&apos;", 6) == 0) {
171+
*dst++ = '\'';
148172
src += 6;
149-
consecutive_spaces = 1;
150-
} else if (isspace(*src)) {
151-
// 連続する空白文字を1つのスペースに変換
152-
if (!consecutive_spaces) {
153-
*dst++ = ' ';
154-
consecutive_spaces = 1;
173+
} else if (strncmp(src, "&nbsp;", 6) == 0) {
174+
// 改行なしスペース (U+00A0) -> UTF-8: 0xC2 0xA0
175+
*dst++ = 0xC2;
176+
*dst++ = 0xA0;
177+
src += 6;
178+
} else if (strncmp(src, "&copy;", 6) == 0) {
179+
// copyright symbol (U+00A9) -> UTF-8: 0xC2 0xA9
180+
*dst++ = 0xC2;
181+
*dst++ = 0xA9;
182+
src += 6;
183+
} else if (strncmp(src, "&reg;", 5) == 0) {
184+
// registered trademark symbol (U+00AE) -> UTF-8: 0xC2 0xAE
185+
*dst++ = 0xC2;
186+
*dst++ = 0xAE;
187+
src += 5;
188+
} else if (strncmp(src, "&trade;", 7) == 0) {
189+
// trademark symbol (U+2122) -> UTF-8: 0xE2 0x84 0xA2
190+
*dst++ = 0xE2;
191+
*dst++ = 0x84;
192+
*dst++ = 0xA2;
193+
src += 7;
194+
} else if (strncmp(src, "&yen;", 5) == 0) {
195+
// yen sign (U+00A5) -> UTF-8: 0xC2 0xA5
196+
*dst++ = 0xC2;
197+
*dst++ = 0xA5;
198+
src += 5;
199+
} else if (strncmp(src, "&euro;", 6) == 0) {
200+
// euro sign (U+20AC) -> UTF-8: 0xE2 0x82 0xAC
201+
*dst++ = 0xE2;
202+
*dst++ = 0x82;
203+
*dst++ = 0xAC;
204+
src += 6;
205+
} else if (strncmp(src, "&pound;", 7) == 0) {
206+
// pound sign (U+00A3) -> UTF-8: 0xC2 0xA3
207+
*dst++ = 0xC2;
208+
*dst++ = 0xA3;
209+
src += 7;
210+
} else if (strncmp(src, "&cent;", 6) == 0) {
211+
// cent sign (U+00A2) -> UTF-8: 0xC2 0xA2
212+
*dst++ = 0xC2;
213+
*dst++ = 0xA2;
214+
src += 6;
215+
} else if (strncmp(src, "&deg;", 5) == 0) {
216+
// degree sign (U+00B0) -> UTF-8: 0xC2 0xB0
217+
*dst++ = 0xC2;
218+
*dst++ = 0xB0;
219+
src += 5;
220+
src += 7;
221+
} else if (src[1] == '#') {
222+
// 数値文字参照 &#nnn; または &#xhex;
223+
char *end_ptr;
224+
unsigned long code;
225+
if (src[2] == 'x' || src[2] == 'X') {
226+
// 16進数
227+
code = strtoul(src + 3, &end_ptr, 16);
228+
} else {
229+
// 10進数
230+
code = strtoul(src + 2, &end_ptr, 10);
231+
}
232+
233+
if (*end_ptr == ';' && code > 0) {
234+
// UTF-8文字をバイト列に変換
235+
if (code < 0x80) {
236+
// 1バイト文字 (ASCII)
237+
*dst++ = (char)code;
238+
} else if (code < 0x800) {
239+
// 2バイト文字
240+
*dst++ = (char)(0xC0 | (code >> 6));
241+
*dst++ = (char)(0x80 | (code & 0x3F));
242+
} else if (code < 0x10000) {
243+
// 3バイト文字
244+
*dst++ = (char)(0xE0 | (code >> 12));
245+
*dst++ = (char)(0x80 | ((code >> 6) & 0x3F));
246+
*dst++ = (char)(0x80 | (code & 0x3F));
247+
} else if (code < 0x110000) {
248+
// 4バイト文字
249+
*dst++ = (char)(0xF0 | (code >> 18));
250+
*dst++ = (char)(0x80 | ((code >> 12) & 0x3F));
251+
*dst++ = (char)(0x80 | ((code >> 6) & 0x3F));
252+
*dst++ = (char)(0x80 | (code & 0x3F));
253+
} else {
254+
// 無効なコードポイント
255+
*dst++ = '?';
256+
}
257+
src = end_ptr + 1;
258+
} else {
259+
// 無効な数値文字参照はそのまま
260+
*dst++ = *src++;
155261
}
156-
src++;
157262
} else {
263+
// 未知の実体参照はそのまま
158264
*dst++ = *src++;
159-
consecutive_spaces = 0;
160265
}
161266
} else {
162-
src++;
267+
*dst++ = *src++;
163268
}
164269
}
165270

@@ -334,6 +439,11 @@ int main(int argc, char **argv)
334439

335440
char *html_body = extract_html_body((char *)sbuffer);
336441

442+
// HTML実体参照をデコード
443+
decode_html_entities((char *)sbuffer);
444+
445+
fwrite(sbuffer, 1, html_body - (char *)sbuffer, stdout);
446+
337447
parse_html(html_body);
338448
printf("%s", html_body);
339449

0 commit comments

Comments
 (0)