Skip to content

Commit e2255ba

Browse files
committed
Fix html entity conversion
1 parent e56a995 commit e2255ba

File tree

1 file changed

+44
-64
lines changed

1 file changed

+44
-64
lines changed

httpsget.c

Lines changed: 44 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -152,76 +152,56 @@ void parse_html(char *html_content) {
152152
* HTML実体参照を文字コードに変換
153153
*/
154154
void decode_html_entities(char *text) {
155+
// よく使われる実体参照の変換テーブル
156+
struct {
157+
const char *entity;
158+
int len;
159+
const char *replacement;
160+
int replacement_len;
161+
} entities[] = {
162+
{"&lt;", 4, "<", 1},
163+
{"&gt;", 4, ">", 1},
164+
{"&amp;", 5, "&", 1},
165+
{"&quot;", 6, "\"", 1},
166+
{"&apos;", 6, "'", 1},
167+
{"&laquo;", 7, "<<", 2},
168+
{"&raquo;", 7, ">>", 2},
169+
{"&nbsp;", 6, "\xC2\xA0", 2}, // 改行なしスペース (U+00A0)
170+
{"&copy;", 6, "\xC2\xA9", 2}, // copyright symbol (U+00A9)
171+
{"&reg;", 5, "\xC2\xAE", 2}, // registered trademark symbol (U+00AE)
172+
{"&trade;", 7, "\xE2\x84\xA2", 3}, // trademark symbol (U+2122)
173+
{"&yen;", 5, "\xC2\xA5", 2}, // yen sign (U+00A5)
174+
{"&euro;", 6, "\xE2\x82\xAC", 3}, // euro sign (U+20AC)
175+
{"&pound;", 7, "\xC2\xA3", 2}, // pound sign (U+00A3)
176+
{"&cent;", 6, "\xC2\xA2", 2}, // cent sign (U+00A2)
177+
{"&deg;", 5, "\xC2\xB0", 2}, // degree sign (U+00B0)
178+
{NULL, 0, NULL, 0}
179+
};
180+
155181
char *src = text;
156182
char *dst = text;
157183

158184
while (*src) {
159185
if (*src == '&') {
160186
// よく使われる実体参照を変換
161-
if (strncmp(src, "&lt;", 4) == 0) {
162-
*dst++ = '<';
163-
src += 4;
164-
} else if (strncmp(src, "&gt;", 4) == 0) {
165-
*dst++ = '>';
166-
src += 4;
167-
} else if (strncmp(src, "&amp;", 5) == 0) {
168-
*dst++ = '&';
169-
src += 5;
170-
} else if (strncmp(src, "&quot;", 6) == 0) {
171-
*dst++ = '"';
172-
src += 6;
173-
} else if (strncmp(src, "&apos;", 6) == 0) {
174-
*dst++ = '\'';
175-
src += 6;
176-
} else if (strncmp(src, "&nbsp;", 6) == 0) {
177-
// 改行なしスペース (U+00A0) -> UTF-8: 0xC2 0xA0
178-
*dst++ = 0xC2;
179-
*dst++ = 0xA0;
180-
src += 6;
181-
} else if (strncmp(src, "&copy;", 6) == 0) {
182-
// copyright symbol (U+00A9) -> UTF-8: 0xC2 0xA9
183-
*dst++ = 0xC2;
184-
*dst++ = 0xA9;
185-
src += 6;
186-
} else if (strncmp(src, "&reg;", 5) == 0) {
187-
// registered trademark symbol (U+00AE) -> UTF-8: 0xC2 0xAE
188-
*dst++ = 0xC2;
189-
*dst++ = 0xAE;
190-
src += 5;
191-
} else if (strncmp(src, "&trade;", 7) == 0) {
192-
// trademark symbol (U+2122) -> UTF-8: 0xE2 0x84 0xA2
193-
*dst++ = 0xE2;
194-
*dst++ = 0x84;
195-
*dst++ = 0xA2;
196-
src += 7;
197-
} else if (strncmp(src, "&yen;", 5) == 0) {
198-
// yen sign (U+00A5) -> UTF-8: 0xC2 0xA5
199-
*dst++ = 0xC2;
200-
*dst++ = 0xA5;
201-
src += 5;
202-
} else if (strncmp(src, "&euro;", 6) == 0) {
203-
// euro sign (U+20AC) -> UTF-8: 0xE2 0x82 0xAC
204-
*dst++ = 0xE2;
205-
*dst++ = 0x82;
206-
*dst++ = 0xAC;
207-
src += 6;
208-
} else if (strncmp(src, "&pound;", 7) == 0) {
209-
// pound sign (U+00A3) -> UTF-8: 0xC2 0xA3
210-
*dst++ = 0xC2;
211-
*dst++ = 0xA3;
212-
src += 7;
213-
} else if (strncmp(src, "&cent;", 6) == 0) {
214-
// cent sign (U+00A2) -> UTF-8: 0xC2 0xA2
215-
*dst++ = 0xC2;
216-
*dst++ = 0xA2;
217-
src += 6;
218-
} else if (strncmp(src, "&deg;", 5) == 0) {
219-
// degree sign (U+00B0) -> UTF-8: 0xC2 0xB0
220-
*dst++ = 0xC2;
221-
*dst++ = 0xB0;
222-
src += 5;
223-
src += 7;
224-
} else if (src[1] == '#') {
187+
int found = 0;
188+
int i;
189+
for (i = 0; entities[i].entity != NULL; i++) {
190+
if (strncmp(src, entities[i].entity, entities[i].len) == 0) {
191+
int j;
192+
for (j = 0; j < entities[i].replacement_len; j++) {
193+
*dst++ = entities[i].replacement[j];
194+
}
195+
src += entities[i].len;
196+
found = 1;
197+
break;
198+
}
199+
}
200+
if (found) {
201+
continue;
202+
}
203+
204+
if (src[1] == '#') {
225205
// 数値文字参照 &#nnn; または &#xhex;
226206
char *end_ptr;
227207
unsigned long code;

0 commit comments

Comments
 (0)