Skip to content

Commit 10e60f9

Browse files
committed
Fix HTML processing order
1 parent 8ae3a71 commit 10e60f9

File tree

1 file changed

+107
-112
lines changed

1 file changed

+107
-112
lines changed

httpsget.c

Lines changed: 107 additions & 112 deletions
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@
3939
#endif
4040

4141
uint8_t request[65536];
42-
uint8_t response[65536];
42+
uint8_t response[65536 * 8];
4343

4444
/*
4545
* Shift_JISの文字かどうかを判定
@@ -80,7 +80,7 @@ int is_block_element(const char *tag_start) {
8080
* 簡単なHTMLパース処理(Shift_JIS対応)
8181
* HTMLタグを除去してプレーンテキストを抽出
8282
*/
83-
void parse_html(char *html_content) {
83+
void parse_html(char *html_content, int sjis_mode) {
8484
unsigned char *src = (unsigned char*)html_content;
8585
unsigned char *dst = (unsigned char*)html_content;
8686
int in_tag = 0;
@@ -90,17 +90,29 @@ void parse_html(char *html_content) {
9090
unsigned char *tag_start = NULL;
9191

9292
while (*src) {
93-
// Shift_JISの2バイト文字の処理
94-
if (is_sjis_lead_byte(*src) && *(src + 1)) {
95-
if (!in_tag && !in_script && !in_style) {
96-
*dst++ = *src;
97-
*dst++ = *(src + 1);
98-
consecutive_spaces = 0;
93+
if (sjis_mode) {
94+
// Shift_JISの2バイト文字の処理
95+
if (is_sjis_lead_byte(*src) && *(src + 1)) {
96+
if (!in_tag && !in_script && !in_style) {
97+
*dst++ = *src;
98+
*dst++ = *(src + 1);
99+
consecutive_spaces = 0;
100+
}
101+
src += 2;
102+
continue;
103+
}
104+
} else {
105+
// UTF-8のマルチバイト文字の処理
106+
if (*src & 0x80) {
107+
if (!in_tag && !in_script && !in_style) {
108+
*dst++ = *src;
109+
consecutive_spaces = 0;
110+
}
111+
src++;
112+
continue;
99113
}
100-
src += 2;
101-
continue;
102114
}
103-
115+
104116
// スクリプトタグの検出
105117
if (!in_tag && strncmp((char*)src, "<script", 7) == 0) {
106118
in_script = 1;
@@ -151,69 +163,44 @@ void parse_html(char *html_content) {
151163
continue;
152164
}
153165

154-
// タグ内でない場合は文字をコピー
155-
if (!in_tag && !in_script && !in_style) {
156-
if (isspace(*src)) {
157-
// 連続する空白文字を1つのスペースに変換
158-
if (!consecutive_spaces) {
159-
*dst++ = ' ';
160-
consecutive_spaces = 1;
161-
}
162-
src++;
163-
} else {
164-
*dst++ = *src++;
165-
consecutive_spaces = 0;
166-
}
167-
} else {
166+
// タグ内はスキップ
167+
if (in_tag || in_script || in_style) {
168168
src++;
169+
continue;
169170
}
170-
}
171-
172-
*dst = '\0';
173-
}
174171

175-
/*
176-
* HTML実体参照を文字コードに変換
177-
*/
178-
void decode_html_entities(char *text) {
179-
// よく使われる実体参照の変換テーブル
180-
struct {
181-
const char *entity;
182-
int len;
183-
const char *replacement;
184-
int replacement_len;
185-
} entities[] = {
186-
{"&lt;", 4, "<", 1},
187-
{"&gt;", 4, ">", 1},
188-
{"&amp;", 5, "&", 1},
189-
{"&quot;", 6, "\"", 1},
190-
{"&apos;", 6, "'", 1},
191-
{"&laquo;", 7, "<<", 2},
192-
{"&raquo;", 7, ">>", 2},
193-
{"&nbsp;", 6, "\xC2\xA0", 2}, // 改行なしスペース (U+00A0)
194-
{"&copy;", 6, "\xC2\xA9", 2}, // copyright symbol (U+00A9)
195-
{"&reg;", 5, "\xC2\xAE", 2}, // registered trademark symbol (U+00AE)
196-
{"&trade;", 7, "\xE2\x84\xA2", 3}, // trademark symbol (U+2122)
197-
{"&yen;", 5, "\xC2\xA5", 2}, // yen sign (U+00A5)
198-
{"&euro;", 6, "\xE2\x82\xAC", 3}, // euro sign (U+20AC)
199-
{"&pound;", 7, "\xC2\xA3", 2}, // pound sign (U+00A3)
200-
{"&cent;", 6, "\xC2\xA2", 2}, // cent sign (U+00A2)
201-
{"&deg;", 5, "\xC2\xB0", 2}, // degree sign (U+00B0)
202-
{NULL, 0, NULL, 0}
203-
};
204-
205-
char *src = text;
206-
char *dst = text;
207-
208-
while (*src) {
209172
if (*src == '&') {
173+
// よく使われる実体参照の変換テーブル
174+
const struct {
175+
const char *entity;
176+
int len;
177+
const char *replacement;
178+
int replacement_len;
179+
} entities[] = {
180+
{"&lt;", 4, "<", 1},
181+
{"&gt;", 4, ">", 1},
182+
{"&amp;", 5, "&", 1},
183+
{"&quot;", 6, "\"", 1},
184+
{"&apos;", 6, "'", 1},
185+
{"&laquo;", 7, "<<", 2},
186+
{"&raquo;", 7, ">>", 2},
187+
{"&nbsp;", 6, " ", 1}, // 改行なしスペース
188+
{"&copy;", 6, "\xC2\xA9", 2}, // copyright symbol (U+00A9)
189+
{"&reg;", 5, "\xC2\xAE", 2}, // registered trademark symbol (U+00AE)
190+
{"&trade;", 7, "\xE2\x84\xA2", 3}, // trademark symbol (U+2122)
191+
{"&yen;", 5, "\xC2\xA5", 2}, // yen sign (U+00A5)
192+
{"&euro;", 6, "\xE2\x82\xAC", 3}, // euro sign (U+20AC)
193+
{"&pound;", 7, "\xC2\xA3", 2}, // pound sign (U+00A3)
194+
{"&cent;", 6, "\xC2\xA2", 2}, // cent sign (U+00A2)
195+
{"&deg;", 5, "\xC2\xB0", 2}, // degree sign (U+00B0)
196+
{NULL, 0, NULL, 0}
197+
};
198+
210199
// よく使われる実体参照を変換
211200
int found = 0;
212-
int i;
213-
for (i = 0; entities[i].entity != NULL; i++) {
214-
if (strncmp(src, entities[i].entity, entities[i].len) == 0) {
215-
int j;
216-
for (j = 0; j < entities[i].replacement_len; j++) {
201+
for (int i = 0; entities[i].entity != NULL; i++) {
202+
if (strncmp((char *)src, entities[i].entity, entities[i].len) == 0) {
203+
for (int j = 0; j < entities[i].replacement_len; j++) {
217204
*dst++ = entities[i].replacement[j];
218205
}
219206
src += entities[i].len;
@@ -231,12 +218,12 @@ void decode_html_entities(char *text) {
231218
unsigned long code;
232219
if (src[2] == 'x' || src[2] == 'X') {
233220
// 16進数
234-
code = strtoul(src + 3, &end_ptr, 16);
221+
code = strtoul((char *)src + 3, &end_ptr, 16);
235222
} else {
236223
// 10進数
237-
code = strtoul(src + 2, &end_ptr, 10);
224+
code = strtoul((char *)src + 2, &end_ptr, 10);
238225
}
239-
226+
240227
if (*end_ptr == ';' && code > 0) {
241228
// UTF-8文字をバイト列に変換
242229
if (code < 0x80) {
@@ -261,7 +248,7 @@ void decode_html_entities(char *text) {
261248
// 無効なコードポイント
262249
*dst++ = '?';
263250
}
264-
src = end_ptr + 1;
251+
src = (uint8_t *)end_ptr + 1;
265252
} else {
266253
// 無効な数値文字参照はそのまま
267254
*dst++ = *src++;
@@ -271,10 +258,20 @@ void decode_html_entities(char *text) {
271258
*dst++ = *src++;
272259
}
273260
} else {
274-
*dst++ = *src++;
261+
if (isspace(*src)) {
262+
// 連続する空白文字を1つのスペースに変換
263+
if (!consecutive_spaces) {
264+
*dst++ = ' ';
265+
consecutive_spaces = 1;
266+
}
267+
src++;
268+
} else {
269+
*dst++ = *src++;
270+
consecutive_spaces = 0;
271+
}
275272
}
276273
}
277-
274+
278275
*dst = '\0';
279276
}
280277

@@ -477,14 +474,9 @@ int main(int argc, char **argv)
477474
printf("Receiving HTTP response");
478475

479476
// HTTPレスポンスを受信してバッファに蓄積
480-
memset(response, 0, sizeof(response));
481-
482-
uint8_t *buffer;
483-
size_t buffer_size;
484-
485-
buffer = malloc(1);
486-
*buffer = 0;
487-
buffer_size = 1;
477+
uint8_t *buffer = response;
478+
size_t buffer_size = 0;
479+
size_t buffer_remain = sizeof(response) - 1;
488480

489481
int res;
490482
while (1) {
@@ -509,18 +501,23 @@ int main(int argc, char **argv)
509501
}
510502

511503
if (res > 0) {
512-
buffer_size += res;
513-
buffer = realloc(buffer, buffer_size);
514-
strcat((char *)buffer, (char *)ptr);
504+
size_t size = buffer_remain < res ? buffer_remain : res;
505+
memcpy(buffer, ptr, size);
506+
buffer += size;
507+
buffer_remain -= size;
508+
buffer_size += size;
509+
if (buffer_remain == 0) {
510+
break;
511+
}
515512
}
516513
putchar('.');
517514
fflush(stdout);
518515
}
519-
printf("\n\n");
516+
printf("\nReceived %lu bytes\n\n", buffer_size);
517+
*buffer = '\0';
520518

521519
if (res != SSL_CLOSE_NOTIFY && res != SSL_ERROR_CONN_LOST) {
522520
printf("ssl_read() failed: %d\n", res);
523-
free(buffer);
524521
if (!http_mode) {
525522
ssl_free(ssl_sock);
526523
ssl_ctx_free(ssl_ctx);
@@ -535,47 +532,45 @@ int main(int argc, char **argv)
535532
}
536533
close(client_fd);
537534

535+
// HTTPレスポンスからHTMLボディを抽出
536+
char *html_body = extract_html_body((char *)response);
537+
if (show_header) {
538+
printf("HTML Header:\n");
539+
printf("--------------------\n");
540+
fwrite(response, 1, html_body - (char *)response, stdout);
541+
printf("--------------------\n\n");
542+
}
543+
544+
// HTMLのパースと表示
538545
if (!raw_mode) {
539-
// HTML実体参照をデコード
540-
decode_html_entities((char *)buffer);
546+
parse_html(html_body, sjis_mode);
547+
size_t html_body_len = strlen(html_body);
541548

542-
uint8_t *sbuffer = malloc(buffer_size);
543549
if (!sjis_mode) {
544-
uint8_t *inbuf = buffer;
545-
size_t inbytesleft = buffer_size;
550+
uint8_t *sbuffer = malloc(html_body_len);
551+
552+
uint8_t *inbuf = (uint8_t *)html_body;
553+
size_t inbytesleft = html_body_len;
546554
uint8_t *outbuf = sbuffer;
547-
size_t outbytesleft = buffer_size;
555+
size_t outbytesleft = html_body_len;
548556

549557
do {
550558
int res = iconv_u2s((char **)&inbuf, &inbytesleft, (char **)&outbuf, &outbytesleft);
551559
if (res < 0) {
552560
inbuf++;
553561
inbytesleft--;
554562
}
555-
} while (inbuf < buffer + buffer_size);
563+
} while ((char *)inbuf < html_body + html_body_len);
556564
*outbuf = 0;
557-
} else {
558-
memcpy(sbuffer, buffer, buffer_size);
559-
}
560565

561-
char *html_body = extract_html_body((char *)sbuffer);
562-
563-
parse_html(html_body);
564-
565-
if (show_header) {
566-
printf("--------------------\n");
567-
fwrite(sbuffer, 1, html_body - (char *)sbuffer, stdout);
568-
printf("--------------------\n\n");
566+
printf("%s", sbuffer);
567+
free(sbuffer);
568+
} else {
569+
printf("%s", html_body);
569570
}
570-
571-
printf("%s", html_body);
572-
573-
free(sbuffer);
574571
} else {
575-
printf("%s", buffer);
572+
printf("%s", html_body);
576573
}
577574

578-
free(buffer);
579-
580575
return 0;
581576
}

0 commit comments

Comments
 (0)