3939#endif
4040
4141uint8_t request [65536 ];
42- uint8_t response [65536 ];
42+ uint8_t response [65536 * 8 ];
4343
4444/*
4545 * Shift_JISの文字かどうかを判定
@@ -80,7 +80,7 @@ int is_block_element(const char *tag_start) {
8080 * 簡単なHTMLパース処理(Shift_JIS対応)
8181 * HTMLタグを除去してプレーンテキストを抽出
8282 */
83- void parse_html (char * html_content ) {
83+ void parse_html (char * html_content , int sjis_mode ) {
8484 unsigned char * src = (unsigned char * )html_content ;
8585 unsigned char * dst = (unsigned char * )html_content ;
8686 int in_tag = 0 ;
@@ -90,17 +90,29 @@ void parse_html(char *html_content) {
9090 unsigned char * tag_start = NULL ;
9191
9292 while (* src ) {
93- // Shift_JISの2バイト文字の処理
94- if (is_sjis_lead_byte (* src ) && * (src + 1 )) {
95- if (!in_tag && !in_script && !in_style ) {
96- * dst ++ = * src ;
97- * dst ++ = * (src + 1 );
98- consecutive_spaces = 0 ;
93+ if (sjis_mode ) {
94+ // Shift_JISの2バイト文字の処理
95+ if (is_sjis_lead_byte (* src ) && * (src + 1 )) {
96+ if (!in_tag && !in_script && !in_style ) {
97+ * dst ++ = * src ;
98+ * dst ++ = * (src + 1 );
99+ consecutive_spaces = 0 ;
100+ }
101+ src += 2 ;
102+ continue ;
103+ }
104+ } else {
105+ // UTF-8のマルチバイト文字の処理
106+ if (* src & 0x80 ) {
107+ if (!in_tag && !in_script && !in_style ) {
108+ * dst ++ = * src ;
109+ consecutive_spaces = 0 ;
110+ }
111+ src ++ ;
112+ continue ;
99113 }
100- src += 2 ;
101- continue ;
102114 }
103-
115+
104116 // スクリプトタグの検出
105117 if (!in_tag && strncmp ((char * )src , "<script" , 7 ) == 0 ) {
106118 in_script = 1 ;
@@ -151,69 +163,44 @@ void parse_html(char *html_content) {
151163 continue ;
152164 }
153165
154- // タグ内でない場合は文字をコピー
155- if (!in_tag && !in_script && !in_style ) {
156- if (isspace (* src )) {
157- // 連続する空白文字を1つのスペースに変換
158- if (!consecutive_spaces ) {
159- * dst ++ = ' ' ;
160- consecutive_spaces = 1 ;
161- }
162- src ++ ;
163- } else {
164- * dst ++ = * src ++ ;
165- consecutive_spaces = 0 ;
166- }
167- } else {
166+ // タグ内はスキップ
167+ if (in_tag || in_script || in_style ) {
168168 src ++ ;
169+ continue ;
169170 }
170- }
171-
172- * dst = '\0' ;
173- }
174171
175- /*
176- * HTML実体参照を文字コードに変換
177- */
178- void decode_html_entities (char * text ) {
179- // よく使われる実体参照の変換テーブル
180- struct {
181- const char * entity ;
182- int len ;
183- const char * replacement ;
184- int replacement_len ;
185- } entities [] = {
186- {"&lt;" , 4 , "<" , 1 },
187- {"&gt;" , 4 , ">" , 1 },
188- {"&amp;" , 5 , "&" , 1 },
189- {"&quot;" , 6 , "\"" , 1 },
190- {"&apos;" , 6 , "'" , 1 },
191- {"&laquo;" , 7 , "<<" , 2 },
192- {"&raquo;" , 7 , ">>" , 2 },
193- {"&nbsp;" , 6 , "\xC2\xA0" , 2 }, // 改行なしスペース (U+00A0)
194- {"&copy;" , 6 , "\xC2\xA9" , 2 }, // copyright symbol (U+00A9)
195- {"&reg;" , 5 , "\xC2\xAE" , 2 }, // registered trademark symbol (U+00AE)
196- {"&trade;" , 7 , "\xE2\x84\xA2" , 3 }, // trademark symbol (U+2122)
197- {"&yen;" , 5 , "\xC2\xA5" , 2 }, // yen sign (U+00A5)
198- {"&euro;" , 6 , "\xE2\x82\xAC" , 3 }, // euro sign (U+20AC)
199- {"&pound;" , 7 , "\xC2\xA3" , 2 }, // pound sign (U+00A3)
200- {"&cent;" , 6 , "\xC2\xA2" , 2 }, // cent sign (U+00A2)
201- {"&deg;" , 5 , "\xC2\xB0" , 2 }, // degree sign (U+00B0)
202- {NULL , 0 , NULL , 0 }
203- };
204-
205- char * src = text ;
206- char * dst = text ;
207-
208- while (* src ) {
209172 if (* src == '&' ) {
173+ // よく使われる実体参照の変換テーブル
174+ const struct {
175+ const char * entity ;
176+ int len ;
177+ const char * replacement ;
178+ int replacement_len ;
179+ } entities [] = {
180+ {"&lt;" , 4 , "<" , 1 },
181+ {"&gt;" , 4 , ">" , 1 },
182+ {"&amp;" , 5 , "&" , 1 },
183+ {"&quot;" , 6 , "\"" , 1 },
184+ {"&apos;" , 6 , "'" , 1 },
185+ {"&laquo;" , 7 , "<<" , 2 },
186+ {"&raquo;" , 7 , ">>" , 2 },
187+ {"&nbsp;" , 6 , " " , 1 }, // 改行なしスペース
188+ {"&copy;" , 6 , "\xC2\xA9" , 2 }, // copyright symbol (U+00A9)
189+ {"&reg;" , 5 , "\xC2\xAE" , 2 }, // registered trademark symbol (U+00AE)
190+ {"&trade;" , 7 , "\xE2\x84\xA2" , 3 }, // trademark symbol (U+2122)
191+ {"&yen;" , 5 , "\xC2\xA5" , 2 }, // yen sign (U+00A5)
192+ {"&euro;" , 6 , "\xE2\x82\xAC" , 3 }, // euro sign (U+20AC)
193+ {"&pound;" , 7 , "\xC2\xA3" , 2 }, // pound sign (U+00A3)
194+ {"&cent;" , 6 , "\xC2\xA2" , 2 }, // cent sign (U+00A2)
195+ {"&deg;" , 5 , "\xC2\xB0" , 2 }, // degree sign (U+00B0)
196+ {NULL , 0 , NULL , 0 }
197+ };
198+
210199 // よく使われる実体参照を変換
211200 int found = 0 ;
212- int i ;
213- for (i = 0 ; entities [i ].entity != NULL ; i ++ ) {
214- if (strncmp (src , entities [i ].entity , entities [i ].len ) == 0 ) {
215- int j ;
216- for (j = 0 ; j < entities [i ].replacement_len ; j ++ ) {
201+ for (int i = 0 ; entities [i ].entity != NULL ; i ++ ) {
202+ if (strncmp ((char * )src , entities [i ].entity , entities [i ].len ) == 0 ) {
203+ for (int j = 0 ; j < entities [i ].replacement_len ; j ++ ) {
217204 * dst ++ = entities [i ].replacement [j ];
218205 }
219206 src += entities [i ].len ;
@@ -231,12 +218,12 @@ void decode_html_entities(char *text) {
231218 unsigned long code ;
232219 if (src [2 ] == 'x' || src [2 ] == 'X' ) {
233220 // 16進数
234- code = strtoul (src + 3 , & end_ptr , 16 );
221+ code = strtoul (( char * ) src + 3 , & end_ptr , 16 );
235222 } else {
236223 // 10進数
237- code = strtoul (src + 2 , & end_ptr , 10 );
224+ code = strtoul (( char * ) src + 2 , & end_ptr , 10 );
238225 }
239-
226+
240227 if (* end_ptr == ';' && code > 0 ) {
241228 // UTF-8文字をバイト列に変換
242229 if (code < 0x80 ) {
@@ -261,7 +248,7 @@ void decode_html_entities(char *text) {
261248 // 無効なコードポイント
262249 * dst ++ = '?' ;
263250 }
264- src = end_ptr + 1 ;
251+ src = ( uint8_t * ) end_ptr + 1 ;
265252 } else {
266253 // 無効な数値文字参照はそのまま
267254 * dst ++ = * src ++ ;
@@ -271,10 +258,20 @@ void decode_html_entities(char *text) {
271258 * dst ++ = * src ++ ;
272259 }
273260 } else {
274- * dst ++ = * src ++ ;
261+ if (isspace (* src )) {
262+ // 連続する空白文字を1つのスペースに変換
263+ if (!consecutive_spaces ) {
264+ * dst ++ = ' ' ;
265+ consecutive_spaces = 1 ;
266+ }
267+ src ++ ;
268+ } else {
269+ * dst ++ = * src ++ ;
270+ consecutive_spaces = 0 ;
271+ }
275272 }
276273 }
277-
274+
278275 * dst = '\0' ;
279276}
280277
@@ -477,14 +474,9 @@ int main(int argc, char **argv)
477474 printf ("Receiving HTTP response" );
478475
479476 // HTTPレスポンスを受信してバッファに蓄積
480- memset (response , 0 , sizeof (response ));
481-
482- uint8_t * buffer ;
483- size_t buffer_size ;
484-
485- buffer = malloc (1 );
486- * buffer = 0 ;
487- buffer_size = 1 ;
477+ uint8_t * buffer = response ;
478+ size_t buffer_size = 0 ;
479+ size_t buffer_remain = sizeof (response ) - 1 ;
488480
489481 int res ;
490482 while (1 ) {
@@ -509,18 +501,23 @@ int main(int argc, char **argv)
509501 }
510502
511503 if (res > 0 ) {
512- buffer_size += res ;
513- buffer = realloc (buffer , buffer_size );
514- strcat ((char * )buffer , (char * )ptr );
504+ size_t size = buffer_remain < res ? buffer_remain : res ;
505+ memcpy (buffer , ptr , size );
506+ buffer += size ;
507+ buffer_remain -= size ;
508+ buffer_size += size ;
509+ if (buffer_remain == 0 ) {
510+ break ;
511+ }
515512 }
516513 putchar ('.' );
517514 fflush (stdout );
518515 }
519- printf ("\n\n" );
516+ printf ("\nReceived %lu bytes\n\n" , buffer_size );
517+ * buffer = '\0' ;
520518
521519 if (res != SSL_CLOSE_NOTIFY && res != SSL_ERROR_CONN_LOST ) {
522520 printf ("ssl_read() failed: %d\n" , res );
523- free (buffer );
524521 if (!http_mode ) {
525522 ssl_free (ssl_sock );
526523 ssl_ctx_free (ssl_ctx );
@@ -535,47 +532,45 @@ int main(int argc, char **argv)
535532 }
536533 close (client_fd );
537534
535+ // HTTPレスポンスからHTMLボディを抽出
536+ char * html_body = extract_html_body ((char * )response );
537+ if (show_header ) {
538+ printf ("HTML Header:\n" );
539+ printf ("--------------------\n" );
540+ fwrite (response , 1 , html_body - (char * )response , stdout );
541+ printf ("--------------------\n\n" );
542+ }
543+
544+ // HTMLのパースと表示
538545 if (!raw_mode ) {
539- // HTML実体参照をデコード
540- decode_html_entities (( char * ) buffer );
546+ parse_html ( html_body , sjis_mode );
547+ size_t html_body_len = strlen ( html_body );
541548
542- uint8_t * sbuffer = malloc (buffer_size );
543549 if (!sjis_mode ) {
544- uint8_t * inbuf = buffer ;
545- size_t inbytesleft = buffer_size ;
550+ uint8_t * sbuffer = malloc (html_body_len );
551+
552+ uint8_t * inbuf = (uint8_t * )html_body ;
553+ size_t inbytesleft = html_body_len ;
546554 uint8_t * outbuf = sbuffer ;
547- size_t outbytesleft = buffer_size ;
555+ size_t outbytesleft = html_body_len ;
548556
549557 do {
550558 int res = iconv_u2s ((char * * )& inbuf , & inbytesleft , (char * * )& outbuf , & outbytesleft );
551559 if (res < 0 ) {
552560 inbuf ++ ;
553561 inbytesleft -- ;
554562 }
555- } while (inbuf < buffer + buffer_size );
563+ } while (( char * ) inbuf < html_body + html_body_len );
556564 * outbuf = 0 ;
557- } else {
558- memcpy (sbuffer , buffer , buffer_size );
559- }
560565
561- char * html_body = extract_html_body ((char * )sbuffer );
562-
563- parse_html (html_body );
564-
565- if (show_header ) {
566- printf ("--------------------\n" );
567- fwrite (sbuffer , 1 , html_body - (char * )sbuffer , stdout );
568- printf ("--------------------\n\n" );
566+ printf ("%s" , sbuffer );
567+ free (sbuffer );
568+ } else {
569+ printf ("%s" , html_body );
569570 }
570-
571- printf ("%s" , html_body );
572-
573- free (sbuffer );
574571 } else {
575- printf ("%s" , buffer );
572+ printf ("%s" , html_body );
576573 }
577574
578- free (buffer );
579-
580575 return 0 ;
581576}
0 commit comments