Fix html processing order

yunkya2 · yunkya2 · commit 10e60f94ec8a · 2025-09-28T23:52:23.000+09:00
diff --git a/httpsget.c b/httpsget.c
@@ -39,7 +39,7 @@
 #endif
 
 uint8_t request[65536];
-uint8_t response[65536];
+uint8_t response[65536 * 8];
 
 /*
  * Shift_JISの文字かどうかを判定
@@ -80,7 +80,7 @@ int is_block_element(const char *tag_start) {
  * 簡単なHTMLパース処理（Shift_JIS対応）
  * HTMLタグを除去してプレーンテキストを抽出
  */
-void parse_html(char *html_content) {
+void parse_html(char *html_content, int sjis_mode) {
     unsigned char *src = (unsigned char*)html_content;
     unsigned char *dst = (unsigned char*)html_content;
     int in_tag = 0;
@@ -90,17 +90,29 @@ void parse_html(char *html_content) {
     unsigned char *tag_start = NULL;
     
     while (*src) {
-        // Shift_JISの2バイト文字の処理
-        if (is_sjis_lead_byte(*src) && *(src + 1)) {
-            if (!in_tag && !in_script && !in_style) {
-                *dst++ = *src;
-                *dst++ = *(src + 1);
-                consecutive_spaces = 0;
+        if (sjis_mode) {
+            // Shift_JISの2バイト文字の処理
+            if (is_sjis_lead_byte(*src) && *(src + 1)) {
+                if (!in_tag && !in_script && !in_style) {
+                    *dst++ = *src;
+                    *dst++ = *(src + 1);
+                    consecutive_spaces = 0;
+                }
+                src += 2;
+                continue;
+            }
+        } else {
+            // UTF-8のマルチバイト文字の処理
+            if (*src & 0x80) {
+                if (!in_tag && !in_script && !in_style) {
+                    *dst++ = *src;
+                    consecutive_spaces = 0;
+                }
+                src++;
+                continue;
             }
-            src += 2;
-            continue;
         }
-        
+
         // スクリプトタグの検出
         if (!in_tag && strncmp((char*)src, "<script", 7) == 0) {
             in_script = 1;
@@ -151,69 +163,44 @@ void parse_html(char *html_content) {
             continue;
         }
         
-        // タグ内でない場合は文字をコピー
-        if (!in_tag && !in_script && !in_style) {
-            if (isspace(*src)) {
-                // 連続する空白文字を1つのスペースに変換
-                if (!consecutive_spaces) {
-                    *dst++ = ' ';
-                    consecutive_spaces = 1;
-                }
-                src++;
-            } else {
-                *dst++ = *src++;
-                consecutive_spaces = 0;
-            }
-        } else {
+        // タグ内はスキップ
+        if (in_tag || in_script || in_style) {
             src++;
+            continue;
         }
-    }
-    
-    *dst = '\0';
-}
 
-/*
- * HTML実体参照を文字コードに変換
- */
-void decode_html_entities(char *text) {
-    // よく使われる実体参照の変換テーブル
-    struct {
-        const char *entity;
-        int len;
-        const char *replacement;
-        int replacement_len;
-    } entities[] = {
-        {"&lt;", 4, "<", 1},
-        {"&gt;", 4, ">", 1},
-        {"&amp;", 5, "&", 1},
-        {"&quot;", 6, "\"", 1},
-        {"&apos;", 6, "'", 1},
-        {"&laquo;", 7, "<<", 2},
-        {"&raquo;", 7, ">>", 2},
-        {"&nbsp;", 6, "\xC2\xA0", 2},        // 改行なしスペース (U+00A0)
-        {"&copy;", 6, "\xC2\xA9", 2},        // copyright symbol (U+00A9)
-        {"&reg;", 5, "\xC2\xAE", 2},         // registered trademark symbol (U+00AE)
-        {"&trade;", 7, "\xE2\x84\xA2", 3},   // trademark symbol (U+2122)
-        {"&yen;", 5, "\xC2\xA5", 2},         // yen sign (U+00A5)
-        {"&euro;", 6, "\xE2\x82\xAC", 3},    // euro sign (U+20AC)
-        {"&pound;", 7, "\xC2\xA3", 2},       // pound sign (U+00A3)
-        {"&cent;", 6, "\xC2\xA2", 2},        // cent sign (U+00A2)
-        {"&deg;", 5, "\xC2\xB0", 2},         // degree sign (U+00B0)
-        {NULL, 0, NULL, 0}
-    };
-    
-    char *src = text;
-    char *dst = text;
-    
-    while (*src) {
         if (*src == '&') {
+            // よく使われる実体参照の変換テーブル
+            const struct {
+                const char *entity;
+                int len;
+                const char *replacement;
+                int replacement_len;
+            } entities[] = {
+                {"&lt;", 4, "<", 1},
+                {"&gt;", 4, ">", 1},
+                {"&amp;", 5, "&", 1},
+                {"&quot;", 6, "\"", 1},
+                {"&apos;", 6, "'", 1},
+                {"&laquo;", 7, "<<", 2},
+                {"&raquo;", 7, ">>", 2},
+                {"&nbsp;", 6, " ", 1},               // 改行なしスペース
+                {"&copy;", 6, "\xC2\xA9", 2},        // copyright symbol (U+00A9)
+                {"&reg;", 5, "\xC2\xAE", 2},         // registered trademark symbol (U+00AE)
+                {"&trade;", 7, "\xE2\x84\xA2", 3},   // trademark symbol (U+2122)
+                {"&yen;", 5, "\xC2\xA5", 2},         // yen sign (U+00A5)
+                {"&euro;", 6, "\xE2\x82\xAC", 3},    // euro sign (U+20AC)
+                {"&pound;", 7, "\xC2\xA3", 2},       // pound sign (U+00A3)
+                {"&cent;", 6, "\xC2\xA2", 2},        // cent sign (U+00A2)
+                {"&deg;", 5, "\xC2\xB0", 2},         // degree sign (U+00B0)
+                {NULL, 0, NULL, 0}
+            };
+            consecutive_spaces = 0;  // entity output ends any whitespace run, so a following space is kept
             // よく使われる実体参照を変換
             int found = 0;
-            int i;
-            for (i = 0; entities[i].entity != NULL; i++) {
-                if (strncmp(src, entities[i].entity, entities[i].len) == 0) {
-                    int j;
-                    for (j = 0; j < entities[i].replacement_len; j++) {
+            for (int i = 0; entities[i].entity != NULL; i++) {
+                if (strncmp((char *)src, entities[i].entity, entities[i].len) == 0) {
+                    for (int j = 0; j < entities[i].replacement_len; j++) {
                         *dst++ = entities[i].replacement[j];
                     }
                     src += entities[i].len;
@@ -231,12 +218,12 @@ void decode_html_entities(char *text) {
                 unsigned long code;
                 if (src[2] == 'x' || src[2] == 'X') {
                     // 16進数
-                    code = strtoul(src + 3, &end_ptr, 16);
+                    code = strtoul((char *)src + 3, &end_ptr, 16);
                 } else {
                     // 10進数
-                    code = strtoul(src + 2, &end_ptr, 10);
+                    code = strtoul((char *)src + 2, &end_ptr, 10);
                 }
-                
+
                 if (*end_ptr == ';' && code > 0) {
                     // UTF-8文字をバイト列に変換
                     if (code < 0x80) {
@@ -261,7 +248,7 @@ void decode_html_entities(char *text) {
                         // 無効なコードポイント
                         *dst++ = '?';
                     }
-                    src = end_ptr + 1;
+                    src = (uint8_t *)end_ptr + 1;
                 } else {
                     // 無効な数値文字参照はそのまま
                     *dst++ = *src++;
@@ -271,10 +258,20 @@ void decode_html_entities(char *text) {
                 *dst++ = *src++;
             }
         } else {
-            *dst++ = *src++;
+            if (isspace(*src)) {
+                // 連続する空白文字を1つのスペースに変換
+                if (!consecutive_spaces) {
+                    *dst++ = ' ';
+                    consecutive_spaces = 1;
+                }
+                src++;
+            } else {
+                *dst++ = *src++;
+                consecutive_spaces = 0;
+            }
         }
     }
-    
+
     *dst = '\0';
 }
 
@@ -477,14 +474,9 @@ int main(int argc, char **argv)
     printf("Receiving HTTP response");
 
     // HTTPレスポンスを受信してバッファに蓄積
-    memset(response, 0, sizeof(response));
-
-    uint8_t *buffer;
-    size_t buffer_size;
-
-    buffer = malloc(1);
-    *buffer = 0;
-    buffer_size = 1;
+    uint8_t *buffer = response;
+    size_t buffer_size = 0;
+    size_t buffer_remain = sizeof(response) - 1;
 
     int res;
     while (1) {
@@ -509,18 +501,23 @@ int main(int argc, char **argv)
         }
 
         if (res > 0) {
-            buffer_size += res;
-            buffer = realloc(buffer, buffer_size);
-            strcat((char *)buffer, (char *)ptr);
+            size_t size = buffer_remain < res ? buffer_remain : res;
+            memcpy(buffer, ptr, size);
+            buffer += size;
+            buffer_remain -= size;
+            buffer_size += size;
+            if (buffer_remain == 0) {
+                break;
+            }
         }
         putchar('.');
         fflush(stdout);
     }
-    printf("\n\n");
+    printf("\nReceived %lu bytes\n\n", (unsigned long)buffer_size);
+    *buffer = '\0';
 
     if (res != SSL_CLOSE_NOTIFY && res != SSL_ERROR_CONN_LOST) {
         printf("ssl_read() failed: %d\n", res);
-        free(buffer);
         if (!http_mode) {
             ssl_free(ssl_sock);
             ssl_ctx_free(ssl_ctx);
@@ -535,47 +532,50 @@ int main(int argc, char **argv)
     }
     close(client_fd);
 
+    // Extract the HTML body from the HTTP response
+    char *html_body = extract_html_body((char *)response);
+    if (show_header) {
+        printf("HTML Header:\n");
+        printf("--------------------\n");
+        fwrite(response, 1, html_body - (char *)response, stdout);
+        printf("--------------------\n\n");
+    }
+
+    // Parse the HTML and print the extracted text
     if (!raw_mode) {
-        // HTML実体参照をデコード
-        decode_html_entities((char *)buffer);
+        parse_html(html_body, sjis_mode);
+        size_t html_body_len = strlen(html_body);
 
-        uint8_t *sbuffer = malloc(buffer_size);
         if (!sjis_mode) {
-            uint8_t *inbuf = buffer;
-            size_t inbytesleft = buffer_size;
+            // +1: the loop may fill all html_body_len bytes and a NUL is stored after them
+            uint8_t *sbuffer = malloc(html_body_len + 1);
+            if (sbuffer == NULL) {
+                printf("malloc() failed\n");
+                return 1;
+            }
+
+            uint8_t *inbuf = (uint8_t *)html_body;
+            size_t inbytesleft = html_body_len;
             uint8_t *outbuf = sbuffer;
-            size_t outbytesleft = buffer_size;
+            size_t outbytesleft = html_body_len;
 
             do {
                 int res = iconv_u2s((char **)&inbuf, &inbytesleft, (char **)&outbuf, &outbytesleft);
                 if (res < 0) {
                     inbuf++;
                     inbytesleft--;
                 }
-            } while (inbuf < buffer + buffer_size);
+            } while ((char *)inbuf < html_body + html_body_len);
             *outbuf = 0;
-        } else {
-            memcpy(sbuffer, buffer, buffer_size);
-        }
 
-        char *html_body = extract_html_body((char *)sbuffer);
-
-        parse_html(html_body);
-
-        if (show_header) {
-            printf("--------------------\n");
-            fwrite(sbuffer, 1, html_body - (char *)sbuffer, stdout);
-            printf("--------------------\n\n");
+            printf("%s", sbuffer);
+            free(sbuffer);
+        } else {
+            printf("%s", html_body);
         }
-
-        printf("%s", html_body);
-
-        free(sbuffer);
     } else {
-        printf("%s", buffer);
+        printf("%s", html_body);
     }
 
-    free(buffer);
-
     return 0;
 }