@@ -126,40 +126,145 @@ void parse_html(char *html_content) {
126126
127127 // タグ内でない場合は文字をコピー
128128 if (!in_tag && !in_script && !in_style ) {
129- // HTMLエンティティの簡単な変換
130- if (strncmp ((char * )src , "<" , 4 ) == 0 ) {
129+ if (isspace (* src )) {
130+ // 連続する空白文字を1つのスペースに変換
131+ if (!consecutive_spaces ) {
132+ * dst ++ = ' ' ;
133+ consecutive_spaces = 1 ;
134+ }
135+ src ++ ;
136+ } else {
137+ * dst ++ = * src ++ ;
138+ consecutive_spaces = 0 ;
139+ }
140+ } else {
141+ src ++ ;
142+ }
143+ }
144+
145+ * dst = '\0' ;
146+ }
147+
148+ /*
149+ * HTML実体参照を文字コードに変換
150+ */
151+ void decode_html_entities (char * text ) {
152+ char * src = text ;
153+ char * dst = text ;
154+
155+ while (* src ) {
156+ if (* src == '&' ) {
157+ // よく使われる実体参照を変換
158+ if (strncmp (src , "<" , 4 ) == 0 ) {
131159 * dst ++ = '<' ;
132160 src += 4 ;
133- consecutive_spaces = 0 ;
134- } else if (strncmp ((char * )src , ">" , 4 ) == 0 ) {
161+ } else if (strncmp (src , ">" , 4 ) == 0 ) {
135162 * dst ++ = '>' ;
136163 src += 4 ;
137- consecutive_spaces = 0 ;
138- } else if (strncmp ((char * )src , "&" , 5 ) == 0 ) {
164+ } else if (strncmp (src , "&" , 5 ) == 0 ) {
139165 * dst ++ = '&' ;
140166 src += 5 ;
141- consecutive_spaces = 0 ;
142- } else if (strncmp ((char * )src , """ , 6 ) == 0 ) {
167+ } else if (strncmp (src , """ , 6 ) == 0 ) {
143168 * dst ++ = '"' ;
144169 src += 6 ;
145- consecutive_spaces = 0 ;
146- } else if (strncmp ((char * )src , " " , 6 ) == 0 ) {
147- * dst ++ = ' ' ;
170+ } else if (strncmp (src , "'" , 6 ) == 0 ) {
171+ * dst ++ = '\'' ;
148172 src += 6 ;
149- consecutive_spaces = 1 ;
150- } else if (isspace (* src )) {
151- // 連続する空白文字を1つのスペースに変換
152- if (!consecutive_spaces ) {
153- * dst ++ = ' ' ;
154- consecutive_spaces = 1 ;
173+ } else if (strncmp (src , " " , 6 ) == 0 ) {
174+ // 改行なしスペース (U+00A0) -> UTF-8: 0xC2 0xA0
175+ * dst ++ = 0xC2 ;
176+ * dst ++ = 0xA0 ;
177+ src += 6 ;
178+ } else if (strncmp (src , "©" , 6 ) == 0 ) {
179+ // copyright symbol (U+00A9) -> UTF-8: 0xC2 0xA9
180+ * dst ++ = 0xC2 ;
181+ * dst ++ = 0xA9 ;
182+ src += 6 ;
183+ } else if (strncmp (src , "®" , 5 ) == 0 ) {
184+ // registered trademark symbol (U+00AE) -> UTF-8: 0xC2 0xAE
185+ * dst ++ = 0xC2 ;
186+ * dst ++ = 0xAE ;
187+ src += 5 ;
188+ } else if (strncmp (src , "™" , 7 ) == 0 ) {
189+ // trademark symbol (U+2122) -> UTF-8: 0xE2 0x84 0xA2
190+ * dst ++ = 0xE2 ;
191+ * dst ++ = 0x84 ;
192+ * dst ++ = 0xA2 ;
193+ src += 7 ;
194+ } else if (strncmp (src , "¥" , 5 ) == 0 ) {
195+ // yen sign (U+00A5) -> UTF-8: 0xC2 0xA5
196+ * dst ++ = 0xC2 ;
197+ * dst ++ = 0xA5 ;
198+ src += 5 ;
199+ } else if (strncmp (src , "€" , 6 ) == 0 ) {
200+ // euro sign (U+20AC) -> UTF-8: 0xE2 0x82 0xAC
201+ * dst ++ = 0xE2 ;
202+ * dst ++ = 0x82 ;
203+ * dst ++ = 0xAC ;
204+ src += 6 ;
205+ } else if (strncmp (src , "£" , 7 ) == 0 ) {
206+ // pound sign (U+00A3) -> UTF-8: 0xC2 0xA3
207+ * dst ++ = 0xC2 ;
208+ * dst ++ = 0xA3 ;
209+ src += 7 ;
210+ } else if (strncmp (src , "¢" , 6 ) == 0 ) {
211+ // cent sign (U+00A2) -> UTF-8: 0xC2 0xA2
212+ * dst ++ = 0xC2 ;
213+ * dst ++ = 0xA2 ;
214+ src += 6 ;
215+ } else if (strncmp (src , "°" , 5 ) == 0 ) {
216+ // degree sign (U+00B0) -> UTF-8: 0xC2 0xB0
217+ * dst ++ = 0xC2 ;
218+ * dst ++ = 0xB0 ;
219+ src += 5 ;
220+ src += 7 ;
221+ } else if (src [1 ] == '#' ) {
222+ // 数値文字参照 &#nnn; または &#xhex;
223+ char * end_ptr ;
224+ unsigned long code ;
225+ if (src [2 ] == 'x' || src [2 ] == 'X' ) {
226+ // 16進数
227+ code = strtoul (src + 3 , & end_ptr , 16 );
228+ } else {
229+ // 10進数
230+ code = strtoul (src + 2 , & end_ptr , 10 );
231+ }
232+
233+ if (* end_ptr == ';' && code > 0 ) {
234+ // UTF-8文字をバイト列に変換
235+ if (code < 0x80 ) {
236+ // 1バイト文字 (ASCII)
237+ * dst ++ = (char )code ;
238+ } else if (code < 0x800 ) {
239+ // 2バイト文字
240+ * dst ++ = (char )(0xC0 | (code >> 6 ));
241+ * dst ++ = (char )(0x80 | (code & 0x3F ));
242+ } else if (code < 0x10000 ) {
243+ // 3バイト文字
244+ * dst ++ = (char )(0xE0 | (code >> 12 ));
245+ * dst ++ = (char )(0x80 | ((code >> 6 ) & 0x3F ));
246+ * dst ++ = (char )(0x80 | (code & 0x3F ));
247+ } else if (code < 0x110000 ) {
248+ // 4バイト文字
249+ * dst ++ = (char )(0xF0 | (code >> 18 ));
250+ * dst ++ = (char )(0x80 | ((code >> 12 ) & 0x3F ));
251+ * dst ++ = (char )(0x80 | ((code >> 6 ) & 0x3F ));
252+ * dst ++ = (char )(0x80 | (code & 0x3F ));
253+ } else {
254+ // 無効なコードポイント
255+ * dst ++ = '?' ;
256+ }
257+ src = end_ptr + 1 ;
258+ } else {
259+ // 無効な数値文字参照はそのまま
260+ * dst ++ = * src ++ ;
155261 }
156- src ++ ;
157262 } else {
263+ // 未知の実体参照はそのまま
158264 * dst ++ = * src ++ ;
159- consecutive_spaces = 0 ;
160265 }
161266 } else {
162- src ++ ;
267+ * dst ++ = * src ++ ;
163268 }
164269 }
165270
@@ -334,6 +439,11 @@ int main(int argc, char **argv)
334439
335440 char * html_body = extract_html_body ((char * )sbuffer );
336441
442+ // HTML実体参照をデコード
443+ decode_html_entities ((char * )sbuffer );
444+
445+ fwrite (sbuffer , 1 , html_body - (char * )sbuffer , stdout );
446+
337447 parse_html (html_body );
338448 printf ("%s" , html_body );
339449
0 commit comments