@@ -152,76 +152,56 @@ void parse_html(char *html_content) {
152152 * HTML実体参照を文字コードに変換
153153 */
154154void decode_html_entities (char * text ) {
155+ // よく使われる実体参照の変換テーブル
156+ struct {
157+ const char * entity ;
158+ int len ;
159+ const char * replacement ;
160+ int replacement_len ;
161+ } entities [] = {
162+ {"<" , 4 , "<" , 1 },
163+ {">" , 4 , ">" , 1 },
164+ {"&" , 5 , "&" , 1 },
165+ {""" , 6 , "\"" , 1 },
166+ {"'" , 6 , "'" , 1 },
167+ {"«" , 7 , "<<" , 2 },
168+ {"»" , 7 , ">>" , 2 },
169+ {" " , 6 , "\xC2\xA0" , 2 }, // 改行なしスペース (U+00A0)
170+ {"©" , 6 , "\xC2\xA9" , 2 }, // copyright symbol (U+00A9)
171+ {"®" , 5 , "\xC2\xAE" , 2 }, // registered trademark symbol (U+00AE)
172+ {"™" , 7 , "\xE2\x84\xA2" , 3 }, // trademark symbol (U+2122)
173+ {"¥" , 5 , "\xC2\xA5" , 2 }, // yen sign (U+00A5)
174+ {"€" , 6 , "\xE2\x82\xAC" , 3 }, // euro sign (U+20AC)
175+ {"£" , 7 , "\xC2\xA3" , 2 }, // pound sign (U+00A3)
176+ {"¢" , 6 , "\xC2\xA2" , 2 }, // cent sign (U+00A2)
177+ {"°" , 5 , "\xC2\xB0" , 2 }, // degree sign (U+00B0)
178+ {NULL , 0 , NULL , 0 }
179+ };
180+
155181 char * src = text ;
156182 char * dst = text ;
157183
158184 while (* src ) {
159185 if (* src == '&' ) {
160186 // よく使われる実体参照を変換
161- if (strncmp (src , "<" , 4 ) == 0 ) {
162- * dst ++ = '<' ;
163- src += 4 ;
164- } else if (strncmp (src , ">" , 4 ) == 0 ) {
165- * dst ++ = '>' ;
166- src += 4 ;
167- } else if (strncmp (src , "&" , 5 ) == 0 ) {
168- * dst ++ = '&' ;
169- src += 5 ;
170- } else if (strncmp (src , """ , 6 ) == 0 ) {
171- * dst ++ = '"' ;
172- src += 6 ;
173- } else if (strncmp (src , "'" , 6 ) == 0 ) {
174- * dst ++ = '\'' ;
175- src += 6 ;
176- } else if (strncmp (src , " " , 6 ) == 0 ) {
177- // 改行なしスペース (U+00A0) -> UTF-8: 0xC2 0xA0
178- * dst ++ = 0xC2 ;
179- * dst ++ = 0xA0 ;
180- src += 6 ;
181- } else if (strncmp (src , "©" , 6 ) == 0 ) {
182- // copyright symbol (U+00A9) -> UTF-8: 0xC2 0xA9
183- * dst ++ = 0xC2 ;
184- * dst ++ = 0xA9 ;
185- src += 6 ;
186- } else if (strncmp (src , "®" , 5 ) == 0 ) {
187- // registered trademark symbol (U+00AE) -> UTF-8: 0xC2 0xAE
188- * dst ++ = 0xC2 ;
189- * dst ++ = 0xAE ;
190- src += 5 ;
191- } else if (strncmp (src , "™" , 7 ) == 0 ) {
192- // trademark symbol (U+2122) -> UTF-8: 0xE2 0x84 0xA2
193- * dst ++ = 0xE2 ;
194- * dst ++ = 0x84 ;
195- * dst ++ = 0xA2 ;
196- src += 7 ;
197- } else if (strncmp (src , "¥" , 5 ) == 0 ) {
198- // yen sign (U+00A5) -> UTF-8: 0xC2 0xA5
199- * dst ++ = 0xC2 ;
200- * dst ++ = 0xA5 ;
201- src += 5 ;
202- } else if (strncmp (src , "€" , 6 ) == 0 ) {
203- // euro sign (U+20AC) -> UTF-8: 0xE2 0x82 0xAC
204- * dst ++ = 0xE2 ;
205- * dst ++ = 0x82 ;
206- * dst ++ = 0xAC ;
207- src += 6 ;
208- } else if (strncmp (src , "£" , 7 ) == 0 ) {
209- // pound sign (U+00A3) -> UTF-8: 0xC2 0xA3
210- * dst ++ = 0xC2 ;
211- * dst ++ = 0xA3 ;
212- src += 7 ;
213- } else if (strncmp (src , "¢" , 6 ) == 0 ) {
214- // cent sign (U+00A2) -> UTF-8: 0xC2 0xA2
215- * dst ++ = 0xC2 ;
216- * dst ++ = 0xA2 ;
217- src += 6 ;
218- } else if (strncmp (src , "°" , 5 ) == 0 ) {
219- // degree sign (U+00B0) -> UTF-8: 0xC2 0xB0
220- * dst ++ = 0xC2 ;
221- * dst ++ = 0xB0 ;
222- src += 5 ;
223- src += 7 ;
224- } else if (src [1 ] == '#' ) {
187+ int found = 0 ;
188+ int i ;
189+ for (i = 0 ; entities [i ].entity != NULL ; i ++ ) {
190+ if (strncmp (src , entities [i ].entity , entities [i ].len ) == 0 ) {
191+ int j ;
192+ for (j = 0 ; j < entities [i ].replacement_len ; j ++ ) {
193+ * dst ++ = entities [i ].replacement [j ];
194+ }
195+ src += entities [i ].len ;
196+ found = 1 ;
197+ break ;
198+ }
199+ }
200+ if (found ) {
201+ continue ;
202+ }
203+
204+ if (src [1 ] == '#' ) {
225205 // 数値文字参照 &#nnn; または &#xhex;
226206 char * end_ptr ;
227207 unsigned long code ;
0 commit comments