00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031 #ifdef HAVE_CONFIG_H
00032 #include "config.h"
00033 #endif
00034
00035 #include "xmldef.h"
00036 #include "xmltok.h"
00037 #include "nametab.h"
00038
/* Initializer fragments for the function-pointer part of an ENCODING.
   VTABLE1 lists the scanner tables and helper functions; every entry is
   built with PREFIX(), so PREFIX must be defined at the point where the
   fragment is expanded inside an initializer.
   NOTE(review): the entry order has to match the member order of ENCODING,
   which is declared outside this file -- confirm before reordering. */
00039 #define VTABLE1 \
00040 { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
00041 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
00042 PREFIX(sameName), \
00043 PREFIX(nameMatchesAscii), \
00044 PREFIX(nameLength), \
00045 PREFIX(skipS), \
00046 PREFIX(getAtts), \
00047 PREFIX(charRefNumber), \
00048 PREFIX(predefinedEntityName), \
00049 PREFIX(updatePosition), \
00050 PREFIX(isPublicId)
00051
/* VTABLE1 followed by the PREFIX-selected UTF-8 and UTF-16 converters. */
00052 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
00053
/* Nonzero when the character with high byte `hi` and low byte `lo` has its
   flag set in namingBitmap.  pages[hi] selects a group of eight bitmap
   elements (hence the << 3), lo >> 5 picks the element inside that group and
   lo & 0x1F picks the bit, i.e. each element carries 32 flags.
   Note that `pages` is used without parentheses, so pass a plain array name. */
00054 #define UCS2_GET_NAMING(pages, hi, lo) \
00055 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
00056
00057
00058
00059
00060
/* Naming-bitmap lookup for a two-byte UTF-8 sequence.  `byte` must be an
   unsigned char pointer to the lead byte.  The page index is bits 2..4 of
   the lead byte; the element offset within the page is built from the low
   two bits of the lead byte and bit 5 of the second byte; the bit number is
   the low five bits of the second byte.  Result is nonzero when the flag is
   set. */
00061 #define UTF8_GET_NAMING2(pages, byte) \
00062 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
00063 + ((((byte)[0]) & 3) << 1) \
00064 + ((((byte)[1]) >> 5) & 1)] \
00065 & (1 << (((byte)[1]) & 0x1F)))
00066
00067
00068
00069
00070
/* Naming-bitmap lookup for a three-byte UTF-8 sequence.  `byte` must be an
   unsigned char pointer to the lead byte.  The page index combines the low
   four bits of the lead byte with bits 2..5 of the second byte; the element
   offset uses the low two bits of the second byte and bit 5 of the third
   byte; the bit number is the low five bits of the third byte. */
00071 #define UTF8_GET_NAMING3(pages, byte) \
00072 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
00073 + ((((byte)[1]) >> 2) & 0xF)] \
00074 << 3) \
00075 + ((((byte)[1]) & 3) << 1) \
00076 + ((((byte)[2]) >> 5) & 1)] \
00077 & (1 << (((byte)[2]) & 0x1F)))
00078
/* Dispatch on the sequence length n: 2 and 3 use the lookups above, any
   other length yields 0.  p is cast to const unsigned char * here, so
   callers may pass a plain char pointer. */
00079 #define UTF8_GET_NAMING(pages, p, n) \
00080 ((n) == 2 \
00081 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
00082 : ((n) == 3 \
00083 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
00084 : 0))
00085
/* Nonzero when the three-byte UTF-8 sequence at p must be rejected:
   - lead byte 0xED with bit 0x20 set in the second byte (0xED 0xA0..0xBF,
     the encodings of U+D800..U+DFFF), or
   - the exact sequences 0xEF 0xBF 0xBE and 0xEF 0xBF 0xBF (U+FFFE, U+FFFF).
   p must point to unsigned char.  The parameter is fully parenthesized so
   that expressions such as `buf + 1` can be passed safely. */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : (*(p) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))

/* Nonzero when the four-byte UTF-8 sequence at p starts with 0xF4 and its
   second byte has any of the 0x30 bits set, i.e. encodes a value above
   U+10FFFF.  p must point to unsigned char. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
00094
00095 static
00096 int isNever(const ENCODING *enc, const char *p)
00097 {
00098 return 0;
00099 }
00100
00101 static
00102 int utf8_isName2(const ENCODING *enc, const char *p)
00103 {
00104 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
00105 }
00106
00107 static
00108 int utf8_isName3(const ENCODING *enc, const char *p)
00109 {
00110 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
00111 }
00112
00113 #define utf8_isName4 isNever
00114
00115 static
00116 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
00117 {
00118 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
00119 }
00120
00121 static
00122 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
00123 {
00124 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
00125 }
00126
00127 #define utf8_isNmstrt4 isNever
00128
00129 #define utf8_isInvalid2 isNever
00130
00131 static
00132 int utf8_isInvalid3(const ENCODING *enc, const char *p)
00133 {
00134 return UTF8_INVALID3((const unsigned char *)p);
00135 }
00136
00137 static
00138 int utf8_isInvalid4(const ENCODING *enc, const char *p)
00139 {
00140 return UTF8_INVALID4((const unsigned char *)p);
00141 }
00142
/* Encoding descriptor used by the tokenizer instantiations in this file.
   The generic ENCODING is the first member, so code in this file casts an
   ENCODING pointer to a struct normal_encoding pointer to reach the extra
   members (see SB_BYTE_TYPE and the IS_*_CHAR macros). */
00143 struct normal_encoding {
00144 ENCODING enc;
/* Byte-type code (a BT_* value) for each of the 256 possible byte values. */
00145 unsigned char type[256];
00146 #ifdef XML_MIN_SIZE
/* Hooks present only in the XML_MIN_SIZE build; the BYTE_TYPE,
   IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and CHAR_MATCHES
   macros call through them in that configuration. */
00147 int (*byteType)(const ENCODING *, const char *);
00148 int (*isNameMin)(const ENCODING *, const char *);
00149 int (*isNmstrtMin)(const ENCODING *, const char *);
00150 int (*byteToAscii)(const ENCODING *, const char *);
00151 int (*charMatches)(const ENCODING *, const char *, int);
00152 #endif
/* Predicates for multi-byte characters; the digit is the sequence length
   selected by the n argument of IS_NAME_CHAR / IS_NMSTRT_CHAR /
   IS_INVALID_CHAR. */
00153 int (*isName2)(const ENCODING *, const char *);
00154 int (*isName3)(const ENCODING *, const char *);
00155 int (*isName4)(const ENCODING *, const char *);
00156 int (*isNmstrt2)(const ENCODING *, const char *);
00157 int (*isNmstrt3)(const ENCODING *, const char *);
00158 int (*isNmstrt4)(const ENCODING *, const char *);
00159 int (*isInvalid2)(const ENCODING *, const char *);
00160 int (*isInvalid3)(const ENCODING *, const char *);
00161 int (*isInvalid4)(const ENCODING *, const char *);
00162 };
00163
/* STANDARD_VTABLE(E) expands to the initializers for the XML_MIN_SIZE-only
   members of struct normal_encoding, using functions whose names start with
   E.  The expansion ends with a comma so it can be followed directly by
   NORMAL_VTABLE(...).  Without XML_MIN_SIZE those members do not exist and
   the macro expands to nothing. */
00164 #ifdef XML_MIN_SIZE
00165
00166 #define STANDARD_VTABLE(E) \
00167 E ## byteType, \
00168 E ## isNameMin, \
00169 E ## isNmstrtMin, \
00170 E ## byteToAscii, \
00171 E ## charMatches,
00172
00173 #else
00174
00175 #define STANDARD_VTABLE(E)
00176
00177 #endif
00178
/* NORMAL_VTABLE(E) expands to the initializers for the nine multi-byte
   predicate members of struct normal_encoding, in declaration order. */
00179 #define NORMAL_VTABLE(E) \
00180 E ## isName2, \
00181 E ## isName3, \
00182 E ## isName4, \
00183 E ## isNmstrt2, \
00184 E ## isNmstrt3, \
00185 E ## isNmstrt4, \
00186 E ## isInvalid2, \
00187 E ## isInvalid3, \
00188 E ## isInvalid4
00189
/* Forward declaration; the definition appears later in this file. */
00190 static int checkCharRefNumber(int);
00191
00192 #include "xmltok_impl.h"
00193
/* In the XML_MIN_SIZE build the single-byte ("sb_") encodings fill the
   isNameMin / isNmstrtMin hooks with isNever, which always returns 0. */
00194 #ifdef XML_MIN_SIZE
00195 #define sb_isNameMin isNever
00196 #define sb_isNmstrtMin isNever
00197 #endif
00198
/* MINBPC(enc): minimum number of bytes per character.  Read from the
   encoding at run time under XML_MIN_SIZE; otherwise the constant 1 for the
   instantiation of xmltok_impl.c that follows. */
00199 #ifdef XML_MIN_SIZE
00200 #define MINBPC(enc) ((enc)->minBytesPerChar)
00201 #else
00202
00203 #define MINBPC(enc) 1
00204 #endif
00205
/* Byte type of the single byte at p, looked up in the encoding's type[]
   table.  The byte is converted to unsigned char before indexing. */
00206 #define SB_BYTE_TYPE(enc, p) \
00207 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
00208
/* BYTE_TYPE(enc, p): under XML_MIN_SIZE it calls the encoding's byteType
   hook (sb_byteType is the single-byte implementation of that hook);
   otherwise it is the direct table lookup. */
00209 #ifdef XML_MIN_SIZE
00210 static
00211 int sb_byteType(const ENCODING *enc, const char *p)
00212 {
00213 return SB_BYTE_TYPE(enc, p);
00214 }
00215 #define BYTE_TYPE(enc, p) \
00216 (((const struct normal_encoding *)(enc))->byteType(enc, p))
00217 #else
00218 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
00219 #endif
00220
/* BYTE_TO_ASCII(enc, p): value of the character at p as a plain char.
   Under XML_MIN_SIZE this goes through the byteToAscii hook; otherwise it
   simply dereferences p (p is not parenthesized in that form). */
00221 #ifdef XML_MIN_SIZE
00222 #define BYTE_TO_ASCII(enc, p) \
00223 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
00224 static
00225 int sb_byteToAscii(const ENCODING *enc, const char *p)
00226 {
00227 return *p;
00228 }
00229 #else
00230 #define BYTE_TO_ASCII(enc, p) (*p)
00231 #endif
00232
/* Multi-byte character predicates.  n is pasted onto the member name, so it
   must be the literal 2, 3 or 4; the call goes through the corresponding
   function pointer of struct normal_encoding. */
00233 #define IS_NAME_CHAR(enc, p, n) \
00234 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
00235 #define IS_NMSTRT_CHAR(enc, p, n) \
00236 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
00237 #define IS_INVALID_CHAR(enc, p, n) \
00238 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
00239
/* Predicates for a character occupying exactly MINBPC bytes.  Under
   XML_MIN_SIZE they call the isNameMin / isNmstrtMin hooks; otherwise they
   are the constant 0 for this single-byte instantiation. */
00240 #ifdef XML_MIN_SIZE
00241 #define IS_NAME_CHAR_MINBPC(enc, p) \
00242 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
00243 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
00244 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
00245 #else
00246 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
00247 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
00248 #endif
00249
/* CHAR_MATCHES(enc, p, c): true when the character at p equals c.  Under
   XML_MIN_SIZE it calls the charMatches hook (sb_charMatches is the
   single-byte implementation); otherwise it compares *p with c directly. */
00250 #ifdef XML_MIN_SIZE
00251 #define CHAR_MATCHES(enc, p, c) \
00252 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
00253 static
00254 int sb_charMatches(const ENCODING *enc, const char *p, int c)
00255 {
00256 return *p == c;
00257 }
00258 #else
00259
00260 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
00261 #endif
00262
/* Instantiate the generic tokenizer: every function defined by
   xmltok_impl.c through PREFIX() gets the name prefix "normal_" and uses the
   macros defined above. */
00263 #define PREFIX(ident) normal_ ## ident
00264 #include "xmltok_impl.c"
00265
/* Drop the per-instantiation macros so that later sections can redefine
   them.  PREFIX itself stays defined as normal_ at this point. */
00266 #undef MINBPC
00267 #undef BYTE_TYPE
00268 #undef BYTE_TO_ASCII
00269 #undef CHAR_MATCHES
00270 #undef IS_NAME_CHAR
00271 #undef IS_NAME_CHAR_MINBPC
00272 #undef IS_NMSTRT_CHAR
00273 #undef IS_NMSTRT_CHAR_MINBPC
00274 #undef IS_INVALID_CHAR
/* Marker bits OR-ed into the first byte of a UTF-8 sequence of one, two,
   three and four bytes respectively. */
enum {
  UTF8_cval1 = 0x00,  /* 0xxxxxxx */
  UTF8_cval2 = 0xC0,  /* 110xxxxx */
  UTF8_cval3 = 0xE0,  /* 1110xxxx */
  UTF8_cval4 = 0xF0   /* 11110xxx */
};
00282
00283 static
00284 void utf8_toUtf8(const ENCODING *enc,
00285 const char **fromP, const char *fromLim,
00286 char **toP, const char *toLim)
00287 {
00288 char *to;
00289 const char *from;
00290 if (fromLim - *fromP > toLim - *toP) {
00291
00292 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
00293 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
00294 break;
00295 }
00296 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
00297 *to = *from;
00298 *fromP = from;
00299 *toP = to;
00300 }
00301
00302 static
00303 void utf8_toUtf16(const ENCODING *enc,
00304 const char **fromP, const char *fromLim,
00305 unsigned short **toP, const unsigned short *toLim)
00306 {
00307 unsigned short *to = *toP;
00308 const char *from = *fromP;
00309 while (from != fromLim && to != toLim) {
00310 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
00311 case BT_LEAD2:
00312 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
00313 from += 2;
00314 break;
00315 case BT_LEAD3:
00316 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
00317 from += 3;
00318 break;
00319 case BT_LEAD4:
00320 {
00321 unsigned long n;
00322 if (to + 1 == toLim)
00323 break;
00324 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
00325 n -= 0x10000;
00326 to[0] = (unsigned short)((n >> 10) | 0xD800);
00327 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
00328 to += 2;
00329 from += 4;
00330 }
00331 break;
00332 default:
00333 *to++ = *from++;
00334 break;
00335 }
00336 }
00337 *fromP = from;
00338 *toP = to;
00339 }
00340
/* UTF-8 encoding descriptors.  Each one uses the normal_ scanner functions
   (VTABLE1 is expanded while PREFIX is normal_), the UTF-8 converters, a
   byte-type table assembled from two included table headers, and the utf8_
   multi-byte predicates.
   NOTE(review): the three integers after the converters initialise ENCODING
   members declared outside this file; the first is presumably
   minBytesPerChar -- confirm against the ENCODING declaration.
   The "_ns" variants exist only when XML_NS is defined.  The non-ns variants
   define BT_COLON as BT_NMSTRT while the ASCII table header is included, so
   whatever entry that header spells BT_COLON becomes a name-start byte. */
00341 #ifdef XML_NS
00342 static const struct normal_encoding utf8_encoding_ns = {
00343 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00344 {
00345 #include "asciitab.h"
00346 #include "utf8tab.h"
00347 },
00348 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00349 };
00350 #endif
00351
00352 static const struct normal_encoding utf8_encoding = {
00353 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00354 {
00355 #define BT_COLON BT_NMSTRT
00356 #include "asciitab.h"
00357 #undef BT_COLON
00358 #include "utf8tab.h"
00359 },
00360 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00361 };
00362
/* The "internal_" variants differ from the ones above only in taking the
   first half of the byte-type table from iasciitab.h instead of asciitab.h. */
00363 #ifdef XML_NS
00364
00365 static const struct normal_encoding internal_utf8_encoding_ns = {
00366 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00367 {
00368 #include "iasciitab.h"
00369 #include "utf8tab.h"
00370 },
00371 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00372 };
00373
00374 #endif
00375
00376 static const struct normal_encoding internal_utf8_encoding = {
00377 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00378 {
00379 #define BT_COLON BT_NMSTRT
00380 #include "iasciitab.h"
00381 #undef BT_COLON
00382 #include "utf8tab.h"
00383 },
00384 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00385 };
00386
00387 static
00388 void latin1_toUtf8(const ENCODING *enc,
00389 const char **fromP, const char *fromLim,
00390 char **toP, const char *toLim)
00391 {
00392 for (;;) {
00393 unsigned char c;
00394 if (*fromP == fromLim)
00395 break;
00396 c = (unsigned char)**fromP;
00397 if (c & 0x80) {
00398 if (toLim - *toP < 2)
00399 break;
00400 *(*toP)++ = ((c >> 6) | UTF8_cval2);
00401 *(*toP)++ = ((c & 0x3f) | 0x80);
00402 (*fromP)++;
00403 }
00404 else {
00405 if (*toP == toLim)
00406 break;
00407 *(*toP)++ = *(*fromP)++;
00408 }
00409 }
00410 }
00411
00412 static
00413 void latin1_toUtf16(const ENCODING *enc,
00414 const char **fromP, const char *fromLim,
00415 unsigned short **toP, const unsigned short *toLim)
00416 {
00417 while (*fromP != fromLim && *toP != toLim)
00418 *(*toP)++ = (unsigned char)*(*fromP)++;
00419 }
00420
/* Latin-1 encoding descriptors: normal_ scanner functions, the Latin-1
   converters and a byte-type table built from asciitab.h plus latin1tab.h.
   NORMAL_VTABLE is not used, so the multi-byte predicate members are left
   to default (zero) initialisation.
   The "_ns" variant exists only when XML_NS is defined; the non-ns variant
   defines BT_COLON as BT_NMSTRT while asciitab.h is included. */
00421 #ifdef XML_NS
00422
00423 static const struct normal_encoding latin1_encoding_ns = {
00424 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00425 {
00426 #include "asciitab.h"
00427 #include "latin1tab.h"
00428 },
00429 STANDARD_VTABLE(sb_)
00430 };
00431
00432 #endif
00433
00434 static const struct normal_encoding latin1_encoding = {
00435 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00436 {
00437 #define BT_COLON BT_NMSTRT
00438 #include "asciitab.h"
00439 #undef BT_COLON
00440 #include "latin1tab.h"
00441 },
00442 STANDARD_VTABLE(sb_)
00443 };
00444
00445 static
00446 void ascii_toUtf8(const ENCODING *enc,
00447 const char **fromP, const char *fromLim,
00448 char **toP, const char *toLim)
00449 {
00450 while (*fromP != fromLim && *toP != toLim)
00451 *(*toP)++ = *(*fromP)++;
00452 }
00453
/* ASCII encoding descriptors: normal_ scanner functions, ascii_toUtf8 and
   latin1_toUtf16 as converters.  Only asciitab.h is included into the
   byte-type table, so the remaining entries of type[256] are left to default
   (zero) initialisation.
   NOTE(review): which BT_* code has the value zero is defined outside this
   file -- confirm that zero is the intended type for bytes >= 0x80.
   The "_ns" variant exists only when XML_NS is defined; the non-ns variant
   defines BT_COLON as BT_NMSTRT while asciitab.h is included. */
00454 #ifdef XML_NS
00455
00456 static const struct normal_encoding ascii_encoding_ns = {
00457 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00458 {
00459 #include "asciitab.h"
00460
00461 },
00462 STANDARD_VTABLE(sb_)
00463 };
00464
00465 #endif
00466
00467 static const struct normal_encoding ascii_encoding = {
00468 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00469 {
00470 #define BT_COLON BT_NMSTRT
00471 #include "asciitab.h"
00472 #undef BT_COLON
00473
00474 },
00475 STANDARD_VTABLE(sb_)
00476 };
00477
00478 static int unicode_byte_type(char hi, char lo)
00479 {
00480 switch ((unsigned char)hi) {
00481 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
00482 return BT_LEAD4;
00483 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
00484 return BT_TRAIL;
00485 case 0xFF:
00486 switch ((unsigned char)lo) {
00487 case 0xFF:
00488 case 0xFE:
00489 return BT_NONXML;
00490 }
00491 break;
00492 }
00493 return BT_NONASCII;
00494 }
00495
/* DEFINE_UTF16_TO_UTF8(E) defines the static function E##toUtf8, which
   converts 16-bit code units to UTF-8.  GET_LO and GET_HI must be defined
   when the macro is expanded; they select the byte order.

   The generated function walks the input two bytes at a time and switches
   on the high byte of each unit:
     - 0 with low byte < 0x80: one output byte;
     - 0 with low byte >= 0x80 deliberately FALLS THROUGH into the
       0x01..0x07 case: two output bytes;
     - 0xD8..0xDB: combined with the following unit into four output bytes
       (the extra `from += 2` inside the case consumes that second unit);
     - everything else (default): three output bytes.
   Whenever the output lacks room for the next character, *fromP is set to
   the unit that was not converted and the function returns; on normal
   completion *fromP is set to the final position.

   NOTE(review): the loop terminates only when from == fromLim exactly, and
   the 0xD8..0xDB case reads the following unit without comparing against
   fromLim, so the caller must supply an even-length range in which such a
   unit is always followed by another one. */
00496 #define DEFINE_UTF16_TO_UTF8(E) \
00497 static \
00498 void E ## toUtf8(const ENCODING *enc, \
00499 const char **fromP, const char *fromLim, \
00500 char **toP, const char *toLim) \
00501 { \
00502 const char *from; \
00503 for (from = *fromP; from != fromLim; from += 2) { \
00504 int plane; \
00505 unsigned char lo2; \
00506 unsigned char lo = GET_LO(from); \
00507 unsigned char hi = GET_HI(from); \
00508 switch (hi) { \
00509 case 0: \
00510 if (lo < 0x80) { \
00511 if (*toP == toLim) { \
00512 *fromP = from; \
00513 return; \
00514 } \
00515 *(*toP)++ = lo; \
00516 break; \
00517 } \
00518 \
00519 case 0x1: case 0x2: case 0x3: \
00520 case 0x4: case 0x5: case 0x6: case 0x7: \
00521 if (toLim - *toP < 2) { \
00522 *fromP = from; \
00523 return; \
00524 } \
00525 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
00526 *(*toP)++ = ((lo & 0x3f) | 0x80); \
00527 break; \
00528 default: \
00529 if (toLim - *toP < 3) { \
00530 *fromP = from; \
00531 return; \
00532 } \
00533 \
00534 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
00535 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
00536 *(*toP)++ = ((lo & 0x3f) | 0x80); \
00537 break; \
00538 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
00539 if (toLim - *toP < 4) { \
00540 *fromP = from; \
00541 return; \
00542 } \
00543 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
00544 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
00545 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
00546 from += 2; \
00547 lo2 = GET_LO(from); \
00548 *(*toP)++ = (((lo & 0x3) << 4) \
00549 | ((GET_HI(from) & 0x3) << 2) \
00550 | (lo2 >> 6) \
00551 | 0x80); \
00552 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
00553 break; \
00554 } \
00555 } \
00556 *fromP = from; \
00557 }
00558
/* DEFINE_UTF16_TO_UTF16(E) defines the static function E##toUtf16, which
   copies 16-bit code units from a byte buffer into an unsigned short
   buffer, assembling each unit as (GET_HI << 8) | GET_LO.

   Before copying: if the input holds more bytes than twice the number of
   free output units and the high byte of the last input unit has the form
   0xD8..0xDF (mask 0xF8), the last unit is dropped from the range.
   NOTE(review): this check looks at the end of the input, not at the place
   where the copy will actually stop when the output fills up -- presumably
   meant to avoid ending on half of a surrogate pair; confirm the intent.

   Copying stops when the input is exhausted or the output is full; *fromP
   and *toP are advanced as units are copied. */
00559 #define DEFINE_UTF16_TO_UTF16(E) \
00560 static \
00561 void E ## toUtf16(const ENCODING *enc, \
00562 const char **fromP, const char *fromLim, \
00563 unsigned short **toP, const unsigned short *toLim) \
00564 { \
00565 \
00566 if (fromLim - *fromP > ((toLim - *toP) << 1) \
00567 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
00568 fromLim -= 2; \
00569 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
00570 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
00571 }
00572
/* Little-endian byte accessors: the low byte of a unit is stored first.
   SET2 stores the 16-bit value ch at ptr; GET_LO / GET_HI read the low and
   high byte as unsigned char.  With these in effect the two converter
   templates are instantiated as little2_toUtf8 and little2_toUtf16. */
00573 #define SET2(ptr, ch) \
00574 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
00575 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
00576 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
00577
00578 DEFINE_UTF16_TO_UTF8(little2_)
00579 DEFINE_UTF16_TO_UTF16(little2_)
00580
00581 #undef SET2
00582 #undef GET_LO
00583 #undef GET_HI
00584
/* Big-endian byte accessors: the high byte of a unit is stored first.
   With these in effect the templates are instantiated as big2_toUtf8 and
   big2_toUtf16. */
00585 #define SET2(ptr, ch) \
00586 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
00587 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
00588 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
00589
00590 DEFINE_UTF16_TO_UTF8(big2_)
00591 DEFINE_UTF16_TO_UTF16(big2_)
00592
00593 #undef SET2
00594 #undef GET_LO
00595 #undef GET_HI
00596
/* Character classification for little-endian 16-bit units (low byte at
   p[0], high byte at p[1]).
   LITTLE2_BYTE_TYPE: when the high byte is 0 the low byte is looked up in
   the encoding's type[] table; otherwise unicode_byte_type() decides.
   LITTLE2_BYTE_TO_ASCII: the low byte when the high byte is 0, else -1.
   LITTLE2_CHAR_MATCHES: true when the unit equals c (high byte 0, low byte
   c).
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC: naming-bitmap
   lookups in namePages / nmstrtPages; p is not parenthesized in these two,
   so pass a simple pointer expression. */
00597 #define LITTLE2_BYTE_TYPE(enc, p) \
00598 ((p)[1] == 0 \
00599 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
00600 : unicode_byte_type((p)[1], (p)[0]))
00601 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
00602 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
00603 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
00604 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
00605 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
00606 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
00607
/* Little-endian 16-bit tokenizer, in one of two forms.

   XML_MIN_SIZE: no second copy of the tokenizer is generated.  The five
   hooks below wrap the LITTLE2_* macros as functions for the hook members
   of struct normal_encoding, and VTABLE is redefined to VTABLE1 (still
   expanded with the current PREFIX) plus the little2_ converters. */
00608 #ifdef XML_MIN_SIZE
00609
00610 static
00611 int little2_byteType(const ENCODING *enc, const char *p)
00612 {
00613 return LITTLE2_BYTE_TYPE(enc, p);
00614 }
00615
00616 static
00617 int little2_byteToAscii(const ENCODING *enc, const char *p)
00618 {
00619 return LITTLE2_BYTE_TO_ASCII(enc, p);
00620 }
00621
00622 static
00623 int little2_charMatches(const ENCODING *enc, const char *p, int c)
00624 {
00625 return LITTLE2_CHAR_MATCHES(enc, p, c);
00626 }
00627
00628 static
00629 int little2_isNameMin(const ENCODING *enc, const char *p)
00630 {
00631 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
00632 }
00633
00634 static
00635 int little2_isNmstrtMin(const ENCODING *enc, const char *p)
00636 {
00637 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00638 }
00639
00640 #undef VTABLE
00641 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
00642
00643 #else
00644
/* Otherwise: xmltok_impl.c is included a second time with PREFIX set to
   little2_, MINBPC fixed at 2 and the classification macros bound to the
   LITTLE2_* versions.  The n-byte predicates IS_NAME_CHAR / IS_NMSTRT_CHAR
   are the constant 0 in this instantiation.  IS_INVALID_CHAR is not defined
   in this branch; the #undef below is harmless for an undefined name. */
00645 #undef PREFIX
00646 #define PREFIX(ident) little2_ ## ident
00647 #define MINBPC(enc) 2
00648
00649 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
00650 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
00651 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
00652 #define IS_NAME_CHAR(enc, p, n) 0
00653 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
00654 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00655 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00656
00657 #include "xmltok_impl.c"
00658
00659 #undef MINBPC
00660 #undef BYTE_TYPE
00661 #undef BYTE_TO_ASCII
00662 #undef CHAR_MATCHES
00663 #undef IS_NAME_CHAR
00664 #undef IS_NAME_CHAR_MINBPC
00665 #undef IS_NMSTRT_CHAR
00666 #undef IS_NMSTRT_CHAR_MINBPC
00667 #undef IS_INVALID_CHAR
00668
00669 #endif
00670
/* Little-endian 16-bit encoding descriptors.  The byte-type table (ASCII
   half plus latin1tab.h) is consulted only for units whose high byte is 0
   (see LITTLE2_BYTE_TYPE).  The last integer of the ENCODING initializer is
   1 when XML_BYTE_ORDER == 12 and 0 otherwise.
   NOTE(review): the meaning of the three trailing ENCODING integers is
   declared outside this file; the first is presumably minBytesPerChar and
   the last presumably flags "matches the host byte order" -- confirm.
   The "_ns" variants exist only when XML_NS is defined; the non-ns variants
   define BT_COLON as BT_NMSTRT while the ASCII table header is included. */
00671 #ifdef XML_NS
00672
00673 static const struct normal_encoding little2_encoding_ns = {
00674 { VTABLE, 2, 0,
00675 #if XML_BYTE_ORDER == 12
00676 1
00677 #else
00678 0
00679 #endif
00680 },
00681 {
00682 #include "asciitab.h"
00683 #include "latin1tab.h"
00684 },
00685 STANDARD_VTABLE(little2_)
00686 };
00687
00688 #endif
00689
00690 static const struct normal_encoding little2_encoding = {
00691 { VTABLE, 2, 0,
00692 #if XML_BYTE_ORDER == 12
00693 1
00694 #else
00695 0
00696 #endif
00697 },
00698 {
00699 #define BT_COLON BT_NMSTRT
00700 #include "asciitab.h"
00701 #undef BT_COLON
00702 #include "latin1tab.h"
00703 },
00704 STANDARD_VTABLE(little2_)
00705 };
00706
/* The "internal_" little-endian descriptors are compiled unless
   XML_BYTE_ORDER is 21.  They take the ASCII half of the table from
   iasciitab.h and always have 1 as the last ENCODING integer. */
00707 #if XML_BYTE_ORDER != 21
00708
00709 #ifdef XML_NS
00710
00711 static const struct normal_encoding internal_little2_encoding_ns = {
00712 { VTABLE, 2, 0, 1 },
00713 {
00714 #include "iasciitab.h"
00715 #include "latin1tab.h"
00716 },
00717 STANDARD_VTABLE(little2_)
00718 };
00719
00720 #endif
00721
00722 static const struct normal_encoding internal_little2_encoding = {
00723 { VTABLE, 2, 0, 1 },
00724 {
00725 #define BT_COLON BT_NMSTRT
00726 #include "iasciitab.h"
00727 #undef BT_COLON
00728 #include "latin1tab.h"
00729 },
00730 STANDARD_VTABLE(little2_)
00731 };
00732
00733 #endif
00734
00735
/* Character classification for big-endian 16-bit units (high byte at p[0],
   low byte at p[1]).
   BIG2_BYTE_TYPE: when the high byte is 0 the low byte is looked up in the
   encoding's type[] table; otherwise unicode_byte_type() decides.
   BIG2_BYTE_TO_ASCII: the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES: true when the unit equals c (high byte 0, low byte c).
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC: naming-bitmap
   lookups in namePages / nmstrtPages; p is not parenthesized in these two,
   so pass a simple pointer expression. */
00736 #define BIG2_BYTE_TYPE(enc, p) \
00737 ((p)[0] == 0 \
00738 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
00739 : unicode_byte_type((p)[0], (p)[1]))
00740 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
00741 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
00742 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
00743 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
00744 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
00745 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
00746
/* Big-endian 16-bit tokenizer, in one of two forms.

   XML_MIN_SIZE: no further copy of the tokenizer is generated.  The five
   hooks below wrap the BIG2_* macros as functions for the hook members of
   struct normal_encoding, and VTABLE is redefined to VTABLE1 (still
   expanded with the current PREFIX) plus the big2_ converters. */
00747 #ifdef XML_MIN_SIZE
00748
00749 static
00750 int big2_byteType(const ENCODING *enc, const char *p)
00751 {
00752 return BIG2_BYTE_TYPE(enc, p);
00753 }
00754
00755 static
00756 int big2_byteToAscii(const ENCODING *enc, const char *p)
00757 {
00758 return BIG2_BYTE_TO_ASCII(enc, p);
00759 }
00760
00761 static
00762 int big2_charMatches(const ENCODING *enc, const char *p, int c)
00763 {
00764 return BIG2_CHAR_MATCHES(enc, p, c);
00765 }
00766
00767 static
00768 int big2_isNameMin(const ENCODING *enc, const char *p)
00769 {
00770 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
00771 }
00772
00773 static
00774 int big2_isNmstrtMin(const ENCODING *enc, const char *p)
00775 {
00776 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00777 }
00778
00779 #undef VTABLE
00780 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
00781
00782 #else
00783
/* Otherwise: xmltok_impl.c is included again with PREFIX set to big2_,
   MINBPC fixed at 2 and the classification macros bound to the BIG2_*
   versions.  The n-byte predicates IS_NAME_CHAR / IS_NMSTRT_CHAR are the
   constant 0 in this instantiation.  IS_INVALID_CHAR is not defined in this
   branch; the #undef below is harmless for an undefined name. */
00784 #undef PREFIX
00785 #define PREFIX(ident) big2_ ## ident
00786 #define MINBPC(enc) 2
00787
00788 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
00789 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
00790 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
00791 #define IS_NAME_CHAR(enc, p, n) 0
00792 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
00793 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00794 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00795
00796 #include "xmltok_impl.c"
00797
00798 #undef MINBPC
00799 #undef BYTE_TYPE
00800 #undef BYTE_TO_ASCII
00801 #undef CHAR_MATCHES
00802 #undef IS_NAME_CHAR
00803 #undef IS_NAME_CHAR_MINBPC
00804 #undef IS_NMSTRT_CHAR
00805 #undef IS_NMSTRT_CHAR_MINBPC
00806 #undef IS_INVALID_CHAR
00807
00808 #endif
00809
/* Big-endian 16-bit encoding descriptors.  The byte-type table (ASCII half
   plus latin1tab.h) is consulted only for units whose high byte is 0 (see
   BIG2_BYTE_TYPE).  The last integer of the ENCODING initializer is 1 when
   XML_BYTE_ORDER == 21 and 0 otherwise.
   NOTE(review): the meaning of the three trailing ENCODING integers is
   declared outside this file; the first is presumably minBytesPerChar and
   the last presumably flags "matches the host byte order" -- confirm.
   The "_ns" variants exist only when XML_NS is defined; the non-ns variants
   define BT_COLON as BT_NMSTRT while the ASCII table header is included. */
00810 #ifdef XML_NS
00811
00812 static const struct normal_encoding big2_encoding_ns = {
00813 { VTABLE, 2, 0,
00814 #if XML_BYTE_ORDER == 21
00815 1
00816 #else
00817 0
00818 #endif
00819 },
00820 {
00821 #include "asciitab.h"
00822 #include "latin1tab.h"
00823 },
00824 STANDARD_VTABLE(big2_)
00825 };
00826
00827 #endif
00828
00829 static const struct normal_encoding big2_encoding = {
00830 { VTABLE, 2, 0,
00831 #if XML_BYTE_ORDER == 21
00832 1
00833 #else
00834 0
00835 #endif
00836 },
00837 {
00838 #define BT_COLON BT_NMSTRT
00839 #include "asciitab.h"
00840 #undef BT_COLON
00841 #include "latin1tab.h"
00842 },
00843 STANDARD_VTABLE(big2_)
00844 };
00845
/* The "internal_" big-endian descriptors are compiled unless XML_BYTE_ORDER
   is 12.  They take the ASCII half of the table from iasciitab.h and always
   have 1 as the last ENCODING integer. */
00846 #if XML_BYTE_ORDER != 12
00847
00848 #ifdef XML_NS
00849
00850 static const struct normal_encoding internal_big2_encoding_ns = {
00851 { VTABLE, 2, 0, 1 },
00852 {
00853 #include "iasciitab.h"
00854 #include "latin1tab.h"
00855 },
00856 STANDARD_VTABLE(big2_)
00857 };
00858
00859 #endif
00860
00861 static const struct normal_encoding internal_big2_encoding = {
00862 { VTABLE, 2, 0, 1 },
00863 {
00864 #define BT_COLON BT_NMSTRT
00865 #include "iasciitab.h"
00866 #undef BT_COLON
00867 #include "latin1tab.h"
00868 },
00869 STANDARD_VTABLE(big2_)
00870 };
00871
00872 #endif
00873
/* All tokenizer instantiations are done; PREFIX is no longer needed. */
00874 #undef PREFIX
00875
/* Compare two NUL-terminated strings, ignoring the case of the ASCII
   letters a-z only.  Returns 1 when they are equal, 0 otherwise. */
static
int streqci(const char *s1, const char *s2)
{
  char a;
  char b;
  do {
    a = *s1++;
    b = *s2++;
    /* Fold lower-case ASCII to upper-case before comparing. */
    if (a >= 'a' && a <= 'z')
      a = (char)(a + ('A' - 'a'));
    if (b >= 'a' && b <= 'z')
      b = (char)(b + ('A' - 'a'));
    if (a != b)
      return 0;
  } while (a != '\0');
  return 1;
}
00893
00894 static
00895 void initUpdatePosition(const ENCODING *enc, const char *ptr,
00896 const char *end, POSITION *pos)
00897 {
00898 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
00899 }
00900
00901 static
00902 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
00903 {
00904 char buf[1];
00905 char *p = buf;
00906 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
00907 if (p == buf)
00908 return -1;
00909 else
00910 return buf[0];
00911 }
00912
/* Returns 1 when c is one of the four XML white-space characters
   (space, carriage return, line feed, tab), 0 otherwise. */
static
int isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
00925
00926
00927
/* Parse one pseudo-attribute of the form  S name S? '=' S? quote value quote
   from [ptr, end).

   Returns 1 on success and 0 on a syntax error.
   - On success with an attribute: *namePtr points at the name, *valPtr at
     the first character of the value and *nextTokPtr just past the closing
     quote.
   - On success with nothing left (ptr == end initially, or only white space
     remained): *namePtr is set to 0; *valPtr and *nextTokPtr are not
     written.
   - On failure: *nextTokPtr points at the offending position.

   Characters are examined through toAscii(), which yields -1 at `end` and
   for characters that do not convert to a single byte; the cursor always
   advances by enc->minBytesPerChar. */
00928 static
00929 int parsePseudoAttribute(const ENCODING *enc,
00930 const char *ptr,
00931 const char *end,
00932 const char **namePtr,
00933 const char **valPtr,
00934 const char **nextTokPtr)
00935 {
00936 int c;
00937 char open;
00938 if (ptr == end) {
00939 *namePtr = 0;
00940 return 1;
00941 }
/* An attribute must be preceded by at least one white-space character. */
00942 if (!isSpace(toAscii(enc, ptr, end))) {
00943 *nextTokPtr = ptr;
00944 return 0;
00945 }
00946 do {
00947 ptr += enc->minBytesPerChar;
00948 } while (isSpace(toAscii(enc, ptr, end)));
/* Only trailing white space was left: report "no attribute". */
00949 if (ptr == end) {
00950 *namePtr = 0;
00951 return 1;
00952 }
00953 *namePtr = ptr;
/* Scan the name up to '=' (optionally preceded by white space). */
00954 for (;;) {
00955 c = toAscii(enc, ptr, end);
00956 if (c == -1) {
00957 *nextTokPtr = ptr;
00958 return 0;
00959 }
00960 if (c == '=')
00961 break;
00962 if (isSpace(c)) {
00963 do {
00964 ptr += enc->minBytesPerChar;
00965 } while (isSpace(c = toAscii(enc, ptr, end)));
00966 if (c != '=') {
00967 *nextTokPtr = ptr;
00968 return 0;
00969 }
00970 break;
00971 }
00972 ptr += enc->minBytesPerChar;
00973 }
/* An empty name is an error.  (When white space preceded the '=', ptr has
   already moved past the name start, so this test fires only for a name
   that begins directly with '='.) */
00974 if (ptr == *namePtr) {
00975 *nextTokPtr = ptr;
00976 return 0;
00977 }
/* Skip the '=' and any white space before the opening quote. */
00978 ptr += enc->minBytesPerChar;
00979 c = toAscii(enc, ptr, end);
00980 while (isSpace(c)) {
00981 ptr += enc->minBytesPerChar;
00982 c = toAscii(enc, ptr, end);
00983 }
00984 if (c != '"' && c != '\'') {
00985 *nextTokPtr = ptr;
00986 return 0;
00987 }
00988 open = c;
00989 ptr += enc->minBytesPerChar;
00990 *valPtr = ptr;
/* The value runs to the matching quote and may contain only ASCII letters,
   digits, '.', '-' and '_'; anything else (including reaching `end`, where
   c is -1) is an error. */
00991 for (;; ptr += enc->minBytesPerChar) {
00992 c = toAscii(enc, ptr, end);
00993 if (c == open)
00994 break;
00995 if (!('a' <= c && c <= 'z')
00996 && !('A' <= c && c <= 'Z')
00997 && !('0' <= c && c <= '9')
00998 && c != '.'
00999 && c != '-'
01000 && c != '_') {
01001 *nextTokPtr = ptr;
01002 return 0;
01003 }
01004 }
01005 *nextTokPtr = ptr + enc->minBytesPerChar;
01006 return 1;
01007 }
01008
/* Parse the pseudo-attributes of an XML declaration or text declaration.

   encodingFinder       callback that maps an encoding name, given as the
                        range [val, closing quote), to an ENCODING.
   isGeneralTextEntity  nonzero for a text declaration: "version" becomes
                        optional, "encoding" mandatory and "standalone"
                        forbidden.
   enc                  encoding of the declaration text.
   ptr, end             the declaration.  Five characters are skipped at the
                        front and two dropped at the back -- presumably the
                        "<?xml" and "?>" delimiters; confirm with the caller.
   badPtr               out: position of the error when 0 is returned.
   versionPtr, encodingName, encoding, standalone
                        optional outputs (each may be null); written only
                        when the corresponding pseudo-attribute is present.

   Returns 1 on success, 0 on error. */
01009 static
01010 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
01011 const char *,
01012 const char *),
01013 int isGeneralTextEntity,
01014 const ENCODING *enc,
01015 const char *ptr,
01016 const char *end,
01017 const char **badPtr,
01018 const char **versionPtr,
01019 const char **encodingName,
01020 const ENCODING **encoding,
01021 int *standalone)
01022 {
01023 const char *val = 0;
01024 const char *name = 0;
01025 ptr += 5 * enc->minBytesPerChar;
01026 end -= 2 * enc->minBytesPerChar;
/* At least one pseudo-attribute is required. */
01027 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
01028 *badPtr = ptr;
01029 return 0;
01030 }
/* First attribute is not "version": tolerated only in a text declaration,
   in which case the same attribute is examined again below. */
01031 if (!XmlNameMatchesAscii(enc, name, "version")) {
01032 if (!isGeneralTextEntity) {
01033 *badPtr = name;
01034 return 0;
01035 }
01036 }
01037 else {
01038 if (versionPtr)
01039 *versionPtr = val;
01040 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
01041 *badPtr = ptr;
01042 return 0;
01043 }
/* Nothing after "version": fine for an XML declaration, an error for a
   text declaration. */
01044 if (!name) {
01045 if (isGeneralTextEntity) {
01046
01047 *badPtr = ptr;
01048 return 0;
01049 }
01050 return 1;
01051 }
01052 }
01053 if (XmlNameMatchesAscii(enc, name, "encoding")) {
/* The encoding name must start with an ASCII letter. */
01054 int c = toAscii(enc, val, end);
01055 if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
01056 *badPtr = val;
01057 return 0;
01058 }
01059 if (encodingName)
01060 *encodingName = val;
/* ptr is just past the closing quote, so ptr - minBytesPerChar is the
   closing quote itself, i.e. the end of the value. */
01061 if (encoding)
01062 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
01063 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
01064 *badPtr = ptr;
01065 return 0;
01066 }
01067 if (!name)
01068 return 1;
01069 }
/* Whatever is left must be "standalone", and only in an XML declaration. */
01070 if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
01071 *badPtr = name;
01072 return 0;
01073 }
01074 if (XmlNameMatchesAscii(enc, val, "yes")) {
01075 if (standalone)
01076 *standalone = 1;
01077 }
01078 else if (XmlNameMatchesAscii(enc, val, "no")) {
01079 if (standalone)
01080 *standalone = 0;
01081 }
01082 else {
01083 *badPtr = val;
01084 return 0;
01085 }
/* Only white space may follow the standalone attribute. */
01086 while (isSpace(toAscii(enc, ptr, end)))
01087 ptr += enc->minBytesPerChar;
01088 if (ptr != end) {
01089 *badPtr = ptr;
01090 return 0;
01091 }
01092 return 1;
01093 }
01094
01095 static
01096 int checkCharRefNumber(int result)
01097 {
01098 switch (result >> 8) {
01099 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
01100 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
01101 return -1;
01102 case 0:
01103 if (latin1_encoding.type[result] == BT_NONXML)
01104 return -1;
01105 break;
01106 case 0xFF:
01107 if (result == 0xFFFE || result == 0xFFFF)
01108 return -1;
01109 break;
01110 }
01111 return result;
01112 }
01113
01114 int XmlUtf8Encode(int c, char *buf)
01115 {
01116 enum {
01117
01118 min2 = 0x80,
01119 min3 = 0x800,
01120 min4 = 0x10000
01121 };
01122
01123 if (c < 0)
01124 return 0;
01125 if (c < min2) {
01126 buf[0] = (c | UTF8_cval1);
01127 return 1;
01128 }
01129 if (c < min3) {
01130 buf[0] = ((c >> 6) | UTF8_cval2);
01131 buf[1] = ((c & 0x3f) | 0x80);
01132 return 2;
01133 }
01134 if (c < min4) {
01135 buf[0] = ((c >> 12) | UTF8_cval3);
01136 buf[1] = (((c >> 6) & 0x3f) | 0x80);
01137 buf[2] = ((c & 0x3f) | 0x80);
01138 return 3;
01139 }
01140 if (c < 0x110000) {
01141 buf[0] = ((c >> 18) | UTF8_cval4);
01142 buf[1] = (((c >> 12) & 0x3f) | 0x80);
01143 buf[2] = (((c >> 6) & 0x3f) | 0x80);
01144 buf[3] = ((c & 0x3f) | 0x80);
01145 return 4;
01146 }
01147 return 0;
01148 }
01149
/* Encode the character number charNum as UTF-16 into buf.
   Returns 1 for a single unit, 2 for a surrogate pair, or 0 when charNum is
   negative or not below 0x110000 (buf is then left untouched).  buf must
   have room for two units. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;

  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum < 0x10000) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }

  /* Split the 20-bit offset above 0x10000 into two 10-bit halves. */
  offset = charNum - 0x10000;
  buf[0] = (unsigned short)(0xD800 + (offset >> 10));
  buf[1] = (unsigned short)(0xDC00 + (offset & 0x3FF));
  return 2;
}
01166
/* Descriptor for a caller-supplied ("unknown") single/multi-byte encoding.
   It starts with a struct normal_encoding so that an ENCODING pointer can
   be cast to either structure. */
01167 struct unknown_encoding {
01168 struct normal_encoding normal;
/* Callback that returns the character number of the multi-byte sequence
   starting at p; it receives userData as its first argument. */
01169 int (*convert)(void *userData, const char *p);
01170 void *userData;
/* Per lead byte: the UTF-16 unit for that byte.  A zero entry makes
   unknown_toUtf16 call convert instead. */
01171 unsigned short utf16[256];
/* Per lead byte: utf8[b][0] is the length of the UTF-8 form stored in the
   following bytes.  A zero length makes unknown_toUtf8 call convert
   instead. */
01172 char utf8[256][4];
01173 };
01174
01175 int XmlSizeOfUnknownEncoding()
01176 {
01177 return sizeof(struct unknown_encoding);
01178 }
01179
01180 static
01181 int unknown_isName(const ENCODING *enc, const char *p)
01182 {
01183 int c = ((const struct unknown_encoding *)enc)
01184 ->convert(((const struct unknown_encoding *)enc)->userData, p);
01185 if (c & ~0xFFFF)
01186 return 0;
01187 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
01188 }
01189
01190 static
01191 int unknown_isNmstrt(const ENCODING *enc, const char *p)
01192 {
01193 int c = ((const struct unknown_encoding *)enc)
01194 ->convert(((const struct unknown_encoding *)enc)->userData, p);
01195 if (c & ~0xFFFF)
01196 return 0;
01197 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
01198 }
01199
01200 static
01201 int unknown_isInvalid(const ENCODING *enc, const char *p)
01202 {
01203 int c = ((const struct unknown_encoding *)enc)
01204 ->convert(((const struct unknown_encoding *)enc)->userData, p);
01205 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
01206 }
01207
01208 static
01209 void unknown_toUtf8(const ENCODING *enc,
01210 const char **fromP, const char *fromLim,
01211 char **toP, const char *toLim)
01212 {
01213 char buf[XML_UTF8_ENCODE_MAX];
01214 for (;;) {
01215 const char *utf8;
01216 int n;
01217 if (*fromP == fromLim)
01218 break;
01219 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
01220 n = *utf8++;
01221 if (n == 0) {
01222 int c = ((const struct unknown_encoding *)enc)
01223 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
01224 n = XmlUtf8Encode(c, buf);
01225 if (n > toLim - *toP)
01226 break;
01227 utf8 = buf;
01228 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
01229 - (BT_LEAD2 - 2);
01230 }
01231 else {
01232 if (n > toLim - *toP)
01233 break;
01234 (*fromP)++;
01235 }
01236 do {
01237 *(*toP)++ = *utf8++;
01238 } while (--n != 0);
01239 }
01240 }
01241
01242 static
01243 void unknown_toUtf16(const ENCODING *enc,
01244 const char **fromP, const char *fromLim,
01245 unsigned short **toP, const unsigned short *toLim)
01246 {
01247 while (*fromP != fromLim && *toP != toLim) {
01248 unsigned short c
01249 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
01250 if (c == 0) {
01251 c = (unsigned short)((const struct unknown_encoding *)enc)
01252 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
01253 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
01254 - (BT_LEAD2 - 2);
01255 }
01256 else
01257 (*fromP)++;
01258 *(*toP)++ = c;
01259 }
01260 }
01261
01262 ENCODING *
01263 XmlInitUnknownEncoding(void *mem,
01264 int *table,
01265 int (*convert)(void *userData, const char *p),
01266 void *userData)
01267 {
01268 int i;
01269 struct unknown_encoding *e = mem;
01270 for (i = 0; i < sizeof(struct normal_encoding); i++)
01271 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
01272 for (i = 0; i < 128; i++)
01273 if (latin1_encoding.type[i] != BT_OTHER
01274 && latin1_encoding.type[i] != BT_NONXML
01275 && table[i] != i)
01276 return 0;
01277 for (i = 0; i < 256; i++) {
01278 int c = table[i];
01279 if (c == -1) {
01280 e->normal.type[i] = BT_MALFORM;
01281
01282 e->utf16[i] = 0xFFFF;
01283 e->utf8[i][0] = 1;
01284 e->utf8[i][1] = 0;
01285 }
01286 else if (c < 0) {
01287 if (c < -4)
01288 return 0;
01289 e->normal.type[i] = BT_LEAD2 - (c + 2);
01290 e->utf8[i][0] = 0;
01291 e->utf16[i] = 0;
01292 }
01293 else if (c < 0x80) {
01294 if (latin1_encoding.type[c] != BT_OTHER
01295 && latin1_encoding.type[c] != BT_NONXML
01296 && c != i)
01297 return 0;
01298 e->normal.type[i] = latin1_encoding.type[c];
01299 e->utf8[i][0] = 1;
01300 e->utf8[i][1] = (char)c;
01301 e->utf16[i] = c == 0 ? 0xFFFF : c;
01302 }
01303 else if (checkCharRefNumber(c) < 0) {
01304 e->normal.type[i] = BT_NONXML;
01305
01306 e->utf16[i] = 0xFFFF;
01307 e->utf8[i][0] = 1;
01308 e->utf8[i][1] = 0;
01309 }
01310 else {
01311 if (c > 0xFFFF)
01312 return 0;
01313 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
01314 e->normal.type[i] = BT_NMSTRT;
01315 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
01316 e->normal.type[i] = BT_NAME;
01317 else
01318 e->normal.type[i] = BT_OTHER;
01319 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
01320 e->utf16[i] = c;
01321 }
01322 }
01323 e->userData = userData;
01324 e->convert = convert;
01325 if (convert) {
01326 e->normal.isName2 = unknown_isName;
01327 e->normal.isName3 = unknown_isName;
01328 e->normal.isName4 = unknown_isName;
01329 e->normal.isNmstrt2 = unknown_isNmstrt;
01330 e->normal.isNmstrt3 = unknown_isNmstrt;
01331 e->normal.isNmstrt4 = unknown_isNmstrt;
01332 e->normal.isInvalid2 = unknown_isInvalid;
01333 e->normal.isInvalid3 = unknown_isInvalid;
01334 e->normal.isInvalid4 = unknown_isInvalid;
01335 }
01336 e->normal.enc.utf8Convert = unknown_toUtf8;
01337 e->normal.enc.utf16Convert = unknown_toUtf16;
01338 return &(e->normal.enc);
01339 }
01340
01341
01342
/* Indices identifying the built-in encodings.  getEncodingIndex returns
   the position of a name in its encodingNames table as one of these
   values, so the non-negative entries below NO_ENC must stay in the same
   order as that table. */
enum {
  UNKNOWN_ENC    = -1,  /* name given but not recognised */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  NO_ENC         = 6    /* no name given at all */
};
01354
01355 static
01356 int getEncodingIndex(const char *name)
01357 {
01358 static const char *encodingNames[] = {
01359 "ISO-8859-1",
01360 "US-ASCII",
01361 "UTF-8",
01362 "UTF-16",
01363 "UTF-16BE"
01364 "UTF-16LE",
01365 };
01366 int i;
01367 if (name == 0)
01368 return NO_ENC;
01369 for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
01370 if (streqci(name, encodingNames[i]))
01371 return i;
01372 return UNKNOWN_ENC;
01373 }
01374
01375
01376
01377
/* Yields the encoding index recorded in an INIT_ENCODING.  The value is
   read from the initEnc.isUtf16 field; initScan compares it against the
   *_ENC constants and uses it to index its encoding table. */
#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
01379
01380
01381
01382
01383
01384
01385
01386
01387
01388 static
01389 int initScan(const ENCODING **encodingTable,
01390 const INIT_ENCODING *enc,
01391 int state,
01392 const char *ptr,
01393 const char *end,
01394 const char **nextTokPtr)
01395 {
01396 const ENCODING **encPtr;
01397
01398 if (ptr == end)
01399 return XML_TOK_NONE;
01400 encPtr = enc->encPtr;
01401 if (ptr + 1 == end) {
01402
01403
01404 if (state != XML_CONTENT_STATE)
01405 return XML_TOK_PARTIAL;
01406
01407
01408 switch (INIT_ENC_INDEX(enc)) {
01409 case UTF_16_ENC:
01410 case UTF_16LE_ENC:
01411 case UTF_16BE_ENC:
01412 return XML_TOK_PARTIAL;
01413 }
01414 switch ((unsigned char)*ptr) {
01415 case 0xFE:
01416 case 0xFF:
01417 case 0xEF:
01418 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01419 && state == XML_CONTENT_STATE)
01420 break;
01421
01422 case 0x00:
01423 case 0x3C:
01424 return XML_TOK_PARTIAL;
01425 }
01426 }
01427 else {
01428 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
01429 case 0xFEFF:
01430 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01431 && state == XML_CONTENT_STATE)
01432 break;
01433 *nextTokPtr = ptr + 2;
01434 *encPtr = encodingTable[UTF_16BE_ENC];
01435 return XML_TOK_BOM;
01436
01437 case 0x3C00:
01438 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
01439 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
01440 && state == XML_CONTENT_STATE)
01441 break;
01442 *encPtr = encodingTable[UTF_16LE_ENC];
01443 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01444 case 0xFFFE:
01445 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01446 && state == XML_CONTENT_STATE)
01447 break;
01448 *nextTokPtr = ptr + 2;
01449 *encPtr = encodingTable[UTF_16LE_ENC];
01450 return XML_TOK_BOM;
01451 case 0xEFBB:
01452
01453
01454
01455
01456
01457
01458 if (state == XML_CONTENT_STATE) {
01459 int e = INIT_ENC_INDEX(enc);
01460 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
01461 break;
01462 }
01463 if (ptr + 2 == end)
01464 return XML_TOK_PARTIAL;
01465 if ((unsigned char)ptr[2] == 0xBF) {
01466 *encPtr = encodingTable[UTF_8_ENC];
01467 return XML_TOK_BOM;
01468 }
01469 break;
01470 default:
01471 if (ptr[0] == '\0') {
01472
01473
01474
01475
01476 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
01477 break;
01478 *encPtr = encodingTable[UTF_16BE_ENC];
01479 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01480 }
01481 else if (ptr[1] == '\0') {
01482
01483
01484
01485
01486
01487
01488
01489
01490 if (state == XML_CONTENT_STATE)
01491 break;
01492 *encPtr = encodingTable[UTF_16LE_ENC];
01493 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01494 }
01495 break;
01496 }
01497 }
01498 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
01499 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01500 }
01501
01502
/* First expansion of xmltok_ns.c: NS() and ns() are identity macros here,
   so any name wrapped in them is left unchanged.
   NOTE(review): the contents of xmltok_ns.c are not visible from this
   file - presumably it defines the plain (non-namespace) entry points. */
#define NS(x) x
#define ns(x) x
#include "xmltok_ns.c"
#undef NS
#undef ns
01508
01509 #ifdef XML_NS
01510
/* Second expansion of xmltok_ns.c, compiled only when XML_NS is defined:
   NS() appends "NS" and ns() appends "_ns" to the wrapped name, so the
   same source yields a second, differently named set of identifiers.
   NOTE(review): presumably these are the namespace-aware variants -
   confirm against xmltok_ns.c. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns
01518
01519 ENCODING *
01520 XmlInitUnknownEncodingNS(void *mem,
01521 int *table,
01522 int (*convert)(void *userData, const char *p),
01523 void *userData)
01524 {
01525 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
01526 if (enc)
01527 ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
01528 return enc;
01529 }
01530
01531 #endif