BOSS_7.0.2: /home/bes3soft/bes3soft/Boss/7.0.2/dist/7.0.2/Calibration/xmlBase/xmlBase-00-00-03/expat/xmltok.c Source File

00001 /*
00002 The contents of this file are subject to the Mozilla Public License
00003 Version 1.1 (the "License"); you may not use this file except in
00004 compliance with the License. You may obtain a copy of the License at
00005 http://www.mozilla.org/MPL/
00006 
00007 Software distributed under the License is distributed on an "AS IS"
00008 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
00009 License for the specific language governing rights and limitations
00010 under the License.
00011 
00012 The Original Code is expat.
00013 
00014 The Initial Developer of the Original Code is James Clark.
00015 Portions created by James Clark are Copyright (C) 1998, 1999
00016 James Clark. All Rights Reserved.
00017 
00018 Contributor(s):
00019 
00020 Alternatively, the contents of this file may be used under the terms
00021 of the GNU General Public License (the "GPL"), in which case the
00022 provisions of the GPL are applicable instead of those above.  If you
00023 wish to allow use of your version of this file only under the terms of
00024 the GPL and not to allow others to use your version of this file under
00025 the MPL, indicate your decision by deleting the provisions above and
00026 replace them with the notice and other provisions required by the
00027 GPL. If you do not delete the provisions above, a recipient may use
00028 your version of this file under either the MPL or the GPL.
00029 */
00030 
00031 #ifdef HAVE_CONFIG_H
00032 #include "config.h"
00033 #endif
00034 
00035 #include "xmldef.h"
00036 #include "xmltok.h"
00037 #include "nametab.h"
00038 
/* Initializer fragment listing the tokenizer entry points.  Every entry is
spelled through PREFIX(), so the expansion binds to whichever family
(normal_, little2_ or big2_) PREFIX names at the point of use. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the two conversion routines of the same PREFIX
family. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
00053 
/* Test a 16-bit character, given as high byte hi and low byte lo, against
the naming bitmap.  pages[hi] selects a group of eight bitmap words, the top
3 bits of lo select the word and the bottom 5 bits select the bit.
The mask is built from an unsigned 1: (lo & 0x1F) can be 31, and shifting a
signed 1 into the sign bit is undefined behaviour. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
00056 
/* A 2 byte UTF-8 representation splits the character's 11 bits
between the bottom 5 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask.
The mask uses an unsigned 1 because the shift count can be 31, and
shifting a signed 1 into the sign bit is undefined behaviour.
byte must point at unsigned bytes; it is evaluated several times. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
00066 
/* A 3 byte UTF-8 representation splits the character's 16 bits
between the bottom 4, 6 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask.
The mask uses an unsigned 1 because the shift count can be 31, and
shifting a signed 1 into the sign bit is undefined behaviour.
byte must point at unsigned bytes; it is evaluated several times. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))
00078 
/* Dispatch on the sequence length n: 2- and 3-byte sequences are looked up
in the naming bitmap, any other length yields 0.  p is reinterpreted as a
pointer to unsigned bytes before the lookup. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
00085 
/* Non-zero when the 3 bytes at p (a pointer to unsigned bytes whose first
byte is a 3-byte lead) do not encode an acceptable character:
 - either trailing byte is not a continuation byte (10xxxxxx),
 - E0 followed by a byte below A0 (overlong form of a shorter sequence),
 - ED followed by a byte above 9F (surrogates U+D800..U+DFFF),
 - EF BF BE / EF BF BF (U+FFFE and U+FFFF).
All 3 bytes are read, so the caller must have 3 bytes available.
p is evaluated several times. */
#define UTF8_INVALID3(p) \
  ((((p)[1]) & 0xC0) != 0x80 \
   || (((p)[2]) & 0xC0) != 0x80 \
   || ((p)[0] == 0xE0 && (p)[1] < 0xA0) \
   || ((p)[0] == 0xED && (p)[1] > 0x9F) \
   || ((p)[0] == 0xEF && (p)[1] == 0xBF && (p)[2] > 0xBD))
00092 
/* Non-zero when the 4 bytes at p (a pointer to unsigned bytes whose first
byte is a 4-byte lead) do not encode an acceptable character:
 - any trailing byte is not a continuation byte (10xxxxxx),
 - F0 followed by a byte below 90 (overlong form of a shorter sequence),
 - F4 followed by a byte above 8F (beyond U+10FFFF).
All 4 bytes are read, so the caller must have 4 bytes available.
p is evaluated several times. */
#define UTF8_INVALID4(p) \
  ((((p)[1]) & 0xC0) != 0x80 \
   || (((p)[2]) & 0xC0) != 0x80 \
   || (((p)[3]) & 0xC0) != 0x80 \
   || ((p)[0] == 0xF0 && (p)[1] < 0x90) \
   || ((p)[0] == 0xF4 && (p)[1] > 0x8F))
00094 
00095 static
00096 int isNever(const ENCODING *enc, const char *p)
00097 {
00098   return 0;
00099 }
00100 
00101 static
00102 int utf8_isName2(const ENCODING *enc, const char *p)
00103 {
00104   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
00105 }
00106 
00107 static
00108 int utf8_isName3(const ENCODING *enc, const char *p)
00109 {
00110   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
00111 }
00112 
/* Fills the isName4 slot of NORMAL_VTABLE(utf8_): no 4-byte sequence is
reported as a name character. */
#define utf8_isName4 isNever
00114 
00115 static
00116 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
00117 {
00118   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
00119 }
00120 
00121 static
00122 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
00123 {
00124   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
00125 }
00126 
/* Fills the isNmstrt4 slot of NORMAL_VTABLE(utf8_): no 4-byte sequence is
reported as a name-start character. */
#define utf8_isNmstrt4 isNever

/* Fills the isInvalid2 slot of NORMAL_VTABLE(utf8_): 2-byte sequences are
never reported invalid.
NOTE(review): this means a bad continuation byte or an overlong C0/C1 lead
is not rejected through this slot -- confirm whether that is intended. */
#define utf8_isInvalid2 isNever
00130 
00131 static
00132 int utf8_isInvalid3(const ENCODING *enc, const char *p)
00133 {
00134   return UTF8_INVALID3((const unsigned char *)p);
00135 }
00136 
00137 static
00138 int utf8_isInvalid4(const ENCODING *enc, const char *p)
00139 {
00140   return UTF8_INVALID4((const unsigned char *)p);
00141 }
00142 
/* An ENCODING extended with a per-byte classification table and the
predicates used for multi-byte sequences.  enc is the first member; the
macros in this file cast an ENCODING pointer to a struct normal_encoding
pointer to reach the extra members. */
struct normal_encoding {
  ENCODING enc;
  /* byte type for each byte value; indexed with an unsigned char */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Only present in XML_MIN_SIZE builds, where the BYTE_TYPE,
  IS_NAME_CHAR_MINBPC, IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and CHAR_MATCHES
  macros call through these pointers. */
  int (*byteType)(const ENCODING *, const char *);
  int (*isNameMin)(const ENCODING *, const char *);
  int (*isNmstrtMin)(const ENCODING *, const char *);
  int (*byteToAscii)(const ENCODING *, const char *);
  int (*charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Called through IS_NAME_CHAR, IS_NMSTRT_CHAR and IS_INVALID_CHAR; the
  digit in the member name is the n argument those macros paste on. */
  int (*isName2)(const ENCODING *, const char *);
  int (*isName3)(const ENCODING *, const char *);
  int (*isName4)(const ENCODING *, const char *);
  int (*isNmstrt2)(const ENCODING *, const char *);
  int (*isNmstrt3)(const ENCODING *, const char *);
  int (*isNmstrt4)(const ENCODING *, const char *);
  int (*isInvalid2)(const ENCODING *, const char *);
  int (*isInvalid3)(const ENCODING *, const char *);
  int (*isInvalid4)(const ENCODING *, const char *);
};
00163 
#ifdef XML_MIN_SIZE

/* Initializers for the XML_MIN_SIZE-only members of struct normal_encoding,
in declaration order; E is the name prefix (for example sb_).  The expansion
ends with a comma so that NORMAL_VTABLE can follow it directly. */
#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

/* Those members do not exist in this configuration. */
#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte predicate members of
struct normal_encoding, in declaration order; E is the name prefix. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4
00189 
00190 static int checkCharRefNumber(int);
00191 
00192 #include "xmltok_impl.h"
00193 
#ifdef XML_MIN_SIZE
/* STANDARD_VTABLE(sb_) needs these two names; the single-byte family
supplies the always-false predicate for both. */
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
/* One shared copy of the tokenizer: read the width from the encoding. */
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
00205 
/* Byte type of the single byte at p, read from the encoding's type table.
The byte is converted to unsigned char before it is used as an index. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, installed as the byteType member. */
static
int sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
/* XML_MIN_SIZE: classify through the encoding's byteType pointer. */
#define BYTE_TYPE(enc, p) \
 (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
/* Otherwise the table lookup is expanded inline. */
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
00220 
#ifdef XML_MIN_SIZE
/* XML_MIN_SIZE: convert through the encoding's byteToAscii pointer. */
#define BYTE_TO_ASCII(enc, p) \
 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
/* Single-byte family: the byte at p is returned as it is. */
static
int sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
/* Single-byte family: the byte at p is used as it is.  The argument is
parenthesized so that a pointer expression such as q + 1 is dereferenced as
a whole instead of being parsed as (*q) + 1. */
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
00232 
/* Multi-byte predicates: n is pasted onto the member name, so n == 2 calls
isName2 / isNmstrt2 / isInvalid2, and so on. */
#define IS_NAME_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

#ifdef XML_MIN_SIZE
/* XML_MIN_SIZE: ask the encoding through its isNameMin / isNmstrtMin
pointers. */
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
/* In the single-byte instantiation these tests are always false. */
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
00249 
#ifdef XML_MIN_SIZE
/* XML_MIN_SIZE: compare through the encoding's charMatches pointer. */
#define CHAR_MATCHES(enc, p, c) \
 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
/* Single-byte family: non-zero when the byte at p equals c. */
static
int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character.  Both arguments are parenthesized so that an
expression passed as c (for example a conditional) is compared as a whole. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
00262 
/* Instantiate the tokenizer template with every generated name prefixed by
normal_, using the macro definitions above. */
#define PREFIX(ident) normal_ ## ident
#include "xmltok_impl.c"

/* Drop the per-instantiation macros so that a later section can define its
own versions before including the template again. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
00275 
/* Marker bits that are OR-ed into the first byte of an N byte UTF-8
sequence when this file encodes one. */
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
00282 
00283 static
00284 void utf8_toUtf8(const ENCODING *enc,
00285                  const char **fromP, const char *fromLim,
00286                  char **toP, const char *toLim)
00287 {
00288   char *to;
00289   const char *from;
00290   if (fromLim - *fromP > toLim - *toP) {
00291     /* Avoid copying partial characters. */
00292     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
00293       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
00294         break;
00295   }
00296   for (to = *toP, from = *fromP; from != fromLim; from++, to++)
00297     *to = *from;
00298   *fromP = from;
00299   *toP = to;
00300 }
00301 
00302 static
00303 void utf8_toUtf16(const ENCODING *enc,
00304                   const char **fromP, const char *fromLim,
00305                   unsigned short **toP, const unsigned short *toLim)
00306 {
00307   unsigned short *to = *toP;
00308   const char *from = *fromP;
00309   while (from != fromLim && to != toLim) {
00310     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
00311     case BT_LEAD2:
00312       *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
00313       from += 2;
00314       break;
00315     case BT_LEAD3:
00316       *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
00317       from += 3;
00318       break;
00319     case BT_LEAD4:
00320       {
00321         unsigned long n;
00322         if (to + 1 == toLim)
00323           break;
00324         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
00325         n -= 0x10000;
00326         to[0] = (unsigned short)((n >> 10) | 0xD800);
00327         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
00328         to += 2;
00329         from += 4;
00330       }
00331       break;
00332     default:
00333       *to++ = *from++;
00334       break;
00335     }
00336   }
00337   *fromP = from;
00338   *toP = to;
00339 }
00340 
/* UTF-8 encoding objects.  Each one combines the normal_ tokenizer entry
points (VTABLE1), the UTF-8 conversion routines, a 256-entry byte type table
assembled from two included headers, and the utf8_ multi-byte predicates.
The three integers that close the first initializer fill the remaining
ENCODING members; the first is the minimum bytes per character (see MINBPC).
NOTE(review): the other two are presumably the isUtf8/isUtf16 flags --
confirm against xmltok.h.

The non-_ns variants define BT_COLON as BT_NMSTRT while the ASCII table is
included, so whatever entry that header spells BT_COLON (presumably ':') is
classified as a name-start character there. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif

static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#ifdef XML_NS

/* Same as utf8_encoding_ns, but the first half of the byte table comes from
iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif

/* Same as utf8_encoding, but the first half of the byte table comes from
iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
00386 
00387 static
00388 void latin1_toUtf8(const ENCODING *enc,
00389                    const char **fromP, const char *fromLim,
00390                    char **toP, const char *toLim)
00391 {
00392   for (;;) {
00393     unsigned char c;
00394     if (*fromP == fromLim)
00395       break;
00396     c = (unsigned char)**fromP;
00397     if (c & 0x80) {
00398       if (toLim - *toP < 2)
00399         break;
00400       *(*toP)++ = ((c >> 6) | UTF8_cval2);
00401       *(*toP)++ = ((c & 0x3f) | 0x80);
00402       (*fromP)++;
00403     }
00404     else {
00405       if (*toP == toLim)
00406         break;
00407       *(*toP)++ = *(*fromP)++;
00408     }
00409   }
00410 }
00411 
00412 static
00413 void latin1_toUtf16(const ENCODING *enc,
00414                     const char **fromP, const char *fromLim,
00415                     unsigned short **toP, const unsigned short *toLim)
00416 {
00417   while (*fromP != fromLim && *toP != toLim)
00418     *(*toP)++ = (unsigned char)*(*fromP)++;
00419 }
00420 
/* Latin-1 encoding objects: normal_ tokenizer entry points, the latin1_
conversion routines and a byte table built from asciitab.h plus
latin1tab.h.  No NORMAL_VTABLE entries are listed, so the isName2 ...
isInvalid4 members are left to default (zero) initialization. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};

#endif

/* Variant with BT_COLON defined as BT_NMSTRT while asciitab.h is
included. */
static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};
00444 
00445 static
00446 void ascii_toUtf8(const ENCODING *enc,
00447                   const char **fromP, const char *fromLim,
00448                   char **toP, const char *toLim)
00449 {
00450   while (*fromP != fromLim && *toP != toLim)
00451     *(*toP)++ = *(*fromP)++;
00452 }
00453 
/* ASCII encoding objects: normal_ tokenizer entry points, ascii_toUtf8 and
latin1_toUtf16 as conversion routines, and a byte table whose initializer
comes from asciitab.h only.  Table entries without an initializer are zero,
which the comment inside each initializer equates with BT_NONXML. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};

#endif

/* Variant with BT_COLON defined as BT_NMSTRT while asciitab.h is
included. */
static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};
00477 
00478 static int unicode_byte_type(char hi, char lo)
00479 {
00480   switch ((unsigned char)hi) {
00481   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
00482     return BT_LEAD4;
00483   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
00484     return BT_TRAIL;
00485   case 0xFF:
00486     switch ((unsigned char)lo) {
00487     case 0xFF:
00488     case 0xFE:
00489       return BT_NONXML;
00490     }
00491     break;
00492   }
00493   return BT_NONASCII;
00494 }
00495 
/* Generate E##toUtf8, which converts 16-bit units (two bytes each, read
with the GET_LO / GET_HI macros in effect at the point of expansion) from
*fromP towards fromLim into UTF-8 at *toP.

The high byte of each unit selects the output length:
  0 with a low byte below 0x80  -> 1 byte
  0 (other low bytes) .. 7      -> 2 bytes
  D8..DB                        -> 4 bytes, built from this unit and the next
  anything else                 -> 3 bytes
Before anything is written for a character the remaining space up to toLim
is checked; if it is too small, *fromP is set to the start of that character
and the function returns.  Otherwise *fromP ends up equal to fromLim.

NOTE(review): the D8..DB branch advances from by 2 and reads the second unit
without comparing against fromLim, so it relies on the caller passing
complete pairs -- confirm. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
00558 
/* Generate E##toUtf16, which copies 16-bit units from the byte stream at
*fromP into native unsigned short values at *toP, assembling each unit from
GET_HI and GET_LO, until the input or the output space runs out.

When the input is longer than the output space in bytes and the last input
unit has a high byte in D8..DF, fromLim is first pulled back by one unit
(see the comment inside the body). */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
00572 
/* Little-endian byte access: the low byte of a 16-bit unit is stored
first.  SET2 stores ch into the two bytes at ptr; GET_LO / GET_HI read them
back as unsigned values. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Defines little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian byte access: the high byte of a 16-bit unit is stored
first. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Defines big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
00596 
/* Character tests for the little-endian two-byte encoding: p[0] is the low
byte and p[1] the high byte.  Characters whose high byte is zero are
classified with the encoding's byte table; all others go through
unicode_byte_type. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
/* Naming-bitmap lookups with the high byte as page index. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE: the macros above are wrapped in functions so that they can
be installed as the STANDARD_VTABLE(little2_) members. */

static
int little2_byteType(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TYPE(enc, p);
}

static
int little2_byteToAscii(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TO_ASCII(enc, p);
}

static
int little2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return LITTLE2_CHAR_MATCHES(enc, p, c);
}

static
int little2_isNameMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}

static
int little2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* The template is not included again in this configuration, so VTABLE1
still expands with the PREFIX already in effect; only the two conversion
routines are the little2_ ones. */
#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Instantiate the tokenizer template a second time, with little2_ names
and the two-byte character tests expanded inline. */
#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) 
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
00670 
/* Little-endian two-byte encoding objects.  The first initializer is
VTABLE followed by three integers: 2 as the minimum bytes per character, 0,
and a last value that is 1 only when XML_BYTE_ORDER == 12.
NOTE(review): the last two are presumably the isUtf8/isUtf16 flags, with
isUtf16 set when the byte order matches the host -- confirm against
xmltok.h.  The byte table is asciitab.h (or iasciitab.h) plus latin1tab.h. */
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns = { 
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

static const struct normal_encoding little2_encoding = { 
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

/* The internal_ variants are compiled unless XML_BYTE_ORDER is 21; they
always carry 1 as the last integer and use iasciitab.h. */
#if XML_BYTE_ORDER != 21

#ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns = { 
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif

static const struct normal_encoding internal_little2_encoding = { 
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif
00734 
00735 
/* Character tests for the big-endian two-byte encoding: p[0] is the high
byte and p[1] the low byte.  Characters whose high byte is zero are
classified with the encoding's byte table; all others go through
unicode_byte_type. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
/* Naming-bitmap lookups with the high byte as page index. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE: the macros above are wrapped in functions so that they can
be installed as the STANDARD_VTABLE(big2_) members. */

static
int big2_byteType(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TYPE(enc, p);
}

static
int big2_byteToAscii(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TO_ASCII(enc, p);
}

static
int big2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return BIG2_CHAR_MATCHES(enc, p, c);
}

static
int big2_isNameMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

static
int big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* The template is not included again in this configuration, so VTABLE1
still expands with the PREFIX already in effect; only the two conversion
routines are the big2_ ones. */
#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Instantiate the tokenizer template again, with big2_ names and the
two-byte character tests expanded inline. */
#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
00809 
/* Big-endian two-byte encoding objects.  The first initializer is VTABLE
followed by three integers: 2 as the minimum bytes per character, 0, and a
last value that is 1 only when XML_BYTE_ORDER == 21.
NOTE(review): the last two are presumably the isUtf8/isUtf16 flags, with
isUtf16 set when the byte order matches the host -- confirm against
xmltok.h.  The byte table is asciitab.h (or iasciitab.h) plus latin1tab.h. */
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
  1
#else
  0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
  1
#else
  0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

/* The internal_ variants are compiled unless XML_BYTE_ORDER is 12; they
always carry 1 as the last integer and use iasciitab.h. */
#if XML_BYTE_ORDER != 12

#ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif

static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif
00873 
00874 #undef PREFIX
00875 
/* Compare two NUL-terminated strings, treating the letters a-z and A-Z as
equal.  Returns 1 when the strings match over their whole length and 0
otherwise. */
static int streqci(const char *s1, const char *s2)
{
  char a;
  char b;

  do {
    a = *s1++;
    b = *s2++;
    /* fold lower case onto upper case before comparing */
    if (a >= 'a' && a <= 'z')
      a = (char)(a + ('A' - 'a'));
    if (b >= 'a' && b <= 'z')
      b = (char)(b + ('A' - 'a'));
    if (a != b)
      return 0;
  } while (a);

  return 1;
}
00893 
00894 static
00895 void initUpdatePosition(const ENCODING *enc, const char *ptr,
00896                         const char *end, POSITION *pos)
00897 {
00898   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
00899 }
00900 
00901 static
00902 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
00903 {
00904   char buf[1];
00905   char *p = buf;
00906   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
00907   if (p == buf)
00908     return -1;
00909   else
00910     return buf[0];
00911 }
00912 
/* Returns 1 for space (0x20), carriage return (0xD), line feed (0xA) and
tab (0x9); 0 for every other value. */
static int isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
00925 
00926 /* Return 1 if there's just optional white space
00927 or there's an S followed by name=val. */
00928 static
00929 int parsePseudoAttribute(const ENCODING *enc,
00930                          const char *ptr,
00931                          const char *end,
00932                          const char **namePtr,
00933                          const char **valPtr,
00934                          const char **nextTokPtr)
00935 {
00936   int c;
00937   char open;
00938   if (ptr == end) {
00939     *namePtr = 0;
00940     return 1;
00941   }
00942   if (!isSpace(toAscii(enc, ptr, end))) {
00943     *nextTokPtr = ptr;
00944     return 0;
00945   }
00946   do {
00947     ptr += enc->minBytesPerChar;
00948   } while (isSpace(toAscii(enc, ptr, end)));
00949   if (ptr == end) {
00950     *namePtr = 0;
00951     return 1;
00952   }
00953   *namePtr = ptr;
00954   for (;;) {
00955     c = toAscii(enc, ptr, end);
00956     if (c == -1) {
00957       *nextTokPtr = ptr;
00958       return 0;
00959     }
00960     if (c == '=')
00961       break;
00962     if (isSpace(c)) {
00963       do {
00964         ptr += enc->minBytesPerChar;
00965       } while (isSpace(c = toAscii(enc, ptr, end)));
00966       if (c != '=') {
00967         *nextTokPtr = ptr;
00968         return 0;
00969       }
00970       break;
00971     }
00972     ptr += enc->minBytesPerChar;
00973   }
00974   if (ptr == *namePtr) {
00975     *nextTokPtr = ptr;
00976     return 0;
00977   }
00978   ptr += enc->minBytesPerChar;
00979   c = toAscii(enc, ptr, end);
00980   while (isSpace(c)) {
00981     ptr += enc->minBytesPerChar;
00982     c = toAscii(enc, ptr, end);
00983   }
00984   if (c != '"' && c != '\'') {
00985     *nextTokPtr = ptr;
00986     return 0;
00987   }
00988   open = c;
00989   ptr += enc->minBytesPerChar;
00990   *valPtr = ptr;
00991   for (;; ptr += enc->minBytesPerChar) {
00992     c = toAscii(enc, ptr, end);
00993     if (c == open)
00994       break;
00995     if (!('a' <= c && c <= 'z')
00996         && !('A' <= c && c <= 'Z')
00997         && !('0' <= c && c <= '9')
00998         && c != '.'
00999         && c != '-'
01000         && c != '_') {
01001       *nextTokPtr = ptr;
01002       return 0;
01003     }
01004   }
01005   *nextTokPtr = ptr + enc->minBytesPerChar;
01006   return 1;
01007 }
01008 
01009 static
01010 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
01011                                                      const char *,
01012                                                      const char *),
01013                    int isGeneralTextEntity,
01014                    const ENCODING *enc,
01015                    const char *ptr,
01016                    const char *end,
01017                    const char **badPtr,
01018                    const char **versionPtr,
01019                    const char **encodingName,
01020                    const ENCODING **encoding,
01021                    int *standalone)
01022 {
01023   const char *val = 0;
01024   const char *name = 0;
01025   ptr += 5 * enc->minBytesPerChar;
01026   end -= 2 * enc->minBytesPerChar;
01027   if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
01028     *badPtr = ptr;
01029     return 0;
01030   }
01031   if (!XmlNameMatchesAscii(enc, name, "version")) {
01032     if (!isGeneralTextEntity) {
01033       *badPtr = name;
01034       return 0;
01035     }
01036   }
01037   else {
01038     if (versionPtr)
01039       *versionPtr = val;
01040     if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
01041       *badPtr = ptr;
01042       return 0;
01043     }
01044     if (!name) {
01045       if (isGeneralTextEntity) {
01046         /* a TextDecl must have an EncodingDecl */
01047         *badPtr = ptr;
01048         return 0;
01049       }
01050       return 1;
01051     }
01052   }
01053   if (XmlNameMatchesAscii(enc, name, "encoding")) {
01054     int c = toAscii(enc, val, end);
01055     if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
01056       *badPtr = val;
01057       return 0;
01058     }
01059     if (encodingName)
01060       *encodingName = val;
01061     if (encoding)
01062       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
01063     if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
01064       *badPtr = ptr;
01065       return 0;
01066     }
01067     if (!name)
01068       return 1;
01069   }
01070   if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
01071     *badPtr = name;
01072     return 0;
01073   }
01074   if (XmlNameMatchesAscii(enc, val, "yes")) {
01075     if (standalone)
01076       *standalone = 1;
01077   }
01078   else if (XmlNameMatchesAscii(enc, val, "no")) {
01079     if (standalone)
01080       *standalone = 0;
01081   }
01082   else {
01083     *badPtr = val;
01084     return 0;
01085   }
01086   while (isSpace(toAscii(enc, ptr, end)))
01087     ptr += enc->minBytesPerChar;
01088   if (ptr != end) {
01089     *badPtr = ptr;
01090     return 0;
01091   }
01092   return 1;
01093 }
01094 
01095 static
01096 int checkCharRefNumber(int result)
01097 {
01098   switch (result >> 8) {
01099   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
01100   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
01101     return -1;
01102   case 0:
01103     if (latin1_encoding.type[result] == BT_NONXML)
01104       return -1;
01105     break;
01106   case 0xFF:
01107     if (result == 0xFFFE || result == 0xFFFF)
01108       return -1;
01109     break;
01110   }
01111   return result;
01112 }
01113 
01114 int XmlUtf8Encode(int c, char *buf)
01115 {
01116   enum {
01117     /* minN is minimum legal resulting value for N byte sequence */
01118     min2 = 0x80,
01119     min3 = 0x800,
01120     min4 = 0x10000
01121   };
01122 
01123   if (c < 0)
01124     return 0;
01125   if (c < min2) {
01126     buf[0] = (c | UTF8_cval1);
01127     return 1;
01128   }
01129   if (c < min3) {
01130     buf[0] = ((c >> 6) | UTF8_cval2);
01131     buf[1] = ((c & 0x3f) | 0x80);
01132     return 2;
01133   }
01134   if (c < min4) {
01135     buf[0] = ((c >> 12) | UTF8_cval3);
01136     buf[1] = (((c >> 6) & 0x3f) | 0x80);
01137     buf[2] = ((c & 0x3f) | 0x80);
01138     return 3;
01139   }
01140   if (c < 0x110000) {
01141     buf[0] = ((c >> 18) | UTF8_cval4);
01142     buf[1] = (((c >> 12) & 0x3f) | 0x80);
01143     buf[2] = (((c >> 6) & 0x3f) | 0x80);
01144     buf[3] = ((c & 0x3f) | 0x80);
01145     return 4;
01146   }
01147   return 0;
01148 }
01149 
/* Encode the character number charNum as UTF-16 code units into buf.
   Returns 1 for values below 0x10000, 2 (a surrogate pair) for values
   below 0x110000, and 0 without writing anything for any other value. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;

  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum < 0x10000) {
    buf[0] = charNum;
    return 1;
  }

  /* split the 20-bit offset into two 10-bit halves */
  offset = charNum - 0x10000;
  buf[0] = 0xD800 + (offset >> 10);
  buf[1] = 0xDC00 + (offset & 0x3FF);
  return 2;
}
01166 
/* State for a caller-defined encoding described by a 256-entry table.
   `normal` must stay the first member: the conversion and classification
   routines cast the ENCODING pointer they receive to this struct. */
struct unknown_encoding {
  struct normal_encoding normal;
  /* callback returning the character number of the sequence starting at p;
     used for bytes whose utf16/utf8 table entry is marked with 0 */
  int (*convert)(void *userData, const char *p);
  /* opaque pointer handed back to convert */
  void *userData;
  /* per-byte UTF-16 code unit; 0 means "ask convert" */
  unsigned short utf16[256];
  /* per-byte UTF-8 form: [0] is the byte count, the bytes follow;
     a count of 0 means "ask convert" */
  char utf8[256][4];
};
01174 
01175 int XmlSizeOfUnknownEncoding()
01176 {
01177   return sizeof(struct unknown_encoding);
01178 }
01179 
01180 static
01181 int unknown_isName(const ENCODING *enc, const char *p)
01182 {
01183   int c = ((const struct unknown_encoding *)enc)
01184           ->convert(((const struct unknown_encoding *)enc)->userData, p);
01185   if (c & ~0xFFFF)
01186     return 0;
01187   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
01188 }
01189 
01190 static
01191 int unknown_isNmstrt(const ENCODING *enc, const char *p)
01192 {
01193   int c = ((const struct unknown_encoding *)enc)
01194           ->convert(((const struct unknown_encoding *)enc)->userData, p);
01195   if (c & ~0xFFFF)
01196     return 0;
01197   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
01198 }
01199 
01200 static
01201 int unknown_isInvalid(const ENCODING *enc, const char *p)
01202 {
01203   int c = ((const struct unknown_encoding *)enc)
01204            ->convert(((const struct unknown_encoding *)enc)->userData, p);
01205   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
01206 }
01207 
01208 static
01209 void unknown_toUtf8(const ENCODING *enc,
01210                     const char **fromP, const char *fromLim,
01211                     char **toP, const char *toLim)
01212 {
01213   char buf[XML_UTF8_ENCODE_MAX];
01214   for (;;) {
01215     const char *utf8;
01216     int n;
01217     if (*fromP == fromLim)
01218       break;
01219     utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
01220     n = *utf8++;
01221     if (n == 0) {
01222       int c = ((const struct unknown_encoding *)enc)
01223               ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
01224       n = XmlUtf8Encode(c, buf);
01225       if (n > toLim - *toP)
01226         break;
01227       utf8 = buf;
01228       *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
01229                  - (BT_LEAD2 - 2);
01230     }
01231     else {
01232       if (n > toLim - *toP)
01233         break;
01234       (*fromP)++;
01235     }
01236     do {
01237       *(*toP)++ = *utf8++;
01238     } while (--n != 0);
01239   }
01240 }
01241 
01242 static
01243 void unknown_toUtf16(const ENCODING *enc,
01244                      const char **fromP, const char *fromLim,
01245                      unsigned short **toP, const unsigned short *toLim)
01246 {
01247   while (*fromP != fromLim && *toP != toLim) {
01248     unsigned short c
01249       = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
01250     if (c == 0) {
01251       c = (unsigned short)((const struct unknown_encoding *)enc)
01252            ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
01253       *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
01254                  - (BT_LEAD2 - 2);
01255     }
01256     else
01257       (*fromP)++;
01258     *(*toP)++ = c;
01259   }
01260 }
01261 
01262 ENCODING *
01263 XmlInitUnknownEncoding(void *mem,
01264                        int *table,
01265                        int (*convert)(void *userData, const char *p),
01266                        void *userData)
01267 {
01268   int i;
01269   struct unknown_encoding *e = mem;
01270   for (i = 0; i < sizeof(struct normal_encoding); i++)
01271     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
01272   for (i = 0; i < 128; i++)
01273     if (latin1_encoding.type[i] != BT_OTHER
01274         && latin1_encoding.type[i] != BT_NONXML
01275         && table[i] != i)
01276       return 0;
01277   for (i = 0; i < 256; i++) {
01278     int c = table[i];
01279     if (c == -1) {
01280       e->normal.type[i] = BT_MALFORM;
01281       /* This shouldn't really get used. */
01282       e->utf16[i] = 0xFFFF;
01283       e->utf8[i][0] = 1;
01284       e->utf8[i][1] = 0;
01285     }
01286     else if (c < 0) {
01287       if (c < -4)
01288         return 0;
01289       e->normal.type[i] = BT_LEAD2 - (c + 2);
01290       e->utf8[i][0] = 0;
01291       e->utf16[i] = 0;
01292     }
01293     else if (c < 0x80) {
01294       if (latin1_encoding.type[c] != BT_OTHER
01295           && latin1_encoding.type[c] != BT_NONXML
01296           && c != i)
01297         return 0;
01298       e->normal.type[i] = latin1_encoding.type[c];
01299       e->utf8[i][0] = 1;
01300       e->utf8[i][1] = (char)c;
01301       e->utf16[i] = c == 0 ? 0xFFFF : c;
01302     }
01303     else if (checkCharRefNumber(c) < 0) {
01304       e->normal.type[i] = BT_NONXML;
01305       /* This shouldn't really get used. */
01306       e->utf16[i] = 0xFFFF;
01307       e->utf8[i][0] = 1;
01308       e->utf8[i][1] = 0;
01309     }
01310     else {
01311       if (c > 0xFFFF)
01312         return 0;
01313       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
01314         e->normal.type[i] = BT_NMSTRT;
01315       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
01316         e->normal.type[i] = BT_NAME;
01317       else
01318         e->normal.type[i] = BT_OTHER;
01319       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
01320       e->utf16[i] = c;
01321     }
01322   }
01323   e->userData = userData;
01324   e->convert = convert;
01325   if (convert) {
01326     e->normal.isName2 = unknown_isName;
01327     e->normal.isName3 = unknown_isName;
01328     e->normal.isName4 = unknown_isName;
01329     e->normal.isNmstrt2 = unknown_isNmstrt;
01330     e->normal.isNmstrt3 = unknown_isNmstrt;
01331     e->normal.isNmstrt4 = unknown_isNmstrt;
01332     e->normal.isInvalid2 = unknown_isInvalid;
01333     e->normal.isInvalid3 = unknown_isInvalid;
01334     e->normal.isInvalid4 = unknown_isInvalid;
01335   }
01336   e->normal.enc.utf8Convert = unknown_toUtf8;
01337   e->normal.enc.utf16Convert = unknown_toUtf16;
01338   return &(e->normal.enc);
01339 }
01340 
/* Indices of the built-in encodings.
   The non-negative values below NO_ENC are used as array indices: into
   encodingNames in getEncodingIndex and into the encodingTable passed to
   initScan.
   If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  /* returned by getEncodingIndex when the name is not recognised */
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  /* returned by getEncodingIndex when no name is given */
  NO_ENC
};
01354 
01355 static
01356 int getEncodingIndex(const char *name)
01357 {
01358   static const char *encodingNames[] = {
01359     "ISO-8859-1",
01360     "US-ASCII",
01361     "UTF-8",
01362     "UTF-16",
01363     "UTF-16BE"
01364     "UTF-16LE",
01365   };
01366   int i;
01367   if (name == 0)
01368     return NO_ENC;
01369   for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
01370     if (streqci(name, encodingNames[i]))
01371       return i;
01372   return UNKNOWN_ENC;
01373 }
01374 
/* For binary compatibility, we store the index of the encoding specified
at initialization in the isUtf16 member.
INIT_ENC_INDEX(enc) expands to that member of enc->initEnc; initScan
reads it as one of the *_ENC enumeration values. */

#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
01379 
01380 /* This is what detects the encoding.
01381 encodingTable maps from encoding indices to encodings;
01382 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
01383 state is XML_CONTENT_STATE if we're parsing an external text entity,
01384 and XML_PROLOG_STATE otherwise.
01385 */
01386 
01387 
01388 static
01389 int initScan(const ENCODING **encodingTable,
01390              const INIT_ENCODING *enc,
01391              int state,
01392              const char *ptr,
01393              const char *end,
01394              const char **nextTokPtr)
01395 {
01396   const ENCODING **encPtr;
01397 
01398   if (ptr == end)
01399     return XML_TOK_NONE;
01400   encPtr = enc->encPtr;
01401   if (ptr + 1 == end) {
01402     /* only a single byte available for auto-detection */
01403     /* a well-formed document entity must have more than one byte */
01404     if (state != XML_CONTENT_STATE)
01405       return XML_TOK_PARTIAL;
01406     /* so we're parsing an external text entity... */
01407     /* if UTF-16 was externally specified, then we need at least 2 bytes */
01408     switch (INIT_ENC_INDEX(enc)) {
01409     case UTF_16_ENC:
01410     case UTF_16LE_ENC:
01411     case UTF_16BE_ENC:
01412       return XML_TOK_PARTIAL;
01413     }
01414     switch ((unsigned char)*ptr) {
01415     case 0xFE:
01416     case 0xFF:
01417     case 0xEF: /* possibly first byte of UTF-8 BOM */
01418       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01419           && state == XML_CONTENT_STATE)
01420         break;
01421       /* fall through */
01422     case 0x00:
01423     case 0x3C:
01424       return XML_TOK_PARTIAL;
01425     }
01426   }
01427   else {
01428     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
01429     case 0xFEFF:
01430       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01431           && state == XML_CONTENT_STATE)
01432         break;
01433       *nextTokPtr = ptr + 2;
01434       *encPtr = encodingTable[UTF_16BE_ENC];
01435       return XML_TOK_BOM;
01436     /* 00 3C is handled in the default case */
01437     case 0x3C00:
01438       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
01439            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
01440           && state == XML_CONTENT_STATE)
01441         break;
01442       *encPtr = encodingTable[UTF_16LE_ENC];
01443       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01444     case 0xFFFE:
01445       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01446           && state == XML_CONTENT_STATE)
01447         break;
01448       *nextTokPtr = ptr + 2;
01449       *encPtr = encodingTable[UTF_16LE_ENC];
01450       return XML_TOK_BOM;
01451     case 0xEFBB:
01452       /* Maybe a UTF-8 BOM (EF BB BF) */
01453       /* If there's an explicitly specified (external) encoding
01454          of ISO-8859-1 or some flavour of UTF-16
01455          and this is an external text entity,
01456          don't look for the BOM,
01457          because it might be a legal data. */
01458       if (state == XML_CONTENT_STATE) {
01459         int e = INIT_ENC_INDEX(enc);
01460         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
01461           break;
01462       }
01463       if (ptr + 2 == end)
01464         return XML_TOK_PARTIAL;
01465       if ((unsigned char)ptr[2] == 0xBF) {
01466         *encPtr = encodingTable[UTF_8_ENC];
01467         return XML_TOK_BOM;
01468       }
01469       break;
01470     default:
01471       if (ptr[0] == '\0') {
01472         /* 0 isn't a legal data character. Furthermore a document entity can only
01473            start with ASCII characters.  So the only way this can fail to be big-endian
01474            UTF-16 if it it's an external parsed general entity that's labelled as
01475            UTF-16LE. */
01476         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
01477           break;
01478         *encPtr = encodingTable[UTF_16BE_ENC];
01479         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01480       }
01481       else if (ptr[1] == '\0') {
01482         /* We could recover here in the case:
01483             - parsing an external entity
01484             - second byte is 0
01485             - no externally specified encoding
01486             - no encoding declaration
01487            by assuming UTF-16LE.  But we don't, because this would mean when
01488            presented just with a single byte, we couldn't reliably determine
01489            whether we needed further bytes. */
01490         if (state == XML_CONTENT_STATE)
01491           break;
01492         *encPtr = encodingTable[UTF_16LE_ENC];
01493         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01494       }
01495       break;
01496     }
01497   }
01498   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
01499   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01500 }
01501 
01502 
/* First inclusion of xmltok_ns.c: NS() and ns() expand to their argument
   unchanged, so identifiers built with them in that file keep their plain
   spelling. */
#define NS(x) x
#define ns(x) x
#include "xmltok_ns.c"
#undef NS
#undef ns
01508 
01509 #ifdef XML_NS
01510 
/* Second inclusion of xmltok_ns.c, only when XML_NS is defined: NS()
   appends "NS" and ns() appends "_ns" to its argument, so the same file
   yields a second, differently named set of identifiers. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns
01518 
01519 ENCODING *
01520 XmlInitUnknownEncodingNS(void *mem,
01521                          int *table,
01522                          int (*convert)(void *userData, const char *p),
01523                          void *userData)
01524 {
01525   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
01526   if (enc)
01527     ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
01528   return enc;
01529 }
01530 
01531 #endif /* XML_NS */