Changeset 738 for code/branches/FICN/src/tinyxml/tinyxmlparser.cc
- Timestamp: Dec 31, 2007, 12:06:33 AM (17 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
code/branches/FICN/src/tinyxml/tinyxmlparser.cc
r471 r738 3 3 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com) 4 4 5 This software is provided 'as-is', without any express or implied 6 warranty. In no event will the authors be held liable for any 5 This software is provided 'as-is', without any express or implied 6 warranty. In no event will the authors be held liable for any 7 7 damages arising from the use of this software. 8 8 9 Permission is granted to anyone to use this software for any 10 purpose, including commercial applications, and to alter it and 9 Permission is granted to anyone to use this software for any 10 purpose, including commercial applications, and to alter it and 11 11 redistribute it freely, subject to the following restrictions: 12 12 13 1. The origin of this software must not be misrepresented; you must 13 1. The origin of this software must not be misrepresented; you must 14 14 not claim that you wrote the original software. If you use this 15 15 software in a product, an acknowledgment in the product documentation 16 16 would be appreciated but is not required. 17 17 18 2. Altered source versions must be plainly marked as such, and 18 2. Altered source versions must be plainly marked as such, and 19 19 must not be misrepresented as being the original software. 20 20 21 3. This notice may not be removed or altered from any source 21 3. This notice may not be removed or altered from any source 22 22 distribution. 23 23 */ 24 24 25 #include "tinyxml.h"26 25 #include <ctype.h> 27 26 #include <stddef.h> 28 27 28 #include "tinyxml.h" 29 29 30 //#define DEBUG_PARSER 31 #if defined( DEBUG_PARSER ) 32 # if defined( DEBUG ) && defined( _MSC_VER ) 33 # include <windows.h> 34 # define TIXML_LOG OutputDebugString 35 # else 36 # define TIXML_LOG printf 37 # endif 38 #endif 30 39 31 40 // Note tha "PutString" hardcodes the same list. This 32 41 // is less flexible than it appears. Changing the entries 33 // or order will break putstring. 
34 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = 42 // or order will break putstring. 43 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = 35 44 { 36 45 { "&", 5, '&' }, … … 46 55 // sequence from the lead byte. 1 placed for invalid sequences -- 47 56 // although the result will be junk, pass it through as much as possible. 48 // Beware of the non-characters in UTF-8: 57 // Beware of the non-characters in UTF-8: 49 58 // ef bb bf (Microsoft "lead bytes") 50 59 // ef bf be 51 // ef bf bf 60 // ef bf bf 52 61 53 62 const unsigned char TIXML_UTF_LEAD_0 = 0xefU; … … 55 64 const unsigned char TIXML_UTF_LEAD_2 = 0xbfU; 56 65 57 const int TiXmlBase::utf8ByteTable[256] = 66 const int TiXmlBase::utf8ByteTable[256] = 58 67 { 59 68 // 0 1 2 3 4 5 6 7 8 9 a b c d e f … … 67 76 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range 68 77 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid 69 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 70 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0 71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0 78 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 79 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0 80 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0 72 81 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte 73 82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0 … … 83 92 const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 84 93 85 if (input < 0x80) 94 if (input < 0x80) 86 95 *length = 1; 87 96 else if ( input < 0x800 ) … … 97 106 98 107 // Scary scary fall throughs. 
99 switch (*length) 108 switch (*length) 100 109 { 101 110 case 4: 102 --output; 103 *output = (char)((input | BYTE_MARK) & BYTE_MASK); 111 --output; 112 *output = (char)((input | BYTE_MARK) & BYTE_MASK); 104 113 input >>= 6; 105 114 case 3: 106 --output; 107 *output = (char)((input | BYTE_MARK) & BYTE_MASK); 115 --output; 116 *output = (char)((input | BYTE_MARK) & BYTE_MASK); 108 117 input >>= 6; 109 118 case 2: 110 --output; 111 *output = (char)((input | BYTE_MARK) & BYTE_MASK); 119 --output; 120 *output = (char)((input | BYTE_MARK) & BYTE_MASK); 112 121 input >>= 6; 113 122 case 1: 114 --output; 123 --output; 115 124 *output = (char)(input | FIRST_BYTE_MARK[*length]); 116 125 } … … 122 131 // This will only work for low-ascii, everything else is assumed to be a valid 123 132 // letter. I'm not sure this is the best approach, but it is quite tricky trying 124 // to figure out alhabetical vs. not across encoding. So take a very 133 // to figure out alhabetical vs. not across encoding. So take a very 125 134 // conservative approach. 126 135 … … 143 152 // This will only work for low-ascii, everything else is assumed to be a valid 144 153 // letter. I'm not sure this is the best approach, but it is quite tricky trying 145 // to figure out alhabetical vs. not across encoding. So take a very 154 // to figure out alhabetical vs. not across encoding. So take a very 146 155 // conservative approach. 147 156 … … 216 225 // bump down to the next line 217 226 ++row; 218 col = 0; 227 col = 0; 219 228 // Eat the character 220 229 ++p; … … 258 267 // 0-width spaces. 259 268 if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 ) 260 p += 3; 269 p += 3; 261 270 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU ) 262 p += 3; 271 p += 3; 263 272 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU ) 264 p += 3; 273 p += 3; 265 274 else 266 275 { p +=3; ++col; } // A normal character. … … 278 287 { 279 288 // Eat the 1 to 4 byte utf8 character. 
280 int step = TiXmlBase::utf8ByteTable[*(( unsigned char*)p)];289 int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)]; 281 290 if ( step == 0 ) 282 291 step = 1; // Error case from bad encoding, but handle gracefully. … … 314 323 { 315 324 const unsigned char* pU = (const unsigned char*)p; 316 325 317 326 // Skip the stupid Microsoft UTF-8 Byte order marks 318 327 if ( *(pU+0)==TIXML_UTF_LEAD_0 319 && *(pU+1)==TIXML_UTF_LEAD_1 328 && *(pU+1)==TIXML_UTF_LEAD_1 320 329 && *(pU+2)==TIXML_UTF_LEAD_2 ) 321 330 { … … 354 363 355 364 #ifdef TIXML_USE_STL 356 /*static*/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM* in, TIXML_STRING * tag )365 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag ) 357 366 { 358 367 for( ;; ) … … 369 378 } 370 379 371 /*static*/ bool TiXmlBase::StreamTo( TIXML_ISTREAM* in, int character, TIXML_STRING * tag )380 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag ) 372 381 { 373 382 //assert( character > 0 && character < 128 ); // else it won't work in utf-8 … … 387 396 #endif 388 397 398 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The 399 // "assign" optimization removes over 10% of the execution time. 400 // 389 401 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding ) 390 402 { 403 // Oddly, not supported on some comilers, 404 //name->clear(); 405 // So use this: 391 406 *name = ""; 392 407 assert( p ); … … 399 414 // hyphens, or colons. (Colons are valid ony for namespaces, 400 415 // but tinyxml can't tell namespaces from names.) 
401 if ( p && *p 416 if ( p && *p 402 417 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) ) 403 418 { 419 const char* start = p; 404 420 while( p && *p 405 && ( IsAlphaNum( (unsigned char ) *p, encoding ) 421 && ( IsAlphaNum( (unsigned char ) *p, encoding ) 406 422 || *p == '_' 407 423 || *p == '-' … … 409 425 || *p == ':' ) ) 410 426 { 411 (*name) += *p;427 //(*name) += *p; // expensive 412 428 ++p; 429 } 430 if ( p-start > 0 ) { 431 name->assign( start, p-start ); 413 432 } 414 433 return p; … … 451 470 else if ( *q >= 'A' && *q <= 'F' ) 452 471 ucs += mult * (*q - 'A' + 10 ); 453 else 472 else 454 473 return 0; 455 474 mult *= 16; … … 474 493 if ( *q >= '0' && *q <= '9' ) 475 494 ucs += mult * (*q - '0'); 476 else 495 else 477 496 return 0; 478 497 mult *= 10; … … 507 526 // So it wasn't an entity, its unrecognized, or something like that. 508 527 *value = *p; // Don't put back the last one, since we return it! 528 //*length = 1; // Leave unrecognized entities - this doesn't really work. 529 // Just writes strange XML. 509 530 return p+1; 510 531 } … … 551 572 } 552 573 553 const char* TiXmlBase::ReadText( const char* p, 554 TIXML_STRING * text, 555 bool trimWhiteSpace, 556 const char* endTag, 574 const char* TiXmlBase::ReadText( const char* p, 575 TIXML_STRING * text, 576 bool trimWhiteSpace, 577 const char* endTag, 557 578 bool caseInsensitive, 558 579 TiXmlEncoding encoding ) … … 611 632 } 612 633 } 613 return p + strlen( endTag ); 634 if ( p ) 635 p += strlen( endTag ); 636 return p; 614 637 } 615 638 616 639 #ifdef TIXML_USE_STL 617 640 618 void TiXmlDocument::StreamIn( TIXML_ISTREAM* in, TIXML_STRING * tag )641 void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag ) 619 642 { 620 643 // The basic issue with a document is that we don't know what we're … … 625 648 // sub-tag can orient itself. 
626 649 627 if ( !StreamTo( in, '<', tag ) ) 650 if ( !StreamTo( in, '<', tag ) ) 628 651 { 629 652 SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); … … 647 670 if ( in->good() ) 648 671 { 649 // We now have something we presume to be a node of 672 // We now have something we presume to be a node of 650 673 // some sort. Identify it, and call the node to 651 674 // continue streaming. … … 756 779 else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) ) 757 780 encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice 758 else 781 else 759 782 encoding = TIXML_ENCODING_LEGACY; 760 783 } … … 774 797 775 798 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding ) 776 { 799 { 777 800 // The first error in a chain is more accurate - don't set again! 778 801 if ( error ) … … 811 834 } 812 835 813 // What is this thing? 836 // What is this thing? 814 837 // - Elements start with a letter or underscore, but xml is reserved. 815 838 // - Comments: <!-- … … 884 907 #ifdef TIXML_USE_STL 885 908 886 void TiXmlElement::StreamIn ( TIXML_ISTREAM* in, TIXML_STRING * tag)909 void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag) 887 910 { 888 911 // We're called with some amount of pre-parsing. That is, some of "this" … … 899 922 } 900 923 (*tag) += (char) c ; 901 924 902 925 if ( c == '>' ) 903 926 break; … … 909 932 // If not, identify and stream. 910 933 911 if ( tag->at( tag->length() - 1 ) == '>' 934 if ( tag->at( tag->length() - 1 ) == '>' 912 935 && tag->at( tag->length() - 2 ) == '/' ) 913 936 { … … 919 942 // There is more. Could be: 920 943 // text 944 // cdata text (which looks like another node) 921 945 // closing tag 922 946 // another node. … … 926 950 927 951 // Do we have text? 928 if ( in->good() && in->peek() != '<' ) 952 if ( in->good() && in->peek() != '<' ) 929 953 { 930 954 // Yep, text. 
… … 959 983 return; 960 984 } 961 985 962 986 if ( c == '>' ) 963 987 break; … … 965 989 *tag += (char) c; 966 990 in->get(); 991 992 // Early out if we find the CDATA id. 993 if ( c == '[' && tag->size() >= 9 ) 994 { 995 size_t len = tag->size(); 996 const char* start = tag->c_str() + len - 9; 997 if ( strcmp( start, "<![CDATA[" ) == 0 ) { 998 assert( !closingTag ); 999 break; 1000 } 1001 } 967 1002 968 1003 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) ) … … 1068 1103 if ( *p != '>' ) 1069 1104 { 1070 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding ); 1105 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding ); 1071 1106 return 0; 1072 1107 } … … 1080 1115 ++p; 1081 1116 p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens. 1082 if ( !p || !*p ) 1117 if ( !p || !*p ) { 1118 // We were looking for the end tag, but found nothing. 1119 // Fix for [ 1663758 ] Failure to report error on bad XML 1120 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding ); 1083 1121 return 0; 1122 } 1084 1123 1085 1124 // We should find the end tag now … … 1106 1145 1107 1146 attrib->SetDocument( document ); 1108 const char*pErr = p;1147 pErr = p; 1109 1148 p = attrib->Parse( p, data, encoding ); 1110 1149 … … 1117 1156 1118 1157 // Handle the strange case of double attributes: 1158 #ifdef TIXML_USE_STL 1159 TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() ); 1160 #else 1119 1161 TiXmlAttribute* node = attributeSet.Find( attrib->Name() ); 1162 #endif 1120 1163 if ( node ) 1121 1164 { … … 1168 1211 else 1169 1212 delete textNode; 1170 } 1171 else 1213 } 1214 else 1172 1215 { 1173 1216 // We hit a '<' … … 1185 1228 p = node->Parse( p, data, encoding ); 1186 1229 LinkEndChild( node ); 1187 } 1230 } 1188 1231 else 1189 1232 { … … 1199 1242 { 1200 1243 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 
0, encoding ); 1201 } 1244 } 1202 1245 return p; 1203 1246 } … … 1205 1248 1206 1249 #ifdef TIXML_USE_STL 1207 void TiXmlUnknown::StreamIn( TIXML_ISTREAM* in, TIXML_STRING * tag )1250 void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag ) 1208 1251 { 1209 1252 while ( in->good() ) 1210 1253 { 1211 int c = in->get(); 1254 int c = in->get(); 1212 1255 if ( c <= 0 ) 1213 1256 { … … 1222 1265 { 1223 1266 // All is well. 1224 return; 1267 return; 1225 1268 } 1226 1269 } … … 1263 1306 1264 1307 #ifdef TIXML_USE_STL 1265 void TiXmlComment::StreamIn( TIXML_ISTREAM* in, TIXML_STRING * tag )1308 void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag ) 1266 1309 { 1267 1310 while ( in->good() ) 1268 1311 { 1269 int c = in->get(); 1312 int c = in->get(); 1270 1313 if ( c <= 0 ) 1271 1314 { … … 1278 1321 (*tag) += (char) c; 1279 1322 1280 if ( c == '>' 1323 if ( c == '>' 1281 1324 && tag->at( tag->length() - 2 ) == '-' 1282 1325 && tag->at( tag->length() - 3 ) == '-' ) 1283 1326 { 1284 1327 // All is well. 1285 return; 1328 return; 1286 1329 } 1287 1330 } … … 1311 1354 } 1312 1355 p += strlen( startTag ); 1313 p = ReadText( p, &value, false, endTag, false, encoding ); 1356 1357 // [ 1475201 ] TinyXML parses entities in comments 1358 // Oops - ReadText doesn't work, because we don't want to parse the entities. 1359 // p = ReadText( p, &value, false, endTag, false, encoding ); 1360 // 1361 // from the XML spec: 1362 /* 1363 [Definition: Comments may appear anywhere in a document outside other markup; in addition, 1364 they may appear within the document type declaration at places allowed by the grammar. 1365 They are not part of the document's character data; an XML processor MAY, but need not, 1366 make it possible for an application to retrieve the text of comments. For compatibility, 1367 the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity 1368 references MUST NOT be recognized within comments. 
1369 1370 An example of a comment: 1371 1372 <!-- declarations for <head> & <body> --> 1373 */ 1374 1375 value = ""; 1376 // Keep all the white space. 1377 while ( p && *p && !StringEqual( p, endTag, false, encoding ) ) 1378 { 1379 value.append( p, 1 ); 1380 ++p; 1381 } 1382 if ( p ) 1383 p += strlen( endTag ); 1384 1314 1385 return p; 1315 1386 } … … 1321 1392 if ( !p || !*p ) return 0; 1322 1393 1323 int tabsize = 4;1324 if ( document )1325 tabsize = document->TabSize();1394 // int tabsize = 4; 1395 // if ( document ) 1396 // tabsize = document->TabSize(); 1326 1397 1327 1398 if ( data ) … … 1352 1423 return 0; 1353 1424 } 1354 1425 1355 1426 const char* end; 1356 1357 if ( *p == '\'' ) 1427 const char SINGLE_QUOTE = '\''; 1428 const char DOUBLE_QUOTE = '\"'; 1429 1430 if ( *p == SINGLE_QUOTE ) 1358 1431 { 1359 1432 ++p; 1360 end = "\'"; 1433 end = "\'"; // single quote in string 1361 1434 p = ReadText( p, &value, false, end, false, encoding ); 1362 1435 } 1363 else if ( *p == '"')1436 else if ( *p == DOUBLE_QUOTE ) 1364 1437 { 1365 1438 ++p; 1366 end = "\""; 1439 end = "\""; // double quote in string 1367 1440 p = ReadText( p, &value, false, end, false, encoding ); 1368 1441 } … … 1373 1446 // its best, even without them. 1374 1447 value = ""; 1375 while ( p && *p // existence1448 while ( p && *p // existence 1376 1449 && !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r' // whitespace 1377 && *p != '/' && *p != '>' ) // tag end 1378 { 1450 && *p != '/' && *p != '>' ) // tag end 1451 { 1452 if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) { 1453 // [ 1451649 ] Attribute values with trailing quotes not handled correctly 1454 // We did not have an opening quote but seem to have a 1455 // closing one. Give up and throw an error. 
1456 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); 1457 return 0; 1458 } 1379 1459 value += *p; 1380 1460 ++p; … … 1385 1465 1386 1466 #ifdef TIXML_USE_STL 1387 void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag ) 1388 { 1389 if ( cdata ) 1390 { 1391 int c = in->get(); 1467 void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag ) 1468 { 1469 while ( in->good() ) 1470 { 1471 int c = in->peek(); 1472 if ( !cdata && (c == '<' ) ) 1473 { 1474 return; 1475 } 1392 1476 if ( c <= 0 ) 1393 1477 { … … 1399 1483 1400 1484 (*tag) += (char) c; 1401 1402 if ( c == '>' 1403 && tag->at( tag->length() - 2 ) == ']' 1404 && tag->at( tag->length() - 3 ) == ']' ) 1405 { 1406 // All is well. 1407 return; 1408 } 1409 } 1410 else 1411 { 1412 while ( in->good() ) 1413 { 1414 int c = in->peek(); 1415 if ( c == '<' ) 1485 in->get(); // "commits" the peek made above 1486 1487 if ( cdata && c == '>' && tag->size() >= 3 ) { 1488 size_t len = tag->size(); 1489 if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) { 1490 // terminator of cdata. 
1416 1491 return; 1417 if ( c <= 0 ) 1418 { 1419 TiXmlDocument* document = GetDocument(); 1420 if ( document ) 1421 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); 1422 return; 1423 } 1424 1425 (*tag) += (char) c; 1426 in->get(); 1492 } 1427 1493 } 1428 1494 } … … 1464 1530 } 1465 1531 1466 TIXML_STRING dummy; 1532 TIXML_STRING dummy; 1467 1533 p = ReadText( p, &dummy, false, endTag, false, encoding ); 1468 1534 return p; … … 1481 1547 1482 1548 #ifdef TIXML_USE_STL 1483 void TiXmlDeclaration::StreamIn( TIXML_ISTREAM* in, TIXML_STRING * tag )1549 void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag ) 1484 1550 { 1485 1551 while ( in->good() ) … … 1538 1604 { 1539 1605 TiXmlAttribute attrib; 1540 p = attrib.Parse( p, data, _encoding ); 1606 p = attrib.Parse( p, data, _encoding ); 1541 1607 version = attrib.Value(); 1542 1608 } … … 1544 1610 { 1545 1611 TiXmlAttribute attrib; 1546 p = attrib.Parse( p, data, _encoding ); 1612 p = attrib.Parse( p, data, _encoding ); 1547 1613 encoding = attrib.Value(); 1548 1614 } … … 1550 1616 { 1551 1617 TiXmlAttribute attrib; 1552 p = attrib.Parse( p, data, _encoding ); 1618 p = attrib.Parse( p, data, _encoding ); 1553 1619 standalone = attrib.Value(); 1554 1620 }
Note: See TracChangeset for help on using the changeset viewer.