Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: orxonox.OLD/orxonox/trunk/src/lib/tinyxml/tinyxmlparser.cc @ 4491

Last change on this file since 4491 was 4491, checked in by bensch, 20 years ago
orxonox/trunk: new tinyXML-lib installed (v-2.3.4)
File size: 32.8 KB

Line
1	/*
2	www.sourceforge.net/projects/tinyxml
3	Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
4
5	This software is provided 'as-is', without any express or implied
6	warranty. In no event will the authors be held liable for any
7	damages arising from the use of this software.
8
9	Permission is granted to anyone to use this software for any
10	purpose, including commercial applications, and to alter it and
11	redistribute it freely, subject to the following restrictions:
12
13	1. The origin of this software must not be misrepresented; you must
14	not claim that you wrote the original software. If you use this
15	software in a product, an acknowledgment in the product documentation
16	would be appreciated but is not required.
17
18	2. Altered source versions must be plainly marked as such, and
19	must not be misrepresented as being the original software.
20
21	3. This notice may not be removed or altered from any source
22	distribution.
23	*/
24
25	#include "tinyxml.h"
26	#include <ctype.h>
27	#include <stddef.h>
28
29	//#define DEBUG_PARSER
30
31	// Note tha "PutString" hardcodes the same list. This
32	// is less flexible than it appears. Changing the entries
33	// or order will break putstring.
34	TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
35	{
36	{ "&", 5, '&' },
37	{ "<", 4, '<' },
38	{ ">", 4, '>' },
39	{ """, 6, '\"' },
40	{ "'", 6, '\'' }
41	};
42
// Unicode background reading:
//   http://www.unicode.org/faq/utf_bom.html
// That page is also the basis of the lead-byte table in this file, which
// gives the number of bytes in a sequence from its first byte. Invalid
// lead bytes are given a length of 1: the result will be junk, but it is
// passed through as much as possible.
//
// Three-byte sequences that get special treatment in UTF-8 input:
//   ef bb bf   (Microsoft "lead bytes")
//   ef bf be
//   ef bf bf

// The three bytes of the Microsoft UTF-8 lead sequence, in order.
const unsigned char TIXML_UTF_LEAD_0 = 0xefU,
                    TIXML_UTF_LEAD_1 = 0xbbU,
                    TIXML_UTF_LEAD_2 = 0xbfU;
56
57	const int TiXmlBase::utf8ByteTable[256] =
58	{
59	// 0 1 2 3 4 5 6 7 8 9 a b c d e f
60	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
61	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
62	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
63	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
64	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
65	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
66	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
67	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
68	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
69	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
70	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
71	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
72	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
73	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
74	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
75	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
76	};
77
78
79	void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
80	{
81	const unsigned long BYTE_MASK = 0xBF;
82	const unsigned long BYTE_MARK = 0x80;
83	const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
84
85	if (input < 0x80)
86	*length = 1;
87	else if ( input < 0x800 )
88	*length = 2;
89	else if ( input < 0x10000 )
90	*length = 3;
91	else if ( input < 0x200000 )
92	*length = 4;
93	else
94	{ *length = 0; return; } // This code won't covert this correctly anyway.
95
96	output += *length;
97
98	// Scary scary fall throughs.
99	switch (*length)
100	{
101	case 4:
102	--output;
103	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
104	input >>= 6;
105	case 3:
106	--output;
107	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
108	input >>= 6;
109	case 2:
110	--output;
111	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
112	input >>= 6;
113	case 1:
114	--output;
115	output = (char)(input \| FIRST_BYTE_MARK[length]);
116	}
117	}
118
119
120	/static/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /encoding/ )
121	{
122	// This will only work for low-ascii, everything else is assumed to be a valid
123	// letter. I'm not sure this is the best approach, but it is quite tricky trying
124	// to figure out alhabetical vs. not across encoding. So take a very
125	// conservative approach.
126
127	// if ( encoding == TIXML_ENCODING_UTF8 )
128	// {
129	if ( anyByte < 127 )
130	return isalpha( anyByte );
131	else
132	return 1; // What else to do? The unicode set is huge...get the english ones right.
133	// }
134	// else
135	// {
136	// return isalpha( anyByte );
137	// }
138	}
139
140
141	/static/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /encoding/ )
142	{
143	// This will only work for low-ascii, everything else is assumed to be a valid
144	// letter. I'm not sure this is the best approach, but it is quite tricky trying
145	// to figure out alhabetical vs. not across encoding. So take a very
146	// conservative approach.
147
148	// if ( encoding == TIXML_ENCODING_UTF8 )
149	// {
150	if ( anyByte < 127 )
151	return isalnum( anyByte );
152	else
153	return 1; // What else to do? The unicode set is huge...get the english ones right.
154	// }
155	// else
156	// {
157	// return isalnum( anyByte );
158	// }
159	}
160
161
162	class TiXmlParsingData
163	{
164	friend class TiXmlDocument;
165	public:
166	void Stamp( const char* now, TiXmlEncoding encoding );
167
168	const TiXmlCursor& Cursor() { return cursor; }
169
170	private:
171	// Only used by the document!
172	TiXmlParsingData( const char* start, int _tabsize, int row, int col )
173	{
174	assert( start );
175	stamp = start;
176	tabsize = _tabsize;
177	cursor.row = row;
178	cursor.col = col;
179	}
180
181	TiXmlCursor cursor;
182	const char* stamp;
183	int tabsize;
184	};
185
186
187	void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
188	{
189	assert( now );
190
191	// Do nothing if the tabsize is 0.
192	if ( tabsize < 1 )
193	{
194	return;
195	}
196
197	// Get the current row, column.
198	int row = cursor.row;
199	int col = cursor.col;
200	const char* p = stamp;
201	assert( p );
202
203	while ( p < now )
204	{
205	// Treat p as unsigned, so we have a happy compiler.
206	const unsigned char* pU = (const unsigned char*)p;
207
208	// Code contributed by Fletcher Dunn: (modified by lee)
209	switch (*pU) {
210	case 0:
211	// We should never get here, but in case we do, don't
212	// advance past the terminating null character, ever
213	return;
214
215	case '\r':
216	// bump down to the next line
217	++row;
218	col = 0;
219	// Eat the character
220	++p;
221
222	// Check for \r\n sequence, and treat this as a single character
223	if (*p == '\n') {
224	++p;
225	}
226	break;
227
228	case '\n':
229	// bump down to the next line
230	++row;
231	col = 0;
232
233	// Eat the character
234	++p;
235
236	// Check for \n\r sequence, and treat this as a single
237	// character. (Yes, this bizarre thing does occur still
238	// on some arcane platforms...)
239	if (*p == '\r') {
240	++p;
241	}
242	break;
243
244	case '\t':
245	// Eat the character
246	++p;
247
248	// Skip to next tab stop
249	col = (col / tabsize + 1) * tabsize;
250	break;
251
252	case TIXML_UTF_LEAD_0:
253	if ( encoding == TIXML_ENCODING_UTF8 )
254	{
255	if ( (p+1) && (p+2) )
256	{
257	// In these cases, don't advance the column. These are
258	// 0-width spaces.
259	if ( (pU+1)==TIXML_UTF_LEAD_1 && (pU+2)==TIXML_UTF_LEAD_2 )
260	p += 3;
261	else if ( (pU+1)==0xbfU && (pU+2)==0xbeU )
262	p += 3;
263	else if ( (pU+1)==0xbfU && (pU+2)==0xbfU )
264	p += 3;
265	else
266	{ p +=3; ++col; } // A normal character.
267	}
268	}
269	else
270	{
271	++p;
272	++col;
273	}
274	break;
275
276	default:
277	if ( encoding == TIXML_ENCODING_UTF8 )
278	{
279	// Eat the 1 to 4 byte utf8 character.
280	int step = TiXmlBase::utf8ByteTable[((unsigned char)p)];
281	if ( step == 0 )
282	step = 1; // Error case from bad encoding, but handle gracefully.
283	p += step;
284
285	// Just advance one column, of course.
286	++col;
287	}
288	else
289	{
290	++p;
291	++col;
292	}
293	break;
294	}
295	}
296	cursor.row = row;
297	cursor.col = col;
298	assert( cursor.row >= -1 );
299	assert( cursor.col >= -1 );
300	stamp = p;
301	assert( stamp );
302	}
303
304
305	const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
306	{
307	if ( !p \|\| !*p )
308	{
309	return 0;
310	}
311	if ( encoding == TIXML_ENCODING_UTF8 )
312	{
313	while ( *p )
314	{
315	const unsigned char* pU = (const unsigned char*)p;
316
317	// Skip the stupid Microsoft UTF-8 Byte order marks
318	if ( *(pU+0)==TIXML_UTF_LEAD_0
319	&& *(pU+1)==TIXML_UTF_LEAD_1
320	&& *(pU+2)==TIXML_UTF_LEAD_2 )
321	{
322	p += 3;
323	continue;
324	}
325	else if(*(pU+0)==TIXML_UTF_LEAD_0
326	&& *(pU+1)==0xbfU
327	&& *(pU+2)==0xbeU )
328	{
329	p += 3;
330	continue;
331	}
332	else if(*(pU+0)==TIXML_UTF_LEAD_0
333	&& *(pU+1)==0xbfU
334	&& *(pU+2)==0xbfU )
335	{
336	p += 3;
337	continue;
338	}
339
340	if ( IsWhiteSpace( p ) \|\| p == '\n' \|\| *p =='\r' ) // Still using old rules for white space.
341	++p;
342	else
343	break;
344	}
345	}
346	else
347	{
348	while ( p && IsWhiteSpace( p ) \|\| p == '\n' \|\| p =='\r' )
349	++p;
350	}
351
352	return p;
353	}
354
355	#ifdef TIXML_USE_STL
356	/static/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag )
357	{
358	for( ;; )
359	{
360	if ( !in->good() ) return false;
361
362	int c = in->peek();
363	// At this scope, we can't get to a document. So fail silently.
364	if ( !IsWhiteSpace( c ) \|\| c <= 0 )
365	return true;
366
367	*tag += (char) in->get();
368	}
369	}
370
371	/static/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag )
372	{
373	//assert( character > 0 && character < 128 ); // else it won't work in utf-8
374	while ( in->good() )
375	{
376	int c = in->peek();
377	if ( c == character )
378	return true;
379	if ( c <= 0 ) // Silent failure: can't get document at this scope
380	return false;
381
382	in->get();
383	*tag += (char) c;
384	}
385	return false;
386	}
387	#endif
388
389	const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
390	{
391	*name = "";
392	assert( p );
393
394	// Names start with letters or underscores.
395	// Of course, in unicode, tinyxml has no idea what a letter is. The
396	// algorithm is generous.
397	//
398	// After that, they can be letters, underscores, numbers,
399	// hyphens, or colons. (Colons are valid ony for namespaces,
400	// but tinyxml can't tell namespaces from names.)
401	if ( p && *p
402	&& ( IsAlpha( (unsigned char) p, encoding ) \|\| p == '_' ) )
403	{
404	while( p && *p
405	&& ( IsAlphaNum( (unsigned char ) *p, encoding )
406	\|\| *p == '_'
407	\|\| *p == '-'
408	\|\| *p == '.'
409	\|\| *p == ':' ) )
410	{
411	(name) += p;
412	++p;
413	}
414	return p;
415	}
416	return 0;
417	}
418
419	const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
420	{
421	// Presume an entity, and pull it out.
422	TIXML_STRING ent;
423	int i;
424	*length = 0;
425
426	if ( (p+1) && (p+1) == '#' && *(p+2) )
427	{
428	unsigned long ucs = 0;
429	ptrdiff_t delta = 0;
430	unsigned mult = 1;
431
432	if ( *(p+2) == 'x' )
433	{
434	// Hexadecimal.
435	if ( !*(p+3) ) return 0;
436
437	const char* q = p+3;
438	q = strchr( q, ';' );
439
440	if ( !q \|\| !*q ) return 0;
441
442	delta = q-p;
443	--q;
444
445	while ( *q != 'x' )
446	{
447	if ( q >= '0' && q <= '9' )
448	ucs += mult * (*q - '0');
449	else if ( q >= 'a' && q <= 'f' )
450	ucs += mult * (*q - 'a' + 10);
451	else if ( q >= 'A' && q <= 'F' )
452	ucs += mult * (*q - 'A' + 10 );
453	else
454	return 0;
455	mult *= 16;
456	--q;
457	}
458	}
459	else
460	{
461	// Decimal.
462	if ( !*(p+2) ) return 0;
463
464	const char* q = p+2;
465	q = strchr( q, ';' );
466
467	if ( !q \|\| !*q ) return 0;
468
469	delta = q-p;
470	--q;
471
472	while ( *q != '#' )
473	{
474	if ( q >= '0' && q <= '9' )
475	ucs += mult * (*q - '0');
476	else
477	return 0;
478	mult *= 10;
479	--q;
480	}
481	}
482	if ( encoding == TIXML_ENCODING_UTF8 )
483	{
484	// convert the UCS to UTF-8
485	ConvertUTF32ToUTF8( ucs, value, length );
486	}
487	else
488	{
489	*value = (char)ucs;
490	*length = 1;
491	}
492	return p + delta + 1;
493	}
494
495	// Now try to match it.
496	for( i=0; i<NUM_ENTITY; ++i )
497	{
498	if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
499	{
500	assert( strlen( entity[i].str ) == entity[i].strLength );
501	*value = entity[i].chr;
502	*length = 1;
503	return ( p + entity[i].strLength );
504	}
505	}
506
507	// So it wasn't an entity, its unrecognized, or something like that.
508	value = p; // Don't put back the last one, since we return it!
509	return p+1;
510	}
511
512
513	bool TiXmlBase::StringEqual( const char* p,
514	const char* tag,
515	bool ignoreCase,
516	TiXmlEncoding encoding )
517	{
518	assert( p );
519	assert( tag );
520	if ( !p \|\| !*p )
521	{
522	assert( 0 );
523	return false;
524	}
525
526	const char* q = p;
527
528	if ( ignoreCase )
529	{
530	while ( q && tag && ToLower( q, encoding ) == ToLower( tag, encoding ) )
531	{
532	++q;
533	++tag;
534	}
535
536	if ( *tag == 0 )
537	return true;
538	}
539	else
540	{
541	while ( q && tag && q == tag )
542	{
543	++q;
544	++tag;
545	}
546
547	if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
548	return true;
549	}
550	return false;
551	}
552
553	const char* TiXmlBase::ReadText( const char* p,
554	TIXML_STRING * text,
555	bool trimWhiteSpace,
556	const char* endTag,
557	bool caseInsensitive,
558	TiXmlEncoding encoding )
559	{
560	*text = "";
561	if ( !trimWhiteSpace // certain tags always keep whitespace
562	\|\| !condenseWhiteSpace ) // if true, whitespace is always kept
563	{
564	// Keep all the white space.
565	while ( p && *p
566	&& !StringEqual( p, endTag, caseInsensitive, encoding )
567	)
568	{
569	int len;
570	char cArr[4] = { 0, 0, 0, 0 };
571	p = GetChar( p, cArr, &len, encoding );
572	text->append( cArr, len );
573	}
574	}
575	else
576	{
577	bool whitespace = false;
578
579	// Remove leading white space:
580	p = SkipWhiteSpace( p, encoding );
581	while ( p && *p
582	&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
583	{
584	if ( p == '\r' \|\| p == '\n' )
585	{
586	whitespace = true;
587	++p;
588	}
589	else if ( IsWhiteSpace( *p ) )
590	{
591	whitespace = true;
592	++p;
593	}
594	else
595	{
596	// If we've found whitespace, add it before the
597	// new character. Any whitespace just becomes a space.
598	if ( whitespace )
599	{
600	(*text) += ' ';
601	whitespace = false;
602	}
603	int len;
604	char cArr[4] = { 0, 0, 0, 0 };
605	p = GetChar( p, cArr, &len, encoding );
606	if ( len == 1 )
607	(*text) += cArr[0]; // more efficient
608	else
609	text->append( cArr, len );
610	}
611	}
612	}
613	return p + strlen( endTag );
614	}
615
616	#ifdef TIXML_USE_STL
617
618	void TiXmlDocument::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
619	{
620	// The basic issue with a document is that we don't know what we're
621	// streaming. Read something presumed to be a tag (and hope), then
622	// identify it, and call the appropriate stream method on the tag.
623	//
624	// This "pre-streaming" will never read the closing ">" so the
625	// sub-tag can orient itself.
626
627	if ( !StreamTo( in, '<', tag ) )
628	{
629	SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
630	return;
631	}
632
633	while ( in->good() )
634	{
635	int tagIndex = (int) tag->length();
636	while ( in->good() && in->peek() != '>' )
637	{
638	int c = in->get();
639	if ( c <= 0 )
640	{
641	SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
642	break;
643	}
644	(*tag) += (char) c;
645	}
646
647	if ( in->good() )
648	{
649	// We now have something we presume to be a node of
650	// some sort. Identify it, and call the node to
651	// continue streaming.
652	TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
653
654	if ( node )
655	{
656	node->StreamIn( in, tag );
657	bool isElement = node->ToElement() != 0;
658	delete node;
659	node = 0;
660
661	// If this is the root element, we're done. Parsing will be
662	// done by the >> operator.
663	if ( isElement )
664	{
665	return;
666	}
667	}
668	else
669	{
670	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
671	return;
672	}
673	}
674	}
675	// We should have returned sooner.
676	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
677	}
678
679	#endif
680
681	const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
682	{
683	ClearError();
684
685	// Parse away, at the document level. Since a document
686	// contains nothing but other tags, most of what happens
687	// here is skipping white space.
688	if ( !p \|\| !*p )
689	{
690	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
691	return 0;
692	}
693
694	// Note that, for a document, this needs to come
695	// before the while space skip, so that parsing
696	// starts from the pointer we are given.
697	location.Clear();
698	if ( prevData )
699	{
700	location.row = prevData->cursor.row;
701	location.col = prevData->cursor.col;
702	}
703	else
704	{
705	location.row = 0;
706	location.col = 0;
707	}
708	TiXmlParsingData data( p, TabSize(), location.row, location.col );
709	location = data.Cursor();
710
711	if ( encoding == TIXML_ENCODING_UNKNOWN )
712	{
713	// Check for the Microsoft UTF-8 lead bytes.
714	const unsigned char* pU = (const unsigned char*)p;
715	if ( (pU+0) && (pU+0) == TIXML_UTF_LEAD_0
716	&& (pU+1) && (pU+1) == TIXML_UTF_LEAD_1
717	&& (pU+2) && (pU+2) == TIXML_UTF_LEAD_2 )
718	{
719	encoding = TIXML_ENCODING_UTF8;
720	}
721	}
722
723	p = SkipWhiteSpace( p, encoding );
724	if ( !p )
725	{
726	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
727	return 0;
728	}
729
730	while ( p && *p )
731	{
732	TiXmlNode* node = Identify( p, encoding );
733	if ( node )
734	{
735	p = node->Parse( p, &data, encoding );
736	LinkEndChild( node );
737	}
738	else
739	{
740	break;
741	}
742
743	// Did we get encoding info?
744	if ( encoding == TIXML_ENCODING_UNKNOWN
745	&& node->ToDeclaration() )
746	{
747	TiXmlDeclaration* dec = node->ToDeclaration();
748	const char* enc = dec->Encoding();
749	assert( enc );
750
751	if ( *enc == 0 )
752	encoding = TIXML_ENCODING_UTF8;
753	else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
754	encoding = TIXML_ENCODING_UTF8;
755	else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
756	encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
757	else
758	encoding = TIXML_ENCODING_LEGACY;
759	}
760
761	p = SkipWhiteSpace( p, encoding );
762	}
763
764	// Was this empty?
765	if ( !firstChild ) {
766	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
767	return 0;
768	}
769
770	// All is well.
771	return p;
772	}
773
774	void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
775	{
776	// The first error in a chain is more accurate - don't set again!
777	if ( error )
778	return;
779
780	assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
781	error = true;
782	errorId = err;
783	errorDesc = errorString[ errorId ];
784
785	errorLocation.Clear();
786	if ( pError && data )
787	{
788	//TiXmlParsingData data( pError, prevData );
789	data->Stamp( pError, encoding );
790	errorLocation = data->Cursor();
791	}
792	}
793
794
795	TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
796	{
797	TiXmlNode* returnNode = 0;
798
799	p = SkipWhiteSpace( p, encoding );
800	if( !p \|\| !p \|\| p != '<' )
801	{
802	return 0;
803	}
804
805	TiXmlDocument* doc = GetDocument();
806	p = SkipWhiteSpace( p, encoding );
807
808	if ( !p \|\| !*p )
809	{
810	return 0;
811	}
812
813	// What is this thing?
814	// - Elements start with a letter or underscore, but xml is reserved.
815	// - Comments: <!--
816	// - Decleration: <?xml
817	// - Everthing else is unknown to tinyxml.
818	//
819
820	const char* xmlHeader = { "<?xml" };
821	const char* commentHeader = { "<!--" };
822	const char* dtdHeader = { "<!" };
823
824	if ( StringEqual( p, xmlHeader, true, encoding ) )
825	{
826	#ifdef DEBUG_PARSER
827	TIXML_LOG( "XML parsing Declaration\n" );
828	#endif
829	returnNode = new TiXmlDeclaration();
830	}
831	else if ( StringEqual( p, commentHeader, false, encoding ) )
832	{
833	#ifdef DEBUG_PARSER
834	TIXML_LOG( "XML parsing Comment\n" );
835	#endif
836	returnNode = new TiXmlComment();
837	}
838	else if ( StringEqual( p, dtdHeader, false, encoding ) )
839	{
840	#ifdef DEBUG_PARSER
841	TIXML_LOG( "XML parsing Unknown(1)\n" );
842	#endif
843	returnNode = new TiXmlUnknown();
844	}
845	else if ( IsAlpha( *(p+1), encoding )
846	\|\| *(p+1) == '_' )
847	{
848	#ifdef DEBUG_PARSER
849	TIXML_LOG( "XML parsing Element\n" );
850	#endif
851	returnNode = new TiXmlElement( "" );
852	}
853	else
854	{
855	#ifdef DEBUG_PARSER
856	TIXML_LOG( "XML parsing Unknown(2)\n" );
857	#endif
858	returnNode = new TiXmlUnknown();
859	}
860
861	if ( returnNode )
862	{
863	// Set the parent, so it can report errors
864	returnNode->parent = this;
865	}
866	else
867	{
868	if ( doc )
869	doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
870	}
871	return returnNode;
872	}
873
874	#ifdef TIXML_USE_STL
875
876	void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
877	{
878	// We're called with some amount of pre-parsing. That is, some of "this"
879	// element is in "tag". Go ahead and stream to the closing ">"
880	while( in->good() )
881	{
882	int c = in->get();
883	if ( c <= 0 )
884	{
885	TiXmlDocument* document = GetDocument();
886	if ( document )
887	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
888	return;
889	}
890	(*tag) += (char) c ;
891
892	if ( c == '>' )
893	break;
894	}
895
896	if ( tag->length() < 3 ) return;
897
898	// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
899	// If not, identify and stream.
900
901	if ( tag->at( tag->length() - 1 ) == '>'
902	&& tag->at( tag->length() - 2 ) == '/' )
903	{
904	// All good!
905	return;
906	}
907	else if ( tag->at( tag->length() - 1 ) == '>' )
908	{
909	// There is more. Could be:
910	// text
911	// closing tag
912	// another node.
913	for ( ;; )
914	{
915	StreamWhiteSpace( in, tag );
916
917	// Do we have text?
918	if ( in->good() && in->peek() != '<' )
919	{
920	// Yep, text.
921	TiXmlText text( "" );
922	text.StreamIn( in, tag );
923
924	// What follows text is a closing tag or another node.
925	// Go around again and figure it out.
926	continue;
927	}
928
929	// We now have either a closing tag...or another node.
930	// We should be at a "<", regardless.
931	if ( !in->good() ) return;
932	assert( in->peek() == '<' );
933	int tagIndex = tag->length();
934
935	bool closingTag = false;
936	bool firstCharFound = false;
937
938	for( ;; )
939	{
940	if ( !in->good() )
941	return;
942
943	int c = in->peek();
944	if ( c <= 0 )
945	{
946	TiXmlDocument* document = GetDocument();
947	if ( document )
948	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
949	return;
950	}
951
952	if ( c == '>' )
953	break;
954
955	*tag += (char) c;
956	in->get();
957
958	if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
959	{
960	firstCharFound = true;
961	if ( c == '/' )
962	closingTag = true;
963	}
964	}
965	// If it was a closing tag, then read in the closing '>' to clean up the input stream.
966	// If it was not, the streaming will be done by the tag.
967	if ( closingTag )
968	{
969	if ( !in->good() )
970	return;
971
972	int c = in->get();
973	if ( c <= 0 )
974	{
975	TiXmlDocument* document = GetDocument();
976	if ( document )
977	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
978	return;
979	}
980	assert( c == '>' );
981	*tag += (char) c;
982
983	// We are done, once we've found our closing tag.
984	return;
985	}
986	else
987	{
988	// If not a closing tag, id it, and stream.
989	const char* tagloc = tag->c_str() + tagIndex;
990	TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
991	if ( !node )
992	return;
993	node->StreamIn( in, tag );
994	delete node;
995	node = 0;
996
997	// No return: go around from the beginning: text, closing tag, or node.
998	}
999	}
1000	}
1001	}
1002	#endif
1003
1004	const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1005	{
1006	p = SkipWhiteSpace( p, encoding );
1007	TiXmlDocument* document = GetDocument();
1008
1009	if ( !p \|\| !*p )
1010	{
1011	if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1012	return 0;
1013	}
1014
1015	// TiXmlParsingData data( p, prevData );
1016	if ( data )
1017	{
1018	data->Stamp( p, encoding );
1019	location = data->Cursor();
1020	}
1021
1022	if ( *p != '<' )
1023	{
1024	if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1025	return 0;
1026	}
1027
1028	p = SkipWhiteSpace( p+1, encoding );
1029
1030	// Read the name.
1031	const char* pErr = p;
1032
1033	p = ReadName( p, &value, encoding );
1034	if ( !p \|\| !*p )
1035	{
1036	if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1037	return 0;
1038	}
1039
1040	TIXML_STRING endTag ("</");
1041	endTag += value;
1042	endTag += ">";
1043
1044	// Check for and read attributes. Also look for an empty
1045	// tag or an end tag.
1046	while ( p && *p )
1047	{
1048	pErr = p;
1049	p = SkipWhiteSpace( p, encoding );
1050	if ( !p \|\| !*p )
1051	{
1052	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1053	return 0;
1054	}
1055	if ( *p == '/' )
1056	{
1057	++p;
1058	// Empty tag.
1059	if ( *p != '>' )
1060	{
1061	if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1062	return 0;
1063	}
1064	return (p+1);
1065	}
1066	else if ( *p == '>' )
1067	{
1068	// Done with attributes (if there were any.)
1069	// Read the value -- which can include other
1070	// elements -- read the end tag, and return.
1071	++p;
1072	p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens.
1073	if ( !p \|\| !*p )
1074	return 0;
1075
1076	// We should find the end tag now
1077	if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1078	{
1079	p += endTag.length();
1080	return p;
1081	}
1082	else
1083	{
1084	if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1085	return 0;
1086	}
1087	}
1088	else
1089	{
1090	// Try to read an attribute:
1091	TiXmlAttribute* attrib = new TiXmlAttribute();
1092	if ( !attrib )
1093	{
1094	if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
1095	return 0;
1096	}
1097
1098	attrib->SetDocument( document );
1099	const char* pErr = p;
1100	p = attrib->Parse( p, data, encoding );
1101
1102	if ( !p \|\| !*p )
1103	{
1104	if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1105	delete attrib;
1106	return 0;
1107	}
1108
1109	// Handle the strange case of double attributes:
1110	TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1111	if ( node )
1112	{
1113	node->SetValue( attrib->Value() );
1114	delete attrib;
1115	return 0;
1116	}
1117
1118	attributeSet.Add( attrib );
1119	}
1120	}
1121	return p;
1122	}
1123
1124
1125	const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1126	{
1127	TiXmlDocument* document = GetDocument();
1128
1129	const char* pWithWhiteSpace = p;
1130	// Read in text and elements in any order.
1131	p = SkipWhiteSpace( p, encoding );
1132	while ( p && *p )
1133	{
1134	if ( *p != '<' )
1135	{
1136	// Take what we have, make a text element.
1137	TiXmlText* textNode = new TiXmlText( "" );
1138
1139	if ( !textNode )
1140	{
1141	if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
1142	return 0;
1143	}
1144
1145	if ( TiXmlBase::IsWhiteSpaceCondensed() )
1146	{
1147	p = textNode->Parse( p, data, encoding );
1148	}
1149	else
1150	{
1151	// Special case: we want to keep the white space
1152	// so that leading spaces aren't removed.
1153	p = textNode->Parse( pWithWhiteSpace, data, encoding );
1154	}
1155
1156	if ( !textNode->Blank() )
1157	LinkEndChild( textNode );
1158	else
1159	delete textNode;
1160	}
1161	else
1162	{
1163	// We hit a '<'
1164	// Have we hit a new element or an end tag?
1165	if ( StringEqual( p, "</", false, encoding ) )
1166	{
1167	return p;
1168	}
1169	else
1170	{
1171	TiXmlNode* node = Identify( p, encoding );
1172	if ( node )
1173	{
1174	p = node->Parse( p, data, encoding );
1175	LinkEndChild( node );
1176	}
1177	else
1178	{
1179	return 0;
1180	}
1181	}
1182	}
1183	p = SkipWhiteSpace( p, encoding );
1184	}
1185
1186	if ( !p )
1187	{
1188	if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1189	}
1190	return p;
1191	}
1192
1193
1194	#ifdef TIXML_USE_STL
1195	void TiXmlUnknown::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1196	{
1197	while ( in->good() )
1198	{
1199	int c = in->get();
1200	if ( c <= 0 )
1201	{
1202	TiXmlDocument* document = GetDocument();
1203	if ( document )
1204	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1205	return;
1206	}
1207	(*tag) += (char) c;
1208
1209	if ( c == '>' )
1210	{
1211	// All is well.
1212	return;
1213	}
1214	}
1215	}
1216	#endif
1217
1218
1219	const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1220	{
1221	TiXmlDocument* document = GetDocument();
1222	p = SkipWhiteSpace( p, encoding );
1223
1224	// TiXmlParsingData data( p, prevData );
1225	if ( data )
1226	{
1227	data->Stamp( p, encoding );
1228	location = data->Cursor();
1229	}
1230	if ( !p \|\| !p \|\| p != '<' )
1231	{
1232	if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1233	return 0;
1234	}
1235	++p;
1236	value = "";
1237
1238	while ( p && p && p != '>' )
1239	{
1240	value += *p;
1241	++p;
1242	}
1243
1244	if ( !p )
1245	{
1246	if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1247	}
1248	if ( *p == '>' )
1249	return p+1;
1250	return p;
1251	}
1252
1253	#ifdef TIXML_USE_STL
1254	void TiXmlComment::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1255	{
1256	while ( in->good() )
1257	{
1258	int c = in->get();
1259	if ( c <= 0 )
1260	{
1261	TiXmlDocument* document = GetDocument();
1262	if ( document )
1263	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1264	return;
1265	}
1266
1267	(*tag) += (char) c;
1268
1269	if ( c == '>'
1270	&& tag->at( tag->length() - 2 ) == '-'
1271	&& tag->at( tag->length() - 3 ) == '-' )
1272	{
1273	// All is well.
1274	return;
1275	}
1276	}
1277	}
1278	#endif
1279
1280
1281	const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1282	{
1283	TiXmlDocument* document = GetDocument();
1284	value = "";
1285
1286	p = SkipWhiteSpace( p, encoding );
1287
1288	// TiXmlParsingData data( p, prevData );
1289	if ( data )
1290	{
1291	data->Stamp( p, encoding );
1292	location = data->Cursor();
1293	}
1294	const char* startTag = "<!--";
1295	const char* endTag = "-->";
1296
1297	if ( !StringEqual( p, startTag, false, encoding ) )
1298	{
1299	document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1300	return 0;
1301	}
1302	p += strlen( startTag );
1303	p = ReadText( p, &value, false, endTag, false, encoding );
1304	return p;
1305	}
1306
1307
1308	const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1309	{
1310	p = SkipWhiteSpace( p, encoding );
1311	if ( !p \|\| !*p ) return 0;
1312
1313	int tabsize = 4;
1314	if ( document )
1315	tabsize = document->TabSize();
1316
1317	// TiXmlParsingData data( p, prevData );
1318	if ( data )
1319	{
1320	data->Stamp( p, encoding );
1321	location = data->Cursor();
1322	}
1323	// Read the name, the '=' and the value.
1324	const char* pErr = p;
1325	p = ReadName( p, &name, encoding );
1326	if ( !p \|\| !*p )
1327	{
1328	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1329	return 0;
1330	}
1331	p = SkipWhiteSpace( p, encoding );
1332	if ( !p \|\| !p \|\| p != '=' )
1333	{
1334	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1335	return 0;
1336	}
1337
1338	++p; // skip '='
1339	p = SkipWhiteSpace( p, encoding );
1340	if ( !p \|\| !*p )
1341	{
1342	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1343	return 0;
1344	}
1345
1346	const char* end;
1347
1348	if ( *p == '\'' )
1349	{
1350	++p;
1351	end = "\'";
1352	p = ReadText( p, &value, false, end, false, encoding );
1353	}
1354	else if ( *p == '"' )
1355	{
1356	++p;
1357	end = "\"";
1358	p = ReadText( p, &value, false, end, false, encoding );
1359	}
1360	else
1361	{
1362	// All attribute values should be in single or double quotes.
1363	// But this is such a common error that the parser will try
1364	// its best, even without them.
1365	value = "";
1366	while ( p && *p // existence
1367	&& !IsWhiteSpace( p ) && p != '\n' && *p != '\r' // whitespace
1368	&& p != '/' && p != '>' ) // tag end
1369	{
1370	value += *p;
1371	++p;
1372	}
1373	}
1374	return p;
1375	}
1376
#ifdef TIXML_USE_STL
/*
	Stream in a run of text. Characters are peeked one at a time and
	appended to *tag; the function returns, leaving the character in the
	stream, as soon as the next character is '<'.

	A peeked value <= 0 records TIXML_ERROR_EMBEDDED_NULL on the owning
	document (if there is one) and ends the read.
*/
void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
{
	// The loop step consumes the character that the body has just accepted.
	for ( ; in->good(); in->get() )
	{
		const int next = in->peek();

		if ( next == '<' )
			return;		// start of markup: leave it unread

		if ( next <= 0 )
		{
			TiXmlDocument* owner = GetDocument();
			if ( owner )
				owner->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
			return;
		}

		(*tag) += (char) next;
	}
}
#endif
1398
1399	const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1400	{
1401	value = "";
1402	// TiXmlParsingData data( p, prevData );
1403	if ( data )
1404	{
1405	data->Stamp( p, encoding );
1406	location = data->Cursor();
1407	}
1408	bool ignoreWhite = true;
1409
1410	const char* end = "<";
1411	p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1412	if ( p )
1413	return p-1; // don't truncate the '<'
1414	return 0;
1415	}
1416
#ifdef TIXML_USE_STL
/*
	Stream in a declaration. Every character read is appended to *tag;
	reading stops once a '>' has been appended.

	A value <= 0 from the stream records TIXML_ERROR_EMBEDDED_NULL on the
	owning document (if there is one) and ends the read.
*/
void TiXmlDeclaration::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
{
	for ( ;; )
	{
		if ( !in->good() )
			break;

		const int ch = in->get();

		if ( ch > 0 )
		{
			(*tag) += (char) ch;
			if ( ch == '>' )
				return;		// end of the declaration
			continue;
		}

		// End of input or an embedded null character.
		TiXmlDocument* owner = GetDocument();
		if ( owner )
			owner->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
		return;
	}
}
#endif
1440
1441	const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1442	{
1443	p = SkipWhiteSpace( p, _encoding );
1444	// Find the beginning, find the end, and look for
1445	// the stuff in-between.
1446	TiXmlDocument* document = GetDocument();
1447	if ( !p \|\| !*p \|\| !StringEqual( p, "<?xml", true, _encoding ) )
1448	{
1449	if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1450	return 0;
1451	}
1452	// TiXmlParsingData data( p, prevData );
1453	if ( data )
1454	{
1455	data->Stamp( p, _encoding );
1456	location = data->Cursor();
1457	}
1458	p += 5;
1459
1460	version = "";
1461	encoding = "";
1462	standalone = "";
1463
1464	while ( p && *p )
1465	{
1466	if ( *p == '>' )
1467	{
1468	++p;
1469	return p;
1470	}
1471
1472	p = SkipWhiteSpace( p, _encoding );
1473	if ( StringEqual( p, "version", true, _encoding ) )
1474	{
1475	TiXmlAttribute attrib;
1476	p = attrib.Parse( p, data, _encoding );
1477	version = attrib.Value();
1478	}
1479	else if ( StringEqual( p, "encoding", true, _encoding ) )
1480	{
1481	TiXmlAttribute attrib;
1482	p = attrib.Parse( p, data, _encoding );
1483	encoding = attrib.Value();
1484	}
1485	else if ( StringEqual( p, "standalone", true, _encoding ) )
1486	{
1487	TiXmlAttribute attrib;
1488	p = attrib.Parse( p, data, _encoding );
1489	standalone = attrib.Value();
1490	}
1491	else
1492	{
1493	// Read over whatever it is.
1494	while( p && p && p != '>' && !IsWhiteSpace( *p ) )
1495	++p;
1496	}
1497	}
1498	return 0;
1499	}
1500
1501	bool TiXmlText::Blank() const
1502	{
1503	for ( unsigned i=0; i<value.length(); i++ )
1504	if ( !IsWhiteSpace( value[i] ) )
1505	return false;
1506	return true;
1507	}
1508

Note: See TracBrowser for help on using the repository browser.

Download in other formats: