Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: downloads/tcl8.5.2/generic/tclUtf.c @ 25

Last change on this file since 25 was 25, checked in by landauf, 18 years ago
added tcl to libs
File size: 45.7 KB

Line
1	/*
2	* tclUtf.c --
3	*
4	* Routines for manipulating UTF-8 strings.
5	*
6	* Copyright (c) 1997-1998 Sun Microsystems, Inc.
7	*
8	* See the file "license.terms" for information on usage and redistribution of
9	* this file, and for a DISCLAIMER OF ALL WARRANTIES.
10	*
11	* RCS: @(#) $Id: tclUtf.c,v 1.37 2005/10/31 15:59:41 dkf Exp $
12	*/
13
14	#include "tclInt.h"
15
16	/*
17	* Include the static character classification tables and macros.
18	*/
19
20	#include "tclUniData.c"
21
22	/*
23	* The following macros are used for fast character category tests. The x_BITS
24	* values are shifted right by the category value to determine whether the
25	* given category is included in the set.
26	*/
27
/* Bit set covering every letter category. */
#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
    | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))

/* Bit set covering decimal digits only. */
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

/* Bit set covering the three separator categories. */
#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
    | (1 << PARAGRAPH_SEPARATOR))

/* Bit set covering connector punctuation only. */
#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/*
 * Bit set covering letters, digits, separators, marks, the remaining number
 * categories, all punctuation categories and all symbol categories.
 */
#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
    (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

/* Bit set covering every punctuation category. */
#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))

/*
 * Unicode characters less than this value are represented by themselves in
 * UTF-8 strings.
 */

#define UNICODE_SELF 0x80
59
60	/*
61	* The following structures are used when mapping between Unicode (UCS-2) and
62	* UTF-8.
63	*/
64
65	static CONST unsigned char totalBytes[256] = {
66	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
67	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
68	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
69	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
70	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
71	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
72	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
73	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
74	#if TCL_UTF_MAX > 3
75	4,4,4,4,4,4,4,4,
76	#else
77	1,1,1,1,1,1,1,1,
78	#endif
79	#if TCL_UTF_MAX > 4
80	5,5,5,5,
81	#else
82	1,1,1,1,
83	#endif
84	#if TCL_UTF_MAX > 5
85	6,6,6,6
86	#else
87	1,1,1,1
88	#endif
89	};
90
91	/*
92	* Functions used only in this module.
93	*/
94
95	static int UtfCount(int ch);
96
97	/*
98	*---------------------------------------------------------------------------
99	*
100	* UtfCount --
101	*
102	* Find the number of bytes in the Utf character "ch".
103	*
104	* Results:
105	* The return values is the number of bytes in the Utf character "ch".
106	*
107	* Side effects:
108	* None.
109	*
110	*---------------------------------------------------------------------------
111	*/
112
113	INLINE static int
114	UtfCount(
115	int ch) /* The Tcl_UniChar whose size is returned. */
116	{
117	if ((ch > 0) && (ch < UNICODE_SELF)) {
118	return 1;
119	}
120	if (ch <= 0x7FF) {
121	return 2;
122	}
123	if (ch <= 0xFFFF) {
124	return 3;
125	}
126	#if TCL_UTF_MAX > 3
127	if (ch <= 0x1FFFFF) {
128	return 4;
129	}
130	if (ch <= 0x3FFFFFF) {
131	return 5;
132	}
133	if (ch <= 0x7FFFFFFF) {
134	return 6;
135	}
136	#endif
137	return 3;
138	}
139
140	/*
141	*---------------------------------------------------------------------------
142	*
143	* Tcl_UniCharToUtf --
144	*
145	* Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
146	* provided buffer. Equivalent to Plan 9 runetochar().
147	*
148	* Results:
149	* The return value is the number of bytes in the buffer that were
150	* consumed.
151	*
152	* Side effects:
153	* None.
154	*
155	*---------------------------------------------------------------------------
156	*/
157
158	INLINE int
159	Tcl_UniCharToUtf(
160	int ch, /* The Tcl_UniChar to be stored in the
161	* buffer. */
162	char buf) / Buffer in which the UTF-8 representation of
163	* the Tcl_UniChar is stored. Buffer must be
164	* large enough to hold the UTF-8 character
165	* (at most TCL_UTF_MAX bytes). */
166	{
167	if ((ch > 0) && (ch < UNICODE_SELF)) {
168	buf[0] = (char) ch;
169	return 1;
170	}
171	if (ch >= 0) {
172	if (ch <= 0x7FF) {
173	buf[1] = (char) ((ch \| 0x80) & 0xBF);
174	buf[0] = (char) ((ch >> 6) \| 0xC0);
175	return 2;
176	}
177	if (ch <= 0xFFFF) {
178	three:
179	buf[2] = (char) ((ch \| 0x80) & 0xBF);
180	buf[1] = (char) (((ch >> 6) \| 0x80) & 0xBF);
181	buf[0] = (char) ((ch >> 12) \| 0xE0);
182	return 3;
183	}
184
185	#if TCL_UTF_MAX > 3
186	if (ch <= 0x1FFFFF) {
187	buf[3] = (char) ((ch \| 0x80) & 0xBF);
188	buf[2] = (char) (((ch >> 6) \| 0x80) & 0xBF);
189	buf[1] = (char) (((ch >> 12) \| 0x80) & 0xBF);
190	buf[0] = (char) ((ch >> 18) \| 0xF0);
191	return 4;
192	}
193	if (ch <= 0x3FFFFFF) {
194	buf[4] = (char) ((ch \| 0x80) & 0xBF);
195	buf[3] = (char) (((ch >> 6) \| 0x80) & 0xBF);
196	buf[2] = (char) (((ch >> 12) \| 0x80) & 0xBF);
197	buf[1] = (char) (((ch >> 18) \| 0x80) & 0xBF);
198	buf[0] = (char) ((ch >> 24) \| 0xF8);
199	return 5;
200	}
201	if (ch <= 0x7FFFFFFF) {
202	buf[5] = (char) ((ch \| 0x80) & 0xBF);
203	buf[4] = (char) (((ch >> 6) \| 0x80) & 0xBF);
204	buf[3] = (char) (((ch >> 12) \| 0x80) & 0xBF);
205	buf[2] = (char) (((ch >> 18) \| 0x80) & 0xBF);
206	buf[1] = (char) (((ch >> 24) \| 0x80) & 0xBF);
207	buf[0] = (char) ((ch >> 30) \| 0xFC);
208	return 6;
209	}
210	#endif
211	}
212
213	ch = 0xFFFD;
214	goto three;
215	}
216
217	/*
218	*---------------------------------------------------------------------------
219	*
220	* Tcl_UniCharToUtfDString --
221	*
222	* Convert the given Unicode string to UTF-8.
223	*
224	* Results:
225	* The return value is a pointer to the UTF-8 representation of the
226	* Unicode string. Storage for the return value is appended to the end of
227	* dsPtr.
228	*
229	* Side effects:
230	* None.
231	*
232	*---------------------------------------------------------------------------
233	*/
234
235	char *
236	Tcl_UniCharToUtfDString(
237	CONST Tcl_UniChar uniStr, / Unicode string to convert to UTF-8. */
238	int uniLength, /* Length of Unicode string in Tcl_UniChars
239	* (must be >= 0). */
240	Tcl_DString dsPtr) / UTF-8 representation of string is appended
241	* to this previously initialized DString. */
242	{
243	CONST Tcl_UniChar w, wEnd;
244	char p, string;
245	int oldLength;
246
247	/*
248	* UTF-8 string length in bytes will be <= Unicode string length *
249	* TCL_UTF_MAX.
250	*/
251
252	oldLength = Tcl_DStringLength(dsPtr);
253	Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
254	string = Tcl_DStringValue(dsPtr) + oldLength;
255
256	p = string;
257	wEnd = uniStr + uniLength;
258	for (w = uniStr; w < wEnd; ) {
259	p += Tcl_UniCharToUtf(*w, p);
260	w++;
261	}
262	Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
263
264	return string;
265	}
266
267	/*
268	*---------------------------------------------------------------------------
269	*
270	* Tcl_UtfToUniChar --
271	*
272	* Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8
273	* sequences are converted to valid Tcl_UniChars and processing
274	* continues. Equivalent to Plan 9 chartorune().
275	*
276	* The caller must ensure that the source buffer is long enough that this
277	* routine does not run off the end and dereference non-existent memory
278	* looking for trail bytes. If the source buffer is known to be '\0'
279	* terminated, this cannot happen. Otherwise, the caller should call
280	* Tcl_UtfCharComplete() before calling this routine to ensure that
281	* enough bytes remain in the string.
282	*
283	* Results:
284	* *chPtr is filled with the Tcl_UniChar, and the return value is the
285	* number of bytes from the UTF-8 string that were consumed.
286	*
287	* Side effects:
288	* None.
289	*
290	*---------------------------------------------------------------------------
291	*/
292
293	int
294	Tcl_UtfToUniChar(
295	register CONST char src, / The UTF-8 string. */
296	register Tcl_UniChar chPtr)/ Filled with the Tcl_UniChar represented by
297	* the UTF-8 string. */
298	{
299	register int byte;
300
301	/*
302	* Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
303	*/
304
305	byte = ((unsigned char ) src);
306	if (byte < 0xC0) {
307	/*
308	* Handles properly formed UTF-8 characters between 0x01 and 0x7F.
309	* Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
310	* characters representing themselves.
311	*/
312
313	*chPtr = (Tcl_UniChar) byte;
314	return 1;
315	} else if (byte < 0xE0) {
316	if ((src[1] & 0xC0) == 0x80) {
317	/*
318	* Two-byte-character lead-byte followed by a trail-byte.
319	*/
320
321	*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) \| (src[1] & 0x3F));
322	return 2;
323	}
324
325	/*
326	* A two-byte-character lead-byte not followed by trail-byte
327	* represents itself.
328	*/
329
330	*chPtr = (Tcl_UniChar) byte;
331	return 1;
332	} else if (byte < 0xF0) {
333	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
334	/*
335	* Three-byte-character lead byte followed by two trail bytes.
336	*/
337
338	*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
339	\| ((src[1] & 0x3F) << 6) \| (src[2] & 0x3F));
340	return 3;
341	}
342
343	/*
344	* A three-byte-character lead-byte not followed by two trail-bytes
345	* represents itself.
346	*/
347
348	*chPtr = (Tcl_UniChar) byte;
349	return 1;
350	}
351	#if TCL_UTF_MAX > 3
352	{
353	int ch, total, trail;
354
355	total = totalBytes[byte];
356	trail = total - 1;
357	if (trail > 0) {
358	ch = byte & (0x3F >> trail);
359	do {
360	src++;
361	if ((*src & 0xC0) != 0x80) {
362	*chPtr = byte;
363	return 1;
364	}
365	ch <<= 6;
366	ch \|= (*src & 0x3F);
367	trail--;
368	} while (trail > 0);
369	*chPtr = ch;
370	return total;
371	}
372	}
373	#endif
374
375	*chPtr = (Tcl_UniChar) byte;
376	return 1;
377	}
378
379	/*
380	*---------------------------------------------------------------------------
381	*
382	* Tcl_UtfToUniCharDString --
383	*
384	* Convert the UTF-8 string to Unicode.
385	*
386	* Results:
387	* The return value is a pointer to the Unicode representation of the
388	* UTF-8 string. Storage for the return value is appended to the end of
389	* dsPtr. The Unicode string is terminated with a Unicode NULL character.
390	*
391	* Side effects:
392	* None.
393	*
394	*---------------------------------------------------------------------------
395	*/
396
397	Tcl_UniChar *
398	Tcl_UtfToUniCharDString(
399	CONST char src, / UTF-8 string to convert to Unicode. */
400	int length, /* Length of UTF-8 string in bytes, or -1 for
401	* strlen(). */
402	Tcl_DString dsPtr) / Unicode representation of string is
403	* appended to this previously initialized
404	* DString. */
405	{
406	Tcl_UniChar w, wString;
407	CONST char p, end;
408	int oldLength;
409
410	if (length < 0) {
411	length = strlen(src);
412	}
413
414	/*
415	* Unicode string length in Tcl_UniChars will be <= UTF-8 string length in
416	* bytes.
417	*/
418
419	oldLength = Tcl_DStringLength(dsPtr);
420	Tcl_DStringSetLength(dsPtr,
421	(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
422	wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
423
424	w = wString;
425	end = src + length;
426	for (p = src; p < end; ) {
427	p += TclUtfToUniChar(p, w);
428	w++;
429	}
430	*w = '\0';
431	Tcl_DStringSetLength(dsPtr,
432	(oldLength + ((char ) w - (char ) wString)));
433
434	return wString;
435	}
436
437	/*
438	*---------------------------------------------------------------------------
439	*
440	* Tcl_UtfCharComplete --
441	*
442	* Determine if the UTF-8 string of the given length is long enough to be
443	* decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8
444	* string is properly formed. Equivalent to Plan 9 fullrune().
445	*
446	* Results:
447	* The return value is 0 if the string is not long enough, non-zero
448	* otherwise.
449	*
450	* Side effects:
451	* None.
452	*
453	*---------------------------------------------------------------------------
454	*/
455
456	int
457	Tcl_UtfCharComplete(
458	CONST char src, / String to check if first few bytes contain
459	* a complete UTF-8 character. */
460	int length) /* Length of above string in bytes. */
461	{
462	int ch;
463
464	ch = ((unsigned char ) src);
465	return length >= totalBytes[ch];
466	}
467
468	/*
469	*---------------------------------------------------------------------------
470	*
471	* Tcl_NumUtfChars --
472	*
473	* Returns the number of characters (not bytes) in the UTF-8 string, not
474	* including the terminating NULL byte. This is equivalent to Plan 9
475	* utflen() and utfnlen().
476	*
477	* Results:
478	* As above.
479	*
480	* Side effects:
481	* None.
482	*
483	*---------------------------------------------------------------------------
484	*/
485
486	int
487	Tcl_NumUtfChars(
488	register CONST char src, / The UTF-8 string to measure. */
489	int length) /* The length of the string in bytes, or -1
490	* for strlen(string). */
491	{
492	Tcl_UniChar ch;
493	register Tcl_UniChar *chPtr = &ch;
494	register int i;
495
496	/*
497	* The separate implementations are faster.
498	*
499	* Since this is a time-sensitive function, we also do the check for the
500	* single-byte char case specially.
501	*/
502
503	i = 0;
504	if (length < 0) {
505	while (*src != '\0') {
506	src += TclUtfToUniChar(src, chPtr);
507	i++;
508	}
509	} else {
510	register int n;
511
512	while (length > 0) {
513	if (UCHAR(*src) < 0xC0) {
514	length--;
515	src++;
516	} else {
517	n = Tcl_UtfToUniChar(src, chPtr);
518	length -= n;
519	src += n;
520	}
521	i++;
522	}
523	}
524	return i;
525	}
526
527	/*
528	*---------------------------------------------------------------------------
529	*
530	* Tcl_UtfFindFirst --
531	*
532	* Returns a pointer to the first occurrence of the given Tcl_UniChar in
533	* the NULL-terminated UTF-8 string. The NULL terminator is considered
534	* part of the UTF-8 string. Equivalent to Plan 9 utfrune().
535	*
536	* Results:
537	* As above. If the Tcl_UniChar does not exist in the given string, the
538	* return value is NULL.
539	*
540	* Side effects:
541	* None.
542	*
543	*---------------------------------------------------------------------------
544	*/
545
546	CONST char *
547	Tcl_UtfFindFirst(
548	CONST char src, / The UTF-8 string to be searched. */
549	int ch) /* The Tcl_UniChar to search for. */
550	{
551	int len;
552	Tcl_UniChar find;
553
554	while (1) {
555	len = TclUtfToUniChar(src, &find);
556	if (find == ch) {
557	return src;
558	}
559	if (*src == '\0') {
560	return NULL;
561	}
562	src += len;
563	}
564	}
565
566	/*
567	*---------------------------------------------------------------------------
568	*
569	* Tcl_UtfFindLast --
570	*
571	* Returns a pointer to the last occurrence of the given Tcl_UniChar in
572	* the NULL-terminated UTF-8 string. The NULL terminator is considered
573	* part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
574	*
575	* Results:
576	* As above. If the Tcl_UniChar does not exist in the given string, the
577	* return value is NULL.
578	*
579	* Side effects:
580	* None.
581	*
582	*---------------------------------------------------------------------------
583	*/
584
585	CONST char *
586	Tcl_UtfFindLast(
587	CONST char src, / The UTF-8 string to be searched. */
588	int ch) /* The Tcl_UniChar to search for. */
589	{
590	int len;
591	Tcl_UniChar find;
592	CONST char *last;
593
594	last = NULL;
595	while (1) {
596	len = TclUtfToUniChar(src, &find);
597	if (find == ch) {
598	last = src;
599	}
600	if (*src == '\0') {
601	break;
602	}
603	src += len;
604	}
605	return last;
606	}
607
608	/*
609	*---------------------------------------------------------------------------
610	*
611	* Tcl_UtfNext --
612	*
613	* Given a pointer to some current location in a UTF-8 string, move
614	* forward one character. The caller must ensure that they are not asking
615	* for the next character after the last character in the string.
616	*
617	* Results:
618	* The return value is the pointer to the next character in the UTF-8
619	* string.
620	*
621	* Side effects:
622	* None.
623	*
624	*---------------------------------------------------------------------------
625	*/
626
627	CONST char *
628	Tcl_UtfNext(
629	CONST char src) / The current location in the string. */
630	{
631	Tcl_UniChar ch;
632
633	return src + TclUtfToUniChar(src, &ch);
634	}
635
636	/*
637	*---------------------------------------------------------------------------
638	*
639	* Tcl_UtfPrev --
640	*
641	* Given a pointer to some current location in a UTF-8 string, move
642	* backwards one character. This works correctly when the pointer is in
643	* the middle of a UTF-8 character.
644	*
645	* Results:
646	* The return value is a pointer to the previous character in the UTF-8
647	* string. If the current location was already at the beginning of the
648	* string, the return value will also be a pointer to the beginning of
649	* the string.
650	*
651	* Side effects:
652	* None.
653	*
654	*---------------------------------------------------------------------------
655	*/
656
657	CONST char *
658	Tcl_UtfPrev(
659	CONST char src, / The current location in the string. */
660	CONST char start) / Pointer to the beginning of the string, to
661	* avoid going backwards too far. */
662	{
663	CONST char *look;
664	int i, byte;
665
666	src--;
667	look = src;
668	for (i = 0; i < TCL_UTF_MAX; i++) {
669	if (look < start) {
670	if (src < start) {
671	src = start;
672	}
673	break;
674	}
675	byte = ((unsigned char ) look);
676	if (byte < 0x80) {
677	break;
678	}
679	if (byte >= 0xC0) {
680	return look;
681	}
682	look--;
683	}
684	return src;
685	}
686
687	/*
688	*---------------------------------------------------------------------------
689	*
690	* Tcl_UniCharAtIndex --
691	*
692	* Returns the Unicode character represented at the specified character
693	* (not byte) position in the UTF-8 string.
694	*
695	* Results:
696	* As above.
697	*
698	* Side effects:
699	* None.
700	*
701	*---------------------------------------------------------------------------
702	*/
703
704	Tcl_UniChar
705	Tcl_UniCharAtIndex(
706	register CONST char src, / The UTF-8 string to dereference. */
707	register int index) /* The position of the desired character. */
708	{
709	Tcl_UniChar ch;
710
711	while (index >= 0) {
712	index--;
713	src += TclUtfToUniChar(src, &ch);
714	}
715	return ch;
716	}
717
718	/*
719	*---------------------------------------------------------------------------
720	*
721	* Tcl_UtfAtIndex --
722	*
723	* Returns a pointer to the specified character (not byte) position in
724	* the UTF-8 string.
725	*
726	* Results:
727	* As above.
728	*
729	* Side effects:
730	* None.
731	*
732	*---------------------------------------------------------------------------
733	*/
734
735	CONST char *
736	Tcl_UtfAtIndex(
737	register CONST char src, / The UTF-8 string. */
738	register int index) /* The position of the desired character. */
739	{
740	Tcl_UniChar ch;
741
742	while (index > 0) {
743	index--;
744	src += TclUtfToUniChar(src, &ch);
745	}
746	return src;
747	}
748
749	/*
750	*---------------------------------------------------------------------------
751	*
752	* Tcl_UtfBackslash --
753	*
754	* Figure out how to handle a backslash sequence.
755	*
756	* Results:
757	* Stores the bytes represented by the backslash sequence in dst and
758	* returns the number of bytes written to dst. At most TCL_UTF_MAX bytes
759	* are written to dst; dst must have been large enough to accept those
760	* bytes. If readPtr isn't NULL then it is filled in with a count of the
761	* number of bytes in the backslash sequence.
762	*
763	* Side effects:
764	* The maximum number of bytes it takes to represent a Unicode character
765	* in UTF-8 is guaranteed to be less than the number of bytes used to
766	* express the backslash sequence that represents that Unicode character.
767	* If the target buffer into which the caller is going to store the bytes
768	* that represent the Unicode character is at least as large as the
769	* source buffer from which the backslashed sequence was extracted, no
770	* buffer overruns should occur.
771	*
772	*---------------------------------------------------------------------------
773	*/
774
775	int
776	Tcl_UtfBackslash(
777	CONST char src, / Points to the backslash character of a
778	* backslash sequence. */
779	int readPtr, / Fill in with number of characters read from
780	* src, unless NULL. */
781	char dst) / Filled with the bytes represented by the
782	* backslash sequence. */
783	{
784	#define LINE_LENGTH 128
785	int numRead;
786	int result;
787
788	result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
789	if (numRead == LINE_LENGTH) {
790	/*
791	* We ate a whole line. Pay the price of a strlen()
792	*/
793
794	result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
795	}
796	if (readPtr != NULL) {
797	*readPtr = numRead;
798	}
799	return result;
800	}
801
802	/*
803	*----------------------------------------------------------------------
804	*
805	* Tcl_UtfToUpper --
806	*
807	* Convert lowercase characters to uppercase characters in a UTF string
808	* in place. The conversion may shrink the UTF string.
809	*
810	* Results:
811	* Returns the number of bytes in the resulting string excluding the
812	* trailing null.
813	*
814	* Side effects:
815	* Writes a terminating null after the last converted character.
816	*
817	*----------------------------------------------------------------------
818	*/
819
820	int
821	Tcl_UtfToUpper(
822	char str) / String to convert in place. */
823	{
824	Tcl_UniChar ch, upChar;
825	char src, dst;
826	int bytes;
827
828	/*
829	* Iterate over the string until we hit the terminating null.
830	*/
831
832	src = dst = str;
833	while (*src) {
834	bytes = TclUtfToUniChar(src, &ch);
835	upChar = Tcl_UniCharToUpper(ch);
836
837	/*
838	* To keep badly formed Utf strings from getting inflated by the
839	* conversion (thereby causing a segfault), only copy the upper case
840	* char to dst if its size is <= the original char.
841	*/
842
843	if (bytes < UtfCount(upChar)) {
844	memcpy(dst, src, (size_t) bytes);
845	dst += bytes;
846	} else {
847	dst += Tcl_UniCharToUtf(upChar, dst);
848	}
849	src += bytes;
850	}
851	*dst = '\0';
852	return (dst - str);
853	}
854
855	/*
856	*----------------------------------------------------------------------
857	*
858	* Tcl_UtfToLower --
859	*
860	* Convert uppercase characters to lowercase characters in a UTF string
861	* in place. The conversion may shrink the UTF string.
862	*
863	* Results:
864	* Returns the number of bytes in the resulting string excluding the
865	* trailing null.
866	*
867	* Side effects:
868	* Writes a terminating null after the last converted character.
869	*
870	*----------------------------------------------------------------------
871	*/
872
873	int
874	Tcl_UtfToLower(
875	char str) / String to convert in place. */
876	{
877	Tcl_UniChar ch, lowChar;
878	char src, dst;
879	int bytes;
880
881	/*
882	* Iterate over the string until we hit the terminating null.
883	*/
884
885	src = dst = str;
886	while (*src) {
887	bytes = TclUtfToUniChar(src, &ch);
888	lowChar = Tcl_UniCharToLower(ch);
889
890	/*
891	* To keep badly formed Utf strings from getting inflated by the
892	* conversion (thereby causing a segfault), only copy the lower case
893	* char to dst if its size is <= the original char.
894	*/
895
896	if (bytes < UtfCount(lowChar)) {
897	memcpy(dst, src, (size_t) bytes);
898	dst += bytes;
899	} else {
900	dst += Tcl_UniCharToUtf(lowChar, dst);
901	}
902	src += bytes;
903	}
904	*dst = '\0';
905	return (dst - str);
906	}
907
908	/*
909	*----------------------------------------------------------------------
910	*
911	* Tcl_UtfToTitle --
912	*
913	* Changes the first character of a UTF string to title case or uppercase
914	* and the rest of the string to lowercase. The conversion happens in
915	* place and may shrink the UTF string.
916	*
917	* Results:
918	* Returns the number of bytes in the resulting string excluding the
919	* trailing null.
920	*
921	* Side effects:
922	* Writes a terminating null after the last converted character.
923	*
924	*----------------------------------------------------------------------
925	*/
926
927	int
928	Tcl_UtfToTitle(
929	char str) / String to convert in place. */
930	{
931	Tcl_UniChar ch, titleChar, lowChar;
932	char src, dst;
933	int bytes;
934
935	/*
936	* Capitalize the first character and then lowercase the rest of the
937	* characters until we get to a null.
938	*/
939
940	src = dst = str;
941
942	if (*src) {
943	bytes = TclUtfToUniChar(src, &ch);
944	titleChar = Tcl_UniCharToTitle(ch);
945
946	if (bytes < UtfCount(titleChar)) {
947	memcpy(dst, src, (size_t) bytes);
948	dst += bytes;
949	} else {
950	dst += Tcl_UniCharToUtf(titleChar, dst);
951	}
952	src += bytes;
953	}
954	while (*src) {
955	bytes = TclUtfToUniChar(src, &ch);
956	lowChar = Tcl_UniCharToLower(ch);
957
958	if (bytes < UtfCount(lowChar)) {
959	memcpy(dst, src, (size_t) bytes);
960	dst += bytes;
961	} else {
962	dst += Tcl_UniCharToUtf(lowChar, dst);
963	}
964	src += bytes;
965	}
966	*dst = '\0';
967	return (dst - str);
968	}
969
970	/*
971	*----------------------------------------------------------------------
972	*
973	* TclpUtfNcmp2 --
974	*
975	* Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and
976	* ct are assumed to be at least numBytes bytes long.
977	*
978	* Results:
979	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
980	*
981	* Side effects:
982	* None.
983	*
984	*----------------------------------------------------------------------
985	*/
986
987	int
988	TclpUtfNcmp2(
989	CONST char cs, / UTF string to compare to ct. */
990	CONST char ct, / UTF string cs is compared to. */
991	unsigned long numBytes) /* Number of bytes to compare. */
992	{
993	/*
994	* We can't simply call 'memcmp(cs, ct, numBytes);' because we need to
995	* check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes
996	* fine in the strcmp manner.
997	*/
998
999	register int result = 0;
1000
1001	for ( ; numBytes != 0; numBytes--, cs++, ct++) {
1002	if (cs != ct) {
1003	result = UCHAR(cs) - UCHAR(ct);
1004	break;
1005	}
1006	}
1007	if (numBytes && ((UCHAR(cs) == 0xC0) \|\| (UCHAR(ct) == 0xC0))) {
1008	unsigned char c1, c2;
1009
1010	c1 = ((UCHAR(cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(cs);
1011	c2 = ((UCHAR(ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(ct);
1012	result = (c1 - c2);
1013	}
1014	return result;
1015	}
1016
1017	/*
1018	*----------------------------------------------------------------------
1019	*
1020	* Tcl_UtfNcmp --
1021	*
1022	* Compare at most numChars UTF chars of string cs to string ct. Both cs
1023	* and ct are assumed to be at least numChars UTF chars long.
1024	*
1025	* Results:
1026	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1027	*
1028	* Side effects:
1029	* None.
1030	*
1031	*----------------------------------------------------------------------
1032	*/
1033
1034	int
1035	Tcl_UtfNcmp(
1036	CONST char cs, / UTF string to compare to ct. */
1037	CONST char ct, / UTF string cs is compared to. */
1038	unsigned long numChars) /* Number of UTF chars to compare. */
1039	{
1040	Tcl_UniChar ch1, ch2;
1041
1042	/*
1043	* Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
1044	* pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
1045	* (the byte 0x01.)
1046	*/
1047
1048	while (numChars-- > 0) {
1049	/*
1050	* n must be interpreted as chars, not bytes. This should be called
1051	* only when both strings are of at least n chars long (no need for \0
1052	* check)
1053	*/
1054
1055	cs += TclUtfToUniChar(cs, &ch1);
1056	ct += TclUtfToUniChar(ct, &ch2);
1057	if (ch1 != ch2) {
1058	return (ch1 - ch2);
1059	}
1060	}
1061	return 0;
1062	}
1063
1064	/*
1065	*----------------------------------------------------------------------
1066	*
1067	* Tcl_UtfNcasecmp --
1068	*
1069	* Compare at most numChars UTF chars of string cs to string ct case
1070	* insensitive. Both cs and ct are assumed to be at least numChars UTF
1071	* chars long.
1072	*
1073	* Results:
1074	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1075	*
1076	* Side effects:
1077	* None.
1078	*
1079	*----------------------------------------------------------------------
1080	*/
1081
1082	int
1083	Tcl_UtfNcasecmp(
1084	CONST char cs, / UTF string to compare to ct. */
1085	CONST char ct, / UTF string cs is compared to. */
1086	unsigned long numChars) /* Number of UTF chars to compare. */
1087	{
1088	Tcl_UniChar ch1, ch2;
1089	while (numChars-- > 0) {
1090	/*
1091	* n must be interpreted as chars, not bytes.
1092	* This should be called only when both strings are of
1093	* at least n chars long (no need for \0 check)
1094	*/
1095	cs += TclUtfToUniChar(cs, &ch1);
1096	ct += TclUtfToUniChar(ct, &ch2);
1097	if (ch1 != ch2) {
1098	ch1 = Tcl_UniCharToLower(ch1);
1099	ch2 = Tcl_UniCharToLower(ch2);
1100	if (ch1 != ch2) {
1101	return (ch1 - ch2);
1102	}
1103	}
1104	}
1105	return 0;
1106	}
1107
1108	/*
1109	*----------------------------------------------------------------------
1110	*
1111	* Tcl_UniCharToUpper --
1112	*
1113	* Compute the uppercase equivalent of the given Unicode character.
1114	*
1115	* Results:
1116	* Returns the uppercase Unicode character.
1117	*
1118	* Side effects:
1119	* None.
1120	*
1121	*----------------------------------------------------------------------
1122	*/
1123
1124	Tcl_UniChar
1125	Tcl_UniCharToUpper(
1126	int ch) /* Unicode character to convert. */
1127	{
1128	int info = GetUniCharInfo(ch);
1129
1130	if (GetCaseType(info) & 0x04) {
1131	return (Tcl_UniChar) (ch - GetDelta(info));
1132	} else {
1133	return ch;
1134	}
1135	}
1136
1137	/*
1138	*----------------------------------------------------------------------
1139	*
1140	* Tcl_UniCharToLower --
1141	*
1142	* Compute the lowercase equivalent of the given Unicode character.
1143	*
1144	* Results:
1145	* Returns the lowercase Unicode character.
1146	*
1147	* Side effects:
1148	* None.
1149	*
1150	*----------------------------------------------------------------------
1151	*/
1152
1153	Tcl_UniChar
1154	Tcl_UniCharToLower(
1155	int ch) /* Unicode character to convert. */
1156	{
1157	int info = GetUniCharInfo(ch);
1158
1159	if (GetCaseType(info) & 0x02) {
1160	return (Tcl_UniChar) (ch + GetDelta(info));
1161	} else {
1162	return ch;
1163	}
1164	}
1165
1166	/*
1167	*----------------------------------------------------------------------
1168	*
1169	* Tcl_UniCharToTitle --
1170	*
1171	* Compute the titlecase equivalent of the given Unicode character.
1172	*
1173	* Results:
1174	* Returns the titlecase Unicode character.
1175	*
1176	* Side effects:
1177	* None.
1178	*
1179	*----------------------------------------------------------------------
1180	*/
1181
1182	Tcl_UniChar
1183	Tcl_UniCharToTitle(
1184	int ch) /* Unicode character to convert. */
1185	{
1186	int info = GetUniCharInfo(ch);
1187	int mode = GetCaseType(info);
1188
1189	if (mode & 0x1) {
1190	/*
1191	* Subtract or add one depending on the original case.
1192	*/
1193
1194	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1195	} else if (mode == 0x4) {
1196	return (Tcl_UniChar) (ch - GetDelta(info));
1197	} else {
1198	return ch;
1199	}
1200	}
1201
1202	/*
1203	*----------------------------------------------------------------------
1204	*
1205	* Tcl_UniCharLen --
1206	*
1207	* Find the length of a UniChar string. The str input must be null
1208	* terminated.
1209	*
1210	* Results:
1211	* Returns the length of str in UniChars (not bytes).
1212	*
1213	* Side effects:
1214	* None.
1215	*
1216	*----------------------------------------------------------------------
1217	*/
1218
1219	int
1220	Tcl_UniCharLen(
1221	CONST Tcl_UniChar uniStr) / Unicode string to find length of. */
1222	{
1223	int len = 0;
1224
1225	while (*uniStr != '\0') {
1226	len++;
1227	uniStr++;
1228	}
1229	return len;
1230	}
1231
1232	/*
1233	*----------------------------------------------------------------------
1234	*
1235	* Tcl_UniCharNcmp --
1236	*
1237	* Compare at most numChars unichars of string ucs to string uct.
1238	* Both ucs and uct are assumed to be at least numChars unichars long.
1239	*
1240	* Results:
1241	* Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
1242	*
1243	* Side effects:
1244	* None.
1245	*
1246	*----------------------------------------------------------------------
1247	*/
1248
1249	int
1250	Tcl_UniCharNcmp(
1251	CONST Tcl_UniChar ucs, / Unicode string to compare to uct. */
1252	CONST Tcl_UniChar uct, / Unicode string ucs is compared to. */
1253	unsigned long numChars) /* Number of unichars to compare. */
1254	{
1255	#ifdef WORDS_BIGENDIAN
1256	/*
1257	* We are definitely on a big-endian machine; memcmp() is safe
1258	*/
1259
1260	return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));
1261
1262	#else /* !WORDS_BIGENDIAN */
1263	/*
1264	* We can't simply call memcmp() because that is not lexically correct.
1265	*/
1266
1267	for ( ; numChars != 0; ucs++, uct++, numChars--) {
1268	if (ucs != uct) {
1269	return (ucs - uct);
1270	}
1271	}
1272	return 0;
1273	#endif /* WORDS_BIGENDIAN */
1274	}
1275
1276	/*
1277	*----------------------------------------------------------------------
1278	*
1279	* Tcl_UniCharNcasecmp --
1280	*
1281	* Compare at most numChars unichars of string ucs to string uct case
1282	* insensitive. Both ucs and uct are assumed to be at least numChars
1283	* unichars long.
1284	*
1285	* Results:
1286	* Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
1287	*
1288	* Side effects:
1289	* None.
1290	*
1291	*----------------------------------------------------------------------
1292	*/
1293
1294	int
1295	Tcl_UniCharNcasecmp(
1296	CONST Tcl_UniChar ucs, / Unicode string to compare to uct. */
1297	CONST Tcl_UniChar uct, / Unicode string ucs is compared to. */
1298	unsigned long numChars) /* Number of unichars to compare. */
1299	{
1300	for ( ; numChars != 0; numChars--, ucs++, uct++) {
1301	if (ucs != uct) {
1302	Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
1303	Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
1304
1305	if (lcs != lct) {
1306	return (lcs - lct);
1307	}
1308	}
1309	}
1310	return 0;
1311	}
1312
1313	/*
1314	*----------------------------------------------------------------------
1315	*
1316	* Tcl_UniCharIsAlnum --
1317	*
1318	* Test if a character is an alphanumeric Unicode character.
1319	*
1320	* Results:
1321	* Returns 1 if character is alphanumeric.
1322	*
1323	* Side effects:
1324	* None.
1325	*
1326	*----------------------------------------------------------------------
1327	*/
1328
1329	int
1330	Tcl_UniCharIsAlnum(
1331	int ch) /* Unicode character to test. */
1332	{
1333	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1334
1335	return (((ALPHA_BITS \| DIGIT_BITS) >> category) & 1);
1336	}
1337
1338	/*
1339	*----------------------------------------------------------------------
1340	*
1341	* Tcl_UniCharIsAlpha --
1342	*
1343	* Test if a character is an alphabetic Unicode character.
1344	*
1345	* Results:
1346	* Returns 1 if character is alphabetic.
1347	*
1348	* Side effects:
1349	* None.
1350	*
1351	*----------------------------------------------------------------------
1352	*/
1353
1354	int
1355	Tcl_UniCharIsAlpha(
1356	int ch) /* Unicode character to test. */
1357	{
1358	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1359	return ((ALPHA_BITS >> category) & 1);
1360	}
1361
1362	/*
1363	*----------------------------------------------------------------------
1364	*
1365	* Tcl_UniCharIsControl --
1366	*
1367	* Test if a character is a Unicode control character.
1368	*
1369	* Results:
1370	* Returns non-zero if character is a control.
1371	*
1372	* Side effects:
1373	* None.
1374	*
1375	*----------------------------------------------------------------------
1376	*/
1377
1378	int
1379	Tcl_UniCharIsControl(
1380	int ch) /* Unicode character to test. */
1381	{
1382	return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1383	}
1384
1385	/*
1386	*----------------------------------------------------------------------
1387	*
1388	* Tcl_UniCharIsDigit --
1389	*
1390	* Test if a character is a numeric Unicode character.
1391	*
1392	* Results:
1393	* Returns non-zero if character is a digit.
1394	*
1395	* Side effects:
1396	* None.
1397	*
1398	*----------------------------------------------------------------------
1399	*/
1400
1401	int
1402	Tcl_UniCharIsDigit(
1403	int ch) /* Unicode character to test. */
1404	{
1405	return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER;
1406	}
1407
1408	/*
1409	*----------------------------------------------------------------------
1410	*
1411	* Tcl_UniCharIsGraph --
1412	*
1413	* Test if a character is any Unicode print character except space.
1414	*
1415	* Results:
1416	* Returns non-zero if character is printable, but not space.
1417	*
1418	* Side effects:
1419	* None.
1420	*
1421	*----------------------------------------------------------------------
1422	*/
1423
1424	int
1425	Tcl_UniCharIsGraph(
1426	int ch) /* Unicode character to test. */
1427	{
1428	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1429	return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1430	}
1431
1432	/*
1433	*----------------------------------------------------------------------
1434	*
1435	* Tcl_UniCharIsLower --
1436	*
1437	* Test if a character is a lowercase Unicode character.
1438	*
1439	* Results:
1440	* Returns non-zero if character is lowercase.
1441	*
1442	* Side effects:
1443	* None.
1444	*
1445	*----------------------------------------------------------------------
1446	*/
1447
1448	int
1449	Tcl_UniCharIsLower(
1450	int ch) /* Unicode character to test. */
1451	{
1452	return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1453	}
1454
1455	/*
1456	*----------------------------------------------------------------------
1457	*
1458	* Tcl_UniCharIsPrint --
1459	*
1460	* Test if a character is a Unicode print character.
1461	*
1462	* Results:
1463	* Returns non-zero if character is printable.
1464	*
1465	* Side effects:
1466	* None.
1467	*
1468	*----------------------------------------------------------------------
1469	*/
1470
1471	int
1472	Tcl_UniCharIsPrint(
1473	int ch) /* Unicode character to test. */
1474	{
1475	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1476	return ((PRINT_BITS >> category) & 1);
1477	}
1478
1479	/*
1480	*----------------------------------------------------------------------
1481	*
1482	* Tcl_UniCharIsPunct --
1483	*
1484	* Test if a character is a Unicode punctuation character.
1485	*
1486	* Results:
1487	* Returns non-zero if character is punct.
1488	*
1489	* Side effects:
1490	* None.
1491	*
1492	*----------------------------------------------------------------------
1493	*/
1494
1495	int
1496	Tcl_UniCharIsPunct(
1497	int ch) /* Unicode character to test. */
1498	{
1499	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1500	return ((PUNCT_BITS >> category) & 1);
1501	}
1502
1503	/*
1504	*----------------------------------------------------------------------
1505	*
1506	* Tcl_UniCharIsSpace --
1507	*
1508	* Test if a character is a whitespace Unicode character.
1509	*
1510	* Results:
1511	* Returns non-zero if character is a space.
1512	*
1513	* Side effects:
1514	* None.
1515	*
1516	*----------------------------------------------------------------------
1517	*/
1518
1519	int
1520	Tcl_UniCharIsSpace(
1521	int ch) /* Unicode character to test. */
1522	{
1523	register int category;
1524
1525	/*
1526	* If the character is within the first 127 characters, just use the
1527	* standard C function, otherwise consult the Unicode table.
1528	*/
1529
1530	if (ch < 0x80) {
1531	return isspace(UCHAR(ch)); /* INTL: ISO space */
1532	} else {
1533	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1534	return ((SPACE_BITS >> category) & 1);
1535	}
1536	}
1537
1538	/*
1539	*----------------------------------------------------------------------
1540	*
1541	* Tcl_UniCharIsUpper --
1542	*
1543	* Test if a character is a uppercase Unicode character.
1544	*
1545	* Results:
1546	* Returns non-zero if character is uppercase.
1547	*
1548	* Side effects:
1549	* None.
1550	*
1551	*----------------------------------------------------------------------
1552	*/
1553
1554	int
1555	Tcl_UniCharIsUpper(
1556	int ch) /* Unicode character to test. */
1557	{
1558	return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1559	}
1560
1561	/*
1562	*----------------------------------------------------------------------
1563	*
1564	* Tcl_UniCharIsWordChar --
1565	*
1566	* Test if a character is alphanumeric or a connector punctuation mark.
1567	*
1568	* Results:
1569	* Returns 1 if character is a word character.
1570	*
1571	* Side effects:
1572	* None.
1573	*
1574	*----------------------------------------------------------------------
1575	*/
1576
1577	int
1578	Tcl_UniCharIsWordChar(
1579	int ch) /* Unicode character to test. */
1580	{
1581	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1582
1583	return (((ALPHA_BITS \| DIGIT_BITS \| CONNECTOR_BITS) >> category) & 1);
1584	}
1585
1586	/*
1587	*----------------------------------------------------------------------
1588	*
1589	* Tcl_UniCharCaseMatch --
1590	*
1591	* See if a particular Unicode string matches a particular pattern.
1592	* Allows case insensitivity. This is the Unicode equivalent of the char*
1593	* Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated.
1594	* This has no provision for counted UniChar strings, thus should not be
1595	* used where NULLs are expected in the UniChar string. Use
1596	* TclUniCharMatch where possible.
1597	*
1598	* Results:
1599	* The return value is 1 if string matches pattern, and 0 otherwise. The
1600	* matching operation permits the following special characters in the
1601	* pattern: *?\[] (see the manual entry for details on what these mean).
1602	*
1603	* Side effects:
1604	* None.
1605	*
1606	*----------------------------------------------------------------------
1607	*/
1608
1609	int
1610	Tcl_UniCharCaseMatch(
1611	CONST Tcl_UniChar uniStr, / Unicode String. */
1612	CONST Tcl_UniChar *uniPattern,
1613	/* Pattern, which may contain special
1614	* characters. */
1615	int nocase) /* 0 for case sensitive, 1 for insensitive */
1616	{
1617	Tcl_UniChar ch1, p;
1618
1619	while (1) {
1620	p = *uniPattern;
1621
1622	/*
1623	* See if we're at the end of both the pattern and the string. If so,
1624	* we succeeded. If we're at the end of the pattern but not at the end
1625	* of the string, we failed.
1626	*/
1627
1628	if (p == 0) {
1629	return (*uniStr == 0);
1630	}
1631	if ((uniStr == 0) && (p != '')) {
1632	return 0;
1633	}
1634
1635	/*
1636	* Check for a "*" as the next pattern character. It matches any
1637	* substring. We handle this by skipping all the characters up to the
1638	* next matching one in the pattern, and then calling ourselves
1639	* recursively for each postfix of string, until either we match or we
1640	* reach the end of the string.
1641	*/
1642
1643	if (p == '*') {
1644	/*
1645	* Skip all successive *'s in the pattern
1646	*/
1647
1648	while ((++uniPattern) == '') {
1649	/* empty body */
1650	}
1651	p = *uniPattern;
1652	if (p == 0) {
1653	return 1;
1654	}
1655	if (nocase) {
1656	p = Tcl_UniCharToLower(p);
1657	}
1658	while (1) {
1659	/*
1660	* Optimization for matching - cruise through the string
1661	* quickly if the next char in the pattern isn't a special
1662	* character
1663	*/
1664
1665	if ((p != '[') && (p != '?') && (p != '\\')) {
1666	if (nocase) {
1667	while (uniStr && (p != uniStr)
1668	&& (p != Tcl_UniCharToLower(*uniStr))) {
1669	uniStr++;
1670	}
1671	} else {
1672	while (uniStr && (p != uniStr)) {
1673	uniStr++;
1674	}
1675	}
1676	}
1677	if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) {
1678	return 1;
1679	}
1680	if (*uniStr == 0) {
1681	return 0;
1682	}
1683	uniStr++;
1684	}
1685	}
1686
1687	/*
1688	* Check for a "?" as the next pattern character. It matches any
1689	* single character.
1690	*/
1691
1692	if (p == '?') {
1693	uniPattern++;
1694	uniStr++;
1695	continue;
1696	}
1697
1698	/*
1699	* Check for a "[" as the next pattern character. It is followed by a
1700	* list of characters that are acceptable, or by a range (two
1701	* characters separated by "-").
1702	*/
1703
1704	if (p == '[') {
1705	Tcl_UniChar startChar, endChar;
1706
1707	uniPattern++;
1708	ch1 = (nocase ? Tcl_UniCharToLower(uniStr) : uniStr);
1709	uniStr++;
1710	while (1) {
1711	if ((uniPattern == ']') \|\| (uniPattern == 0)) {
1712	return 0;
1713	}
1714	startChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
1715	: *uniPattern);
1716	uniPattern++;
1717	if (*uniPattern == '-') {
1718	uniPattern++;
1719	if (*uniPattern == 0) {
1720	return 0;
1721	}
1722	endChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
1723	: *uniPattern);
1724	uniPattern++;
1725	if (((startChar <= ch1) && (ch1 <= endChar))
1726	\|\| ((endChar <= ch1) && (ch1 <= startChar))) {
1727	/*
1728	* Matches ranges of form [a-z] or [z-a].
1729	*/
1730	break;
1731	}
1732	} else if (startChar == ch1) {
1733	break;
1734	}
1735	}
1736	while (*uniPattern != ']') {
1737	if (*uniPattern == 0) {
1738	uniPattern--;
1739	break;
1740	}
1741	uniPattern++;
1742	}
1743	uniPattern++;
1744	continue;
1745	}
1746
1747	/*
1748	* If the next pattern character is '\', just strip off the '\' so we
1749	* do exact matching on the character that follows.
1750	*/
1751
1752	if (p == '\\') {
1753	if (*(++uniPattern) == '\0') {
1754	return 0;
1755	}
1756	}
1757
1758	/*
1759	* There's no special character. Just make sure that the next bytes of
1760	* each string match.
1761	*/
1762
1763	if (nocase) {
1764	if (Tcl_UniCharToLower(*uniStr) !=
1765	Tcl_UniCharToLower(*uniPattern)) {
1766	return 0;
1767	}
1768	} else if (uniStr != uniPattern) {
1769	return 0;
1770	}
1771	uniStr++;
1772	uniPattern++;
1773	}
1774	}
1775
1776	/*
1777	*----------------------------------------------------------------------
1778	*
1779	* TclUniCharMatch --
1780	*
1781	* See if a particular Unicode string matches a particular pattern.
1782	* Allows case insensitivity. This is the Unicode equivalent of the char*
1783	* Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted
1784	* Strings, so embedded NULLs are allowed.
1785	*
1786	* Results:
1787	* The return value is 1 if string matches pattern, and 0 otherwise. The
1788	* matching operation permits the following special characters in the
1789	* pattern: *?\[] (see the manual entry for details on what these mean).
1790	*
1791	* Side effects:
1792	* None.
1793	*
1794	*----------------------------------------------------------------------
1795	*/
1796
1797	int
1798	TclUniCharMatch(
1799	CONST Tcl_UniChar string, / Unicode String. */
1800	int strLen, /* Length of String */
1801	CONST Tcl_UniChar pattern, / Pattern, which may contain special
1802	* characters. */
1803	int ptnLen, /* Length of Pattern */
1804	int nocase) /* 0 for case sensitive, 1 for insensitive */
1805	{
1806	CONST Tcl_UniChar stringEnd, patternEnd;
1807	Tcl_UniChar p;
1808
1809	stringEnd = string + strLen;
1810	patternEnd = pattern + ptnLen;
1811
1812	while (1) {
1813	/*
1814	* See if we're at the end of both the pattern and the string. If so,
1815	* we succeeded. If we're at the end of the pattern but not at the end
1816	* of the string, we failed.
1817	*/
1818
1819	if (pattern == patternEnd) {
1820	return (string == stringEnd);
1821	}
1822	p = *pattern;
1823	if ((string == stringEnd) && (p != '*')) {
1824	return 0;
1825	}
1826
1827	/*
1828	* Check for a "*" as the next pattern character. It matches any
1829	* substring. We handle this by skipping all the characters up to the
1830	* next matching one in the pattern, and then calling ourselves
1831	* recursively for each postfix of string, until either we match or we
1832	* reach the end of the string.
1833	*/
1834
1835	if (p == '*') {
1836	/*
1837	* Skip all successive *'s in the pattern.
1838	*/
1839
1840	while ((++pattern) == '') {
1841	/* empty body */
1842	}
1843	if (pattern == patternEnd) {
1844	return 1;
1845	}
1846	p = *pattern;
1847	if (nocase) {
1848	p = Tcl_UniCharToLower(p);
1849	}
1850	while (1) {
1851	/*
1852	* Optimization for matching - cruise through the string
1853	* quickly if the next char in the pattern isn't a special
1854	* character.
1855	*/
1856
1857	if ((p != '[') && (p != '?') && (p != '\\')) {
1858	if (nocase) {
1859	while ((string < stringEnd) && (p != *string)
1860	&& (p != Tcl_UniCharToLower(*string))) {
1861	string++;
1862	}
1863	} else {
1864	while ((string < stringEnd) && (p != *string)) {
1865	string++;
1866	}
1867	}
1868	}
1869	if (TclUniCharMatch(string, stringEnd - string,
1870	pattern, patternEnd - pattern, nocase)) {
1871	return 1;
1872	}
1873	if (string == stringEnd) {
1874	return 0;
1875	}
1876	string++;
1877	}
1878	}
1879
1880	/*
1881	* Check for a "?" as the next pattern character. It matches any
1882	* single character.
1883	*/
1884
1885	if (p == '?') {
1886	pattern++;
1887	string++;
1888	continue;
1889	}
1890
1891	/*
1892	* Check for a "[" as the next pattern character. It is followed by a
1893	* list of characters that are acceptable, or by a range (two
1894	* characters separated by "-").
1895	*/
1896
1897	if (p == '[') {
1898	Tcl_UniChar ch1, startChar, endChar;
1899
1900	pattern++;
1901	ch1 = (nocase ? Tcl_UniCharToLower(string) : string);
1902	string++;
1903	while (1) {
1904	if ((*pattern == ']') \|\| (pattern == patternEnd)) {
1905	return 0;
1906	}
1907	startChar = (nocase ? Tcl_UniCharToLower(pattern) : pattern);
1908	pattern++;
1909	if (*pattern == '-') {
1910	pattern++;
1911	if (pattern == patternEnd) {
1912	return 0;
1913	}
1914	endChar = (nocase ? Tcl_UniCharToLower(*pattern)
1915	: *pattern);
1916	pattern++;
1917	if (((startChar <= ch1) && (ch1 <= endChar))
1918	\|\| ((endChar <= ch1) && (ch1 <= startChar))) {
1919	/*
1920	* Matches ranges of form [a-z] or [z-a].
1921	*/
1922	break;
1923	}
1924	} else if (startChar == ch1) {
1925	break;
1926	}
1927	}
1928	while (*pattern != ']') {
1929	if (pattern == patternEnd) {
1930	pattern--;
1931	break;
1932	}
1933	pattern++;
1934	}
1935	pattern++;
1936	continue;
1937	}
1938
1939	/*
1940	* If the next pattern character is '\', just strip off the '\' so we
1941	* do exact matching on the character that follows.
1942	*/
1943
1944	if (p == '\\') {
1945	if (++pattern == patternEnd) {
1946	return 0;
1947	}
1948	}
1949
1950	/*
1951	* There's no special character. Just make sure that the next bytes of
1952	* each string match.
1953	*/
1954
1955	if (nocase) {
1956	if (Tcl_UniCharToLower(string) != Tcl_UniCharToLower(pattern)) {
1957	return 0;
1958	}
1959	} else if (string != pattern) {
1960	return 0;
1961	}
1962	string++;
1963	pattern++;
1964	}
1965	}
1966
1967	/*
1968	* Local Variables:
1969	* mode: c
1970	* c-basic-offset: 4
1971	* fill-column: 78
1972	* End:
1973	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: