Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: downloads/tcl8.5.2/generic/tclUtf.c @ 35

Last change on this file since 35 was 25, checked in by landauf, 18 years ago
added tcl to libs
File size: 45.7 KB

Rev	Line
[25]	1	/*
	2	* tclUtf.c --
	3	*
	4	* Routines for manipulating UTF-8 strings.
	5	*
	6	* Copyright (c) 1997-1998 Sun Microsystems, Inc.
	7	*
	8	* See the file "license.terms" for information on usage and redistribution of
	9	* this file, and for a DISCLAIMER OF ALL WARRANTIES.
	10	*
	11	* RCS: @(#) $Id: tclUtf.c,v 1.37 2005/10/31 15:59:41 dkf Exp $
	12	*/
	13
	14	#include "tclInt.h"
	15
	16	/*
	17	* Include the static character classification tables and macros.
	18	*/
	19
	20	#include "tclUniData.c"
	21
	22	/*
	23	* The following macros are used for fast character category tests. The x_BITS
	24	* values are shifted right by the category value to determine whether the
	25	* given category is included in the set.
	26	*/
	27
	28	#define ALPHA_BITS ((1 << UPPERCASE_LETTER) \| (1 << LOWERCASE_LETTER) \
	29	\| (1 << TITLECASE_LETTER) \| (1 << MODIFIER_LETTER) \| (1<<OTHER_LETTER))
	30
	31	#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
	32
	33	#define SPACE_BITS ((1 << SPACE_SEPARATOR) \| (1 << LINE_SEPARATOR) \
	34	\| (1 << PARAGRAPH_SEPARATOR))
	35
	36	#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
	37
	38	#define PRINT_BITS (ALPHA_BITS \| DIGIT_BITS \| SPACE_BITS \| \
	39	(1 << NON_SPACING_MARK) \| (1 << ENCLOSING_MARK) \| \
	40	(1 << COMBINING_SPACING_MARK) \| (1 << LETTER_NUMBER) \| \
	41	(1 << OTHER_NUMBER) \| (1 << CONNECTOR_PUNCTUATION) \| \
	42	(1 << DASH_PUNCTUATION) \| (1 << OPEN_PUNCTUATION) \| \
	43	(1 << CLOSE_PUNCTUATION) \| (1 << INITIAL_QUOTE_PUNCTUATION) \| \
	44	(1 << FINAL_QUOTE_PUNCTUATION) \| (1 << OTHER_PUNCTUATION) \| \
	45	(1 << MATH_SYMBOL) \| (1 << CURRENCY_SYMBOL) \| \
	46	(1 << MODIFIER_SYMBOL) \| (1 << OTHER_SYMBOL))
	47
	48	#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) \| \
	49	(1 << DASH_PUNCTUATION) \| (1 << OPEN_PUNCTUATION) \| \
	50	(1 << CLOSE_PUNCTUATION) \| (1 << INITIAL_QUOTE_PUNCTUATION) \| \
	51	(1 << FINAL_QUOTE_PUNCTUATION) \| (1 << OTHER_PUNCTUATION))
	52
	53	/*
	54	* Unicode characters less than this value are represented by themselves in
	55	* UTF-8 strings.
	56	*/
	57
	58	#define UNICODE_SELF 0x80
	59
	60	/*
	61	* The following structures are used when mapping between Unicode (UCS-2) and
	62	* UTF-8.
	63	*/
	64
	65	static CONST unsigned char totalBytes[256] = {
	66	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	67	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	68	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	69	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	70	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	71	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	72	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	73	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
	74	#if TCL_UTF_MAX > 3
	75	4,4,4,4,4,4,4,4,
	76	#else
	77	1,1,1,1,1,1,1,1,
	78	#endif
	79	#if TCL_UTF_MAX > 4
	80	5,5,5,5,
	81	#else
	82	1,1,1,1,
	83	#endif
	84	#if TCL_UTF_MAX > 5
	85	6,6,6,6
	86	#else
	87	1,1,1,1
	88	#endif
	89	};
	90
	91	/*
	92	* Functions used only in this module.
	93	*/
	94
	95	static int UtfCount(int ch);
	96
	97	/*
	98	*---------------------------------------------------------------------------
	99	*
	100	* UtfCount --
	101	*
	102	* Find the number of bytes in the Utf character "ch".
	103	*
	104	* Results:
	105	* The return value is the number of bytes in the Utf character "ch".
	106	*
	107	* Side effects:
	108	* None.
	109	*
	110	*---------------------------------------------------------------------------
	111	*/
	112
	113	INLINE static int
	114	UtfCount(
	115	int ch) /* The Tcl_UniChar whose size is returned. */
	116	{
	117	if ((ch > 0) && (ch < UNICODE_SELF)) {
	118	return 1;
	119	}
	120	if (ch <= 0x7FF) {
	121	return 2;
	122	}
	123	if (ch <= 0xFFFF) {
	124	return 3;
	125	}
	126	#if TCL_UTF_MAX > 3
	127	if (ch <= 0x1FFFFF) {
	128	return 4;
	129	}
	130	if (ch <= 0x3FFFFFF) {
	131	return 5;
	132	}
	133	if (ch <= 0x7FFFFFFF) {
	134	return 6;
	135	}
	136	#endif
	137	return 3;
	138	}
	139
	140	/*
	141	*---------------------------------------------------------------------------
	142	*
	143	* Tcl_UniCharToUtf --
	144	*
	145	* Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
	146	* provided buffer. Equivalent to Plan 9 runetochar().
	147	*
	148	* Results:
	149	* The return value is the number of bytes in the buffer that were
	150	* consumed.
	151	*
	152	* Side effects:
	153	* None.
	154	*
	155	*---------------------------------------------------------------------------
	156	*/
	157
	158	INLINE int
	159	Tcl_UniCharToUtf(
	160	int ch, /* The Tcl_UniChar to be stored in the
	161	* buffer. */
	162	char buf) / Buffer in which the UTF-8 representation of
	163	* the Tcl_UniChar is stored. Buffer must be
	164	* large enough to hold the UTF-8 character
	165	* (at most TCL_UTF_MAX bytes). */
	166	{
	167	if ((ch > 0) && (ch < UNICODE_SELF)) {
	168	buf[0] = (char) ch;
	169	return 1;
	170	}
	171	if (ch >= 0) {
	172	if (ch <= 0x7FF) {
	173	buf[1] = (char) ((ch \| 0x80) & 0xBF);
	174	buf[0] = (char) ((ch >> 6) \| 0xC0);
	175	return 2;
	176	}
	177	if (ch <= 0xFFFF) {
	178	three:
	179	buf[2] = (char) ((ch \| 0x80) & 0xBF);
	180	buf[1] = (char) (((ch >> 6) \| 0x80) & 0xBF);
	181	buf[0] = (char) ((ch >> 12) \| 0xE0);
	182	return 3;
	183	}
	184
	185	#if TCL_UTF_MAX > 3
	186	if (ch <= 0x1FFFFF) {
	187	buf[3] = (char) ((ch \| 0x80) & 0xBF);
	188	buf[2] = (char) (((ch >> 6) \| 0x80) & 0xBF);
	189	buf[1] = (char) (((ch >> 12) \| 0x80) & 0xBF);
	190	buf[0] = (char) ((ch >> 18) \| 0xF0);
	191	return 4;
	192	}
	193	if (ch <= 0x3FFFFFF) {
	194	buf[4] = (char) ((ch \| 0x80) & 0xBF);
	195	buf[3] = (char) (((ch >> 6) \| 0x80) & 0xBF);
	196	buf[2] = (char) (((ch >> 12) \| 0x80) & 0xBF);
	197	buf[1] = (char) (((ch >> 18) \| 0x80) & 0xBF);
	198	buf[0] = (char) ((ch >> 24) \| 0xF8);
	199	return 5;
	200	}
	201	if (ch <= 0x7FFFFFFF) {
	202	buf[5] = (char) ((ch \| 0x80) & 0xBF);
	203	buf[4] = (char) (((ch >> 6) \| 0x80) & 0xBF);
	204	buf[3] = (char) (((ch >> 12) \| 0x80) & 0xBF);
	205	buf[2] = (char) (((ch >> 18) \| 0x80) & 0xBF);
	206	buf[1] = (char) (((ch >> 24) \| 0x80) & 0xBF);
	207	buf[0] = (char) ((ch >> 30) \| 0xFC);
	208	return 6;
	209	}
	210	#endif
	211	}
	212
	213	ch = 0xFFFD;
	214	goto three;
	215	}
	216
	217	/*
	218	*---------------------------------------------------------------------------
	219	*
	220	* Tcl_UniCharToUtfDString --
	221	*
	222	* Convert the given Unicode string to UTF-8.
	223	*
	224	* Results:
	225	* The return value is a pointer to the UTF-8 representation of the
	226	* Unicode string. Storage for the return value is appended to the end of
	227	* dsPtr.
	228	*
	229	* Side effects:
	230	* None.
	231	*
	232	*---------------------------------------------------------------------------
	233	*/
	234
	235	char *
	236	Tcl_UniCharToUtfDString(
	237	CONST Tcl_UniChar uniStr, / Unicode string to convert to UTF-8. */
	238	int uniLength, /* Length of Unicode string in Tcl_UniChars
	239	* (must be >= 0). */
	240	Tcl_DString dsPtr) / UTF-8 representation of string is appended
	241	* to this previously initialized DString. */
	242	{
	243	CONST Tcl_UniChar w, wEnd;
	244	char p, string;
	245	int oldLength;
	246
	247	/*
	248	* UTF-8 string length in bytes will be <= Unicode string length *
	249	* TCL_UTF_MAX.
	250	*/
	251
	252	oldLength = Tcl_DStringLength(dsPtr);
	253	Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
	254	string = Tcl_DStringValue(dsPtr) + oldLength;
	255
	256	p = string;
	257	wEnd = uniStr + uniLength;
	258	for (w = uniStr; w < wEnd; ) {
	259	p += Tcl_UniCharToUtf(*w, p);
	260	w++;
	261	}
	262	Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
	263
	264	return string;
	265	}
	266
	267	/*
	268	*---------------------------------------------------------------------------
	269	*
	270	* Tcl_UtfToUniChar --
	271	*
	272	* Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8
	273	* sequences are converted to valid Tcl_UniChars and processing
	274	* continues. Equivalent to Plan 9 chartorune().
	275	*
	276	* The caller must ensure that the source buffer is long enough that this
	277	* routine does not run off the end and dereference non-existent memory
	278	* looking for trail bytes. If the source buffer is known to be '\0'
	279	* terminated, this cannot happen. Otherwise, the caller should call
	280	* Tcl_UtfCharComplete() before calling this routine to ensure that
	281	* enough bytes remain in the string.
	282	*
	283	* Results:
	284	* *chPtr is filled with the Tcl_UniChar, and the return value is the
	285	* number of bytes from the UTF-8 string that were consumed.
	286	*
	287	* Side effects:
	288	* None.
	289	*
	290	*---------------------------------------------------------------------------
	291	*/
	292
	293	int
	294	Tcl_UtfToUniChar(
	295	register CONST char src, / The UTF-8 string. */
	296	register Tcl_UniChar chPtr)/ Filled with the Tcl_UniChar represented by
	297	* the UTF-8 string. */
	298	{
	299	register int byte;
	300
	301	/*
	302	* Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
	303	*/
	304
	305	byte = ((unsigned char ) src);
	306	if (byte < 0xC0) {
	307	/*
	308	* Handles properly formed UTF-8 characters between 0x01 and 0x7F.
	309	* Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
	310	* characters representing themselves.
	311	*/
	312
	313	*chPtr = (Tcl_UniChar) byte;
	314	return 1;
	315	} else if (byte < 0xE0) {
	316	if ((src[1] & 0xC0) == 0x80) {
	317	/*
	318	* Two-byte-character lead-byte followed by a trail-byte.
	319	*/
	320
	321	*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) \| (src[1] & 0x3F));
	322	return 2;
	323	}
	324
	325	/*
	326	* A two-byte-character lead-byte not followed by trail-byte
	327	* represents itself.
	328	*/
	329
	330	*chPtr = (Tcl_UniChar) byte;
	331	return 1;
	332	} else if (byte < 0xF0) {
	333	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
	334	/*
	335	* Three-byte-character lead byte followed by two trail bytes.
	336	*/
	337
	338	*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
	339	\| ((src[1] & 0x3F) << 6) \| (src[2] & 0x3F));
	340	return 3;
	341	}
	342
	343	/*
	344	* A three-byte-character lead-byte not followed by two trail-bytes
	345	* represents itself.
	346	*/
	347
	348	*chPtr = (Tcl_UniChar) byte;
	349	return 1;
	350	}
	351	#if TCL_UTF_MAX > 3
	352	{
	353	int ch, total, trail;
	354
	355	total = totalBytes[byte];
	356	trail = total - 1;
	357	if (trail > 0) {
	358	ch = byte & (0x3F >> trail);
	359	do {
	360	src++;
	361	if ((*src & 0xC0) != 0x80) {
	362	*chPtr = byte;
	363	return 1;
	364	}
	365	ch <<= 6;
	366	ch \|= (*src & 0x3F);
	367	trail--;
	368	} while (trail > 0);
	369	*chPtr = ch;
	370	return total;
	371	}
	372	}
	373	#endif
	374
	375	*chPtr = (Tcl_UniChar) byte;
	376	return 1;
	377	}
	378
	379	/*
	380	*---------------------------------------------------------------------------
	381	*
	382	* Tcl_UtfToUniCharDString --
	383	*
	384	* Convert the UTF-8 string to Unicode.
	385	*
	386	* Results:
	387	* The return value is a pointer to the Unicode representation of the
	388	* UTF-8 string. Storage for the return value is appended to the end of
	389	* dsPtr. The Unicode string is terminated with a Unicode NULL character.
	390	*
	391	* Side effects:
	392	* None.
	393	*
	394	*---------------------------------------------------------------------------
	395	*/
	396
	397	Tcl_UniChar *
	398	Tcl_UtfToUniCharDString(
	399	CONST char src, / UTF-8 string to convert to Unicode. */
	400	int length, /* Length of UTF-8 string in bytes, or -1 for
	401	* strlen(). */
	402	Tcl_DString dsPtr) / Unicode representation of string is
	403	* appended to this previously initialized
	404	* DString. */
	405	{
	406	Tcl_UniChar w, wString;
	407	CONST char p, end;
	408	int oldLength;
	409
	410	if (length < 0) {
	411	length = strlen(src);
	412	}
	413
	414	/*
	415	* Unicode string length in Tcl_UniChars will be <= UTF-8 string length in
	416	* bytes.
	417	*/
	418
	419	oldLength = Tcl_DStringLength(dsPtr);
	420	Tcl_DStringSetLength(dsPtr,
	421	(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
	422	wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
	423
	424	w = wString;
	425	end = src + length;
	426	for (p = src; p < end; ) {
	427	p += TclUtfToUniChar(p, w);
	428	w++;
	429	}
	430	*w = '\0';
	431	Tcl_DStringSetLength(dsPtr,
	432	(oldLength + ((char ) w - (char ) wString)));
	433
	434	return wString;
	435	}
	436
	437	/*
	438	*---------------------------------------------------------------------------
	439	*
	440	* Tcl_UtfCharComplete --
	441	*
	442	* Determine if the UTF-8 string of the given length is long enough to be
	443	* decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8
	444	* string is properly formed. Equivalent to Plan 9 fullrune().
	445	*
	446	* Results:
	447	* The return value is 0 if the string is not long enough, non-zero
	448	* otherwise.
	449	*
	450	* Side effects:
	451	* None.
	452	*
	453	*---------------------------------------------------------------------------
	454	*/
	455
	456	int
	457	Tcl_UtfCharComplete(
	458	CONST char src, / String to check if first few bytes contain
	459	* a complete UTF-8 character. */
	460	int length) /* Length of above string in bytes. */
	461	{
	462	int ch;
	463
	464	ch = ((unsigned char ) src);
	465	return length >= totalBytes[ch];
	466	}
	467
	468	/*
	469	*---------------------------------------------------------------------------
	470	*
	471	* Tcl_NumUtfChars --
	472	*
	473	* Returns the number of characters (not bytes) in the UTF-8 string, not
	474	* including the terminating NULL byte. This is equivalent to Plan 9
	475	* utflen() and utfnlen().
	476	*
	477	* Results:
	478	* As above.
	479	*
	480	* Side effects:
	481	* None.
	482	*
	483	*---------------------------------------------------------------------------
	484	*/
	485
	486	int
	487	Tcl_NumUtfChars(
	488	register CONST char src, / The UTF-8 string to measure. */
	489	int length) /* The length of the string in bytes, or -1
	490	* for strlen(string). */
	491	{
	492	Tcl_UniChar ch;
	493	register Tcl_UniChar *chPtr = &ch;
	494	register int i;
	495
	496	/*
	497	* The separate implementations are faster.
	498	*
	499	* Since this is a time-sensitive function, we also do the check for the
	500	* single-byte char case specially.
	501	*/
	502
	503	i = 0;
	504	if (length < 0) {
	505	while (*src != '\0') {
	506	src += TclUtfToUniChar(src, chPtr);
	507	i++;
	508	}
	509	} else {
	510	register int n;
	511
	512	while (length > 0) {
	513	if (UCHAR(*src) < 0xC0) {
	514	length--;
	515	src++;
	516	} else {
	517	n = Tcl_UtfToUniChar(src, chPtr);
	518	length -= n;
	519	src += n;
	520	}
	521	i++;
	522	}
	523	}
	524	return i;
	525	}
	526
	527	/*
	528	*---------------------------------------------------------------------------
	529	*
	530	* Tcl_UtfFindFirst --
	531	*
	532	* Returns a pointer to the first occurrence of the given Tcl_UniChar in
	533	* the NULL-terminated UTF-8 string. The NULL terminator is considered
	534	* part of the UTF-8 string. Equivalent to Plan 9 utfrune().
	535	*
	536	* Results:
	537	* As above. If the Tcl_UniChar does not exist in the given string, the
	538	* return value is NULL.
	539	*
	540	* Side effects:
	541	* None.
	542	*
	543	*---------------------------------------------------------------------------
	544	*/
	545
	546	CONST char *
	547	Tcl_UtfFindFirst(
	548	CONST char src, / The UTF-8 string to be searched. */
	549	int ch) /* The Tcl_UniChar to search for. */
	550	{
	551	int len;
	552	Tcl_UniChar find;
	553
	554	while (1) {
	555	len = TclUtfToUniChar(src, &find);
	556	if (find == ch) {
	557	return src;
	558	}
	559	if (*src == '\0') {
	560	return NULL;
	561	}
	562	src += len;
	563	}
	564	}
	565
	566	/*
	567	*---------------------------------------------------------------------------
	568	*
	569	* Tcl_UtfFindLast --
	570	*
	571	* Returns a pointer to the last occurrence of the given Tcl_UniChar in
	572	* the NULL-terminated UTF-8 string. The NULL terminator is considered
	573	* part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
	574	*
	575	* Results:
	576	* As above. If the Tcl_UniChar does not exist in the given string, the
	577	* return value is NULL.
	578	*
	579	* Side effects:
	580	* None.
	581	*
	582	*---------------------------------------------------------------------------
	583	*/
	584
	585	CONST char *
	586	Tcl_UtfFindLast(
	587	CONST char src, / The UTF-8 string to be searched. */
	588	int ch) /* The Tcl_UniChar to search for. */
	589	{
	590	int len;
	591	Tcl_UniChar find;
	592	CONST char *last;
	593
	594	last = NULL;
	595	while (1) {
	596	len = TclUtfToUniChar(src, &find);
	597	if (find == ch) {
	598	last = src;
	599	}
	600	if (*src == '\0') {
	601	break;
	602	}
	603	src += len;
	604	}
	605	return last;
	606	}
	607
	608	/*
	609	*---------------------------------------------------------------------------
	610	*
	611	* Tcl_UtfNext --
	612	*
	613	* Given a pointer to some current location in a UTF-8 string, move
	614	* forward one character. The caller must ensure that they are not asking
	615	* for the next character after the last character in the string.
	616	*
	617	* Results:
	618	* The return value is the pointer to the next character in the UTF-8
	619	* string.
	620	*
	621	* Side effects:
	622	* None.
	623	*
	624	*---------------------------------------------------------------------------
	625	*/
	626
	627	CONST char *
	628	Tcl_UtfNext(
	629	CONST char src) / The current location in the string. */
	630	{
	631	Tcl_UniChar ch;
	632
	633	return src + TclUtfToUniChar(src, &ch);
	634	}
	635
	636	/*
	637	*---------------------------------------------------------------------------
	638	*
	639	* Tcl_UtfPrev --
	640	*
	641	* Given a pointer to some current location in a UTF-8 string, move
	642	* backwards one character. This works correctly when the pointer is in
	643	* the middle of a UTF-8 character.
	644	*
	645	* Results:
	646	* The return value is a pointer to the previous character in the UTF-8
	647	* string. If the current location was already at the beginning of the
	648	* string, the return value will also be a pointer to the beginning of
	649	* the string.
	650	*
	651	* Side effects:
	652	* None.
	653	*
	654	*---------------------------------------------------------------------------
	655	*/
	656
	657	CONST char *
	658	Tcl_UtfPrev(
	659	CONST char src, / The current location in the string. */
	660	CONST char start) / Pointer to the beginning of the string, to
	661	* avoid going backwards too far. */
	662	{
	663	CONST char *look;
	664	int i, byte;
	665
	666	src--;
	667	look = src;
	668	for (i = 0; i < TCL_UTF_MAX; i++) {
	669	if (look < start) {
	670	if (src < start) {
	671	src = start;
	672	}
	673	break;
	674	}
	675	byte = ((unsigned char ) look);
	676	if (byte < 0x80) {
	677	break;
	678	}
	679	if (byte >= 0xC0) {
	680	return look;
	681	}
	682	look--;
	683	}
	684	return src;
	685	}
	686
	687	/*
	688	*---------------------------------------------------------------------------
	689	*
	690	* Tcl_UniCharAtIndex --
	691	*
	692	* Returns the Unicode character represented at the specified character
	693	* (not byte) position in the UTF-8 string.
	694	*
	695	* Results:
	696	* As above.
	697	*
	698	* Side effects:
	699	* None.
	700	*
	701	*---------------------------------------------------------------------------
	702	*/
	703
	704	Tcl_UniChar
	705	Tcl_UniCharAtIndex(
	706	register CONST char src, / The UTF-8 string to dereference. */
	707	register int index) /* The position of the desired character. */
	708	{
	709	Tcl_UniChar ch;
	710
	711	while (index >= 0) {
	712	index--;
	713	src += TclUtfToUniChar(src, &ch);
	714	}
	715	return ch;
	716	}
	717
	718	/*
	719	*---------------------------------------------------------------------------
	720	*
	721	* Tcl_UtfAtIndex --
	722	*
	723	* Returns a pointer to the specified character (not byte) position in
	724	* the UTF-8 string.
	725	*
	726	* Results:
	727	* As above.
	728	*
	729	* Side effects:
	730	* None.
	731	*
	732	*---------------------------------------------------------------------------
	733	*/
	734
	735	CONST char *
	736	Tcl_UtfAtIndex(
	737	register CONST char src, / The UTF-8 string. */
	738	register int index) /* The position of the desired character. */
	739	{
	740	Tcl_UniChar ch;
	741
	742	while (index > 0) {
	743	index--;
	744	src += TclUtfToUniChar(src, &ch);
	745	}
	746	return src;
	747	}
	748
	749	/*
	750	*---------------------------------------------------------------------------
	751	*
	752	* Tcl_UtfBackslash --
	753	*
	754	* Figure out how to handle a backslash sequence.
	755	*
	756	* Results:
	757	* Stores the bytes represented by the backslash sequence in dst and
	758	* returns the number of bytes written to dst. At most TCL_UTF_MAX bytes
	759	* are written to dst; dst must have been large enough to accept those
	760	* bytes. If readPtr isn't NULL then it is filled in with a count of the
	761	* number of bytes in the backslash sequence.
	762	*
	763	* Side effects:
	764	* The maximum number of bytes it takes to represent a Unicode character
	765	* in UTF-8 is guaranteed to be less than the number of bytes used to
	766	* express the backslash sequence that represents that Unicode character.
	767	* If the target buffer into which the caller is going to store the bytes
	768	* that represent the Unicode character is at least as large as the
	769	* source buffer from which the backslashed sequence was extracted, no
	770	* buffer overruns should occur.
	771	*
	772	*---------------------------------------------------------------------------
	773	*/
	774
	775	int
	776	Tcl_UtfBackslash(
	777	CONST char src, / Points to the backslash character of a
	778	* backslash sequence. */
	779	int readPtr, / Fill in with number of characters read from
	780	* src, unless NULL. */
	781	char dst) / Filled with the bytes represented by the
	782	* backslash sequence. */
	783	{
	784	#define LINE_LENGTH 128
	785	int numRead;
	786	int result;
	787
	788	result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
	789	if (numRead == LINE_LENGTH) {
	790	/*
	791	* We ate a whole line. Pay the price of a strlen()
	792	*/
	793
	794	result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
	795	}
	796	if (readPtr != NULL) {
	797	*readPtr = numRead;
	798	}
	799	return result;
	800	}
	801
	802	/*
	803	*----------------------------------------------------------------------
	804	*
	805	* Tcl_UtfToUpper --
	806	*
	807	* Convert lowercase characters to uppercase characters in a UTF string
	808	* in place. The conversion may shrink the UTF string.
	809	*
	810	* Results:
	811	* Returns the number of bytes in the resulting string excluding the
	812	* trailing null.
	813	*
	814	* Side effects:
	815	* Writes a terminating null after the last converted character.
	816	*
	817	*----------------------------------------------------------------------
	818	*/
	819
	820	int
	821	Tcl_UtfToUpper(
	822	char str) / String to convert in place. */
	823	{
	824	Tcl_UniChar ch, upChar;
	825	char src, dst;
	826	int bytes;
	827
	828	/*
	829	* Iterate over the string until we hit the terminating null.
	830	*/
	831
	832	src = dst = str;
	833	while (*src) {
	834	bytes = TclUtfToUniChar(src, &ch);
	835	upChar = Tcl_UniCharToUpper(ch);
	836
	837	/*
	838	* To keep badly formed Utf strings from getting inflated by the
	839	* conversion (thereby causing a segfault), only copy the upper case
	840	* char to dst if its size is <= the original char.
	841	*/
	842
	843	if (bytes < UtfCount(upChar)) {
	844	memcpy(dst, src, (size_t) bytes);
	845	dst += bytes;
	846	} else {
	847	dst += Tcl_UniCharToUtf(upChar, dst);
	848	}
	849	src += bytes;
	850	}
	851	*dst = '\0';
	852	return (dst - str);
	853	}
	854
	855	/*
	856	*----------------------------------------------------------------------
	857	*
	858	* Tcl_UtfToLower --
	859	*
	860	* Convert uppercase characters to lowercase characters in a UTF string
	861	* in place. The conversion may shrink the UTF string.
	862	*
	863	* Results:
	864	* Returns the number of bytes in the resulting string excluding the
	865	* trailing null.
	866	*
	867	* Side effects:
	868	* Writes a terminating null after the last converted character.
	869	*
	870	*----------------------------------------------------------------------
	871	*/
	872
	873	int
	874	Tcl_UtfToLower(
	875	char str) / String to convert in place. */
	876	{
	877	Tcl_UniChar ch, lowChar;
	878	char src, dst;
	879	int bytes;
	880
	881	/*
	882	* Iterate over the string until we hit the terminating null.
	883	*/
	884
	885	src = dst = str;
	886	while (*src) {
	887	bytes = TclUtfToUniChar(src, &ch);
	888	lowChar = Tcl_UniCharToLower(ch);
	889
	890	/*
	891	* To keep badly formed Utf strings from getting inflated by the
	892	* conversion (thereby causing a segfault), only copy the lower case
	893	* char to dst if its size is <= the original char.
	894	*/
	895
	896	if (bytes < UtfCount(lowChar)) {
	897	memcpy(dst, src, (size_t) bytes);
	898	dst += bytes;
	899	} else {
	900	dst += Tcl_UniCharToUtf(lowChar, dst);
	901	}
	902	src += bytes;
	903	}
	904	*dst = '\0';
	905	return (dst - str);
	906	}
	907
	908	/*
	909	*----------------------------------------------------------------------
	910	*
	911	* Tcl_UtfToTitle --
	912	*
	913	* Changes the first character of a UTF string to title case or uppercase
	914	* and the rest of the string to lowercase. The conversion happens in
	915	* place and may shrink the UTF string.
	916	*
	917	* Results:
	918	* Returns the number of bytes in the resulting string excluding the
	919	* trailing null.
	920	*
	921	* Side effects:
	922	* Writes a terminating null after the last converted character.
	923	*
	924	*----------------------------------------------------------------------
	925	*/
	926
	927	int
	928	Tcl_UtfToTitle(
	929	char str) / String to convert in place. */
	930	{
	931	Tcl_UniChar ch, titleChar, lowChar;
	932	char src, dst;
	933	int bytes;
	934
	935	/*
	936	* Capitalize the first character and then lowercase the rest of the
	937	* characters until we get to a null.
	938	*/
	939
	940	src = dst = str;
	941
	942	if (*src) {
	943	bytes = TclUtfToUniChar(src, &ch);
	944	titleChar = Tcl_UniCharToTitle(ch);
	945
	946	if (bytes < UtfCount(titleChar)) {
	947	memcpy(dst, src, (size_t) bytes);
	948	dst += bytes;
	949	} else {
	950	dst += Tcl_UniCharToUtf(titleChar, dst);
	951	}
	952	src += bytes;
	953	}
	954	while (*src) {
	955	bytes = TclUtfToUniChar(src, &ch);
	956	lowChar = Tcl_UniCharToLower(ch);
	957
	958	if (bytes < UtfCount(lowChar)) {
	959	memcpy(dst, src, (size_t) bytes);
	960	dst += bytes;
	961	} else {
	962	dst += Tcl_UniCharToUtf(lowChar, dst);
	963	}
	964	src += bytes;
	965	}
	966	*dst = '\0';
	967	return (dst - str);
	968	}
	969
	970	/*
	971	*----------------------------------------------------------------------
	972	*
	973	* TclpUtfNcmp2 --
	974	*
	975	* Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and
	976	* ct are assumed to be at least numBytes bytes long.
	977	*
	978	* Results:
	979	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
	980	*
	981	* Side effects:
	982	* None.
	983	*
	984	*----------------------------------------------------------------------
	985	*/
	986
	987	int
	988	TclpUtfNcmp2(
	989	CONST char cs, / UTF string to compare to ct. */
	990	CONST char ct, / UTF string cs is compared to. */
	991	unsigned long numBytes) /* Number of bytes to compare. */
	992	{
	993	/*
	994	* We can't simply call 'memcmp(cs, ct, numBytes);' because we need to
	995	* check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes
	996	* fine in the strcmp manner.
	997	*/
	998
	999	register int result = 0;
	1000
	1001	for ( ; numBytes != 0; numBytes--, cs++, ct++) {
	1002	if (cs != ct) {
	1003	result = UCHAR(cs) - UCHAR(ct);
	1004	break;
	1005	}
	1006	}
	1007	if (numBytes && ((UCHAR(cs) == 0xC0) \|\| (UCHAR(ct) == 0xC0))) {
	1008	unsigned char c1, c2;
	1009
	1010	c1 = ((UCHAR(cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(cs);
	1011	c2 = ((UCHAR(ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(ct);
	1012	result = (c1 - c2);
	1013	}
	1014	return result;
	1015	}
	1016
	1017	/*
	1018	*----------------------------------------------------------------------
	1019	*
	1020	* Tcl_UtfNcmp --
	1021	*
	1022	* Compare at most numChars UTF chars of string cs to string ct. Both cs
	1023	* and ct are assumed to be at least numChars UTF chars long.
	1024	*
	1025	* Results:
	1026	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
	1027	*
	1028	* Side effects:
	1029	* None.
	1030	*
	1031	*----------------------------------------------------------------------
	1032	*/
	1033
	1034	int
	1035	Tcl_UtfNcmp(
	1036	CONST char cs, / UTF string to compare to ct. */
	1037	CONST char ct, / UTF string cs is compared to. */
	1038	unsigned long numChars) /* Number of UTF chars to compare. */
	1039	{
	1040	Tcl_UniChar ch1, ch2;
	1041
	1042	/*
	1043	* Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
	1044	* pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
	1045	* (the byte 0x01.)
	1046	*/
	1047
	1048	while (numChars-- > 0) {
	1049	/*
	1050	* n must be interpreted as chars, not bytes. This should be called
	1051	* only when both strings are of at least n chars long (no need for \0
	1052	* check)
	1053	*/
	1054
	1055	cs += TclUtfToUniChar(cs, &ch1);
	1056	ct += TclUtfToUniChar(ct, &ch2);
	1057	if (ch1 != ch2) {
	1058	return (ch1 - ch2);
	1059	}
	1060	}
	1061	return 0;
	1062	}
	1063
	1064	/*
	1065	*----------------------------------------------------------------------
	1066	*
	1067	* Tcl_UtfNcasecmp --
	1068	*
	1069	* Compare at most numChars UTF chars of string cs to string ct case
	1070	* insensitive. Both cs and ct are assumed to be at least numChars UTF
	1071	* chars long.
	1072	*
	1073	* Results:
	1074	* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
	1075	*
	1076	* Side effects:
	1077	* None.
	1078	*
	1079	*----------------------------------------------------------------------
	1080	*/
	1081
	1082	int
	1083	Tcl_UtfNcasecmp(
	1084	CONST char cs, / UTF string to compare to ct. */
	1085	CONST char ct, / UTF string cs is compared to. */
	1086	unsigned long numChars) /* Number of UTF chars to compare. */
	1087	{
	1088	Tcl_UniChar ch1, ch2;
	1089	while (numChars-- > 0) {
	1090	/*
	1091	* n must be interpreted as chars, not bytes.
	1092	* This should be called only when both strings are of
	1093	* at least n chars long (no need for \0 check)
	1094	*/
	1095	cs += TclUtfToUniChar(cs, &ch1);
	1096	ct += TclUtfToUniChar(ct, &ch2);
	1097	if (ch1 != ch2) {
	1098	ch1 = Tcl_UniCharToLower(ch1);
	1099	ch2 = Tcl_UniCharToLower(ch2);
	1100	if (ch1 != ch2) {
	1101	return (ch1 - ch2);
	1102	}
	1103	}
	1104	}
	1105	return 0;
	1106	}
	1107
	1108	/*
	1109	*----------------------------------------------------------------------
	1110	*
	1111	* Tcl_UniCharToUpper --
	1112	*
	1113	* Compute the uppercase equivalent of the given Unicode character.
	1114	*
	1115	* Results:
	1116	* Returns the uppercase Unicode character.
	1117	*
	1118	* Side effects:
	1119	* None.
	1120	*
	1121	*----------------------------------------------------------------------
	1122	*/
	1123
	1124	Tcl_UniChar
	1125	Tcl_UniCharToUpper(
	1126	int ch) /* Unicode character to convert. */
	1127	{
	1128	int info = GetUniCharInfo(ch);
	1129
	1130	if (GetCaseType(info) & 0x04) {
	1131	return (Tcl_UniChar) (ch - GetDelta(info));
	1132	} else {
	1133	return ch;
	1134	}
	1135	}
	1136
	1137	/*
	1138	*----------------------------------------------------------------------
	1139	*
	1140	* Tcl_UniCharToLower --
	1141	*
	1142	* Compute the lowercase equivalent of the given Unicode character.
	1143	*
	1144	* Results:
	1145	* Returns the lowercase Unicode character.
	1146	*
	1147	* Side effects:
	1148	* None.
	1149	*
	1150	*----------------------------------------------------------------------
	1151	*/
	1152
	1153	Tcl_UniChar
	1154	Tcl_UniCharToLower(
	1155	int ch) /* Unicode character to convert. */
	1156	{
	1157	int info = GetUniCharInfo(ch);
	1158
	1159	if (GetCaseType(info) & 0x02) {
	1160	return (Tcl_UniChar) (ch + GetDelta(info));
	1161	} else {
	1162	return ch;
	1163	}
	1164	}
	1165
	1166	/*
	1167	*----------------------------------------------------------------------
	1168	*
	1169	* Tcl_UniCharToTitle --
	1170	*
	1171	* Compute the titlecase equivalent of the given Unicode character.
	1172	*
	1173	* Results:
	1174	* Returns the titlecase Unicode character.
	1175	*
	1176	* Side effects:
	1177	* None.
	1178	*
	1179	*----------------------------------------------------------------------
	1180	*/
	1181
	1182	Tcl_UniChar
	1183	Tcl_UniCharToTitle(
	1184	int ch) /* Unicode character to convert. */
	1185	{
	1186	int info = GetUniCharInfo(ch);
	1187	int mode = GetCaseType(info);
	1188
	1189	if (mode & 0x1) {
	1190	/*
	1191	* Subtract or add one depending on the original case.
	1192	*/
	1193
	1194	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
	1195	} else if (mode == 0x4) {
	1196	return (Tcl_UniChar) (ch - GetDelta(info));
	1197	} else {
	1198	return ch;
	1199	}
	1200	}
	1201
	1202	/*
	1203	*----------------------------------------------------------------------
	1204	*
	1205	* Tcl_UniCharLen --
	1206	*
	1207	* Find the length of a UniChar string. The str input must be null
	1208	* terminated.
	1209	*
	1210	* Results:
	1211	* Returns the length of str in UniChars (not bytes).
	1212	*
	1213	* Side effects:
	1214	* None.
	1215	*
	1216	*----------------------------------------------------------------------
	1217	*/
	1218
	1219	int
	1220	Tcl_UniCharLen(
	1221	CONST Tcl_UniChar uniStr) / Unicode string to find length of. */
	1222	{
	1223	int len = 0;
	1224
	1225	while (*uniStr != '\0') {
	1226	len++;
	1227	uniStr++;
	1228	}
	1229	return len;
	1230	}
	1231
	1232	/*
	1233	*----------------------------------------------------------------------
	1234	*
	1235	* Tcl_UniCharNcmp --
	1236	*
	1237	* Compare at most numChars unichars of string ucs to string uct.
	1238	* Both ucs and uct are assumed to be at least numChars unichars long.
	1239	*
	1240	* Results:
	1241	* Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
	1242	*
	1243	* Side effects:
	1244	* None.
	1245	*
	1246	*----------------------------------------------------------------------
	1247	*/
	1248
	1249	int
	1250	Tcl_UniCharNcmp(
	1251	CONST Tcl_UniChar ucs, / Unicode string to compare to uct. */
	1252	CONST Tcl_UniChar uct, / Unicode string ucs is compared to. */
	1253	unsigned long numChars) /* Number of unichars to compare. */
	1254	{
	1255	#ifdef WORDS_BIGENDIAN
	1256	/*
	1257	* We are definitely on a big-endian machine; memcmp() is safe
	1258	*/
	1259
	1260	return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));
	1261
	1262	#else /* !WORDS_BIGENDIAN */
	1263	/*
	1264	* We can't simply call memcmp() because that is not lexically correct.
	1265	*/
	1266
	1267	for ( ; numChars != 0; ucs++, uct++, numChars--) {
	1268	if (ucs != uct) {
	1269	return (ucs - uct);
	1270	}
	1271	}
	1272	return 0;
	1273	#endif /* WORDS_BIGENDIAN */
	1274	}
	1275
	1276	/*
	1277	*----------------------------------------------------------------------
	1278	*
	1279	* Tcl_UniCharNcasecmp --
	1280	*
	1281	* Compare at most numChars unichars of string ucs to string uct case
	1282	* insensitive. Both ucs and uct are assumed to be at least numChars
	1283	* unichars long.
	1284	*
	1285	* Results:
	1286	* Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
	1287	*
	1288	* Side effects:
	1289	* None.
	1290	*
	1291	*----------------------------------------------------------------------
	1292	*/
	1293
	1294	int
	1295	Tcl_UniCharNcasecmp(
	1296	CONST Tcl_UniChar ucs, / Unicode string to compare to uct. */
	1297	CONST Tcl_UniChar uct, / Unicode string ucs is compared to. */
	1298	unsigned long numChars) /* Number of unichars to compare. */
	1299	{
	1300	for ( ; numChars != 0; numChars--, ucs++, uct++) {
	1301	if (ucs != uct) {
	1302	Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
	1303	Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
	1304
	1305	if (lcs != lct) {
	1306	return (lcs - lct);
	1307	}
	1308	}
	1309	}
	1310	return 0;
	1311	}
	1312
	1313	/*
	1314	*----------------------------------------------------------------------
	1315	*
	1316	* Tcl_UniCharIsAlnum --
	1317	*
	1318	* Test if a character is an alphanumeric Unicode character.
	1319	*
	1320	* Results:
	1321	* Returns 1 if character is alphanumeric.
	1322	*
	1323	* Side effects:
	1324	* None.
	1325	*
	1326	*----------------------------------------------------------------------
	1327	*/
	1328
	1329	int
	1330	Tcl_UniCharIsAlnum(
	1331	int ch) /* Unicode character to test. */
	1332	{
	1333	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
	1334
	1335	return (((ALPHA_BITS \| DIGIT_BITS) >> category) & 1);
	1336	}
	1337
	1338	/*
	1339	*----------------------------------------------------------------------
	1340	*
	1341	* Tcl_UniCharIsAlpha --
	1342	*
	1343	* Test if a character is an alphabetic Unicode character.
	1344	*
	1345	* Results:
	1346	* Returns 1 if character is alphabetic.
	1347	*
	1348	* Side effects:
	1349	* None.
	1350	*
	1351	*----------------------------------------------------------------------
	1352	*/
	1353
	1354	int
	1355	Tcl_UniCharIsAlpha(
	1356	int ch) /* Unicode character to test. */
	1357	{
	1358	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
	1359	return ((ALPHA_BITS >> category) & 1);
	1360	}
	1361
	1362	/*
	1363	*----------------------------------------------------------------------
	1364	*
	1365	* Tcl_UniCharIsControl --
	1366	*
	1367	* Test if a character is a Unicode control character.
	1368	*
	1369	* Results:
	1370	* Returns non-zero if character is a control.
	1371	*
	1372	* Side effects:
	1373	* None.
	1374	*
	1375	*----------------------------------------------------------------------
	1376	*/
	1377
	1378	int
	1379	Tcl_UniCharIsControl(
	1380	int ch) /* Unicode character to test. */
	1381	{
	1382	return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
	1383	}
	1384
	1385	/*
	1386	*----------------------------------------------------------------------
	1387	*
	1388	* Tcl_UniCharIsDigit --
	1389	*
	1390	* Test if a character is a numeric Unicode character.
	1391	*
	1392	* Results:
	1393	* Returns non-zero if character is a digit.
	1394	*
	1395	* Side effects:
	1396	* None.
	1397	*
	1398	*----------------------------------------------------------------------
	1399	*/
	1400
	1401	int
	1402	Tcl_UniCharIsDigit(
	1403	int ch) /* Unicode character to test. */
	1404	{
	1405	return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER;
	1406	}
	1407
	1408	/*
	1409	*----------------------------------------------------------------------
	1410	*
	1411	* Tcl_UniCharIsGraph --
	1412	*
	1413	* Test if a character is any Unicode print character except space.
	1414	*
	1415	* Results:
	1416	* Returns non-zero if character is printable, but not space.
	1417	*
	1418	* Side effects:
	1419	* None.
	1420	*
	1421	*----------------------------------------------------------------------
	1422	*/
	1423
	1424	int
	1425	Tcl_UniCharIsGraph(
	1426	int ch) /* Unicode character to test. */
	1427	{
	1428	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
	1429	return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
	1430	}
	1431
	1432	/*
	1433	*----------------------------------------------------------------------
	1434	*
	1435	* Tcl_UniCharIsLower --
	1436	*
	1437	* Test if a character is a lowercase Unicode character.
	1438	*
	1439	* Results:
	1440	* Returns non-zero if character is lowercase.
	1441	*
	1442	* Side effects:
	1443	* None.
	1444	*
	1445	*----------------------------------------------------------------------
	1446	*/
	1447
	1448	int
	1449	Tcl_UniCharIsLower(
	1450	int ch) /* Unicode character to test. */
	1451	{
	1452	return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
	1453	}
	1454
	1455	/*
	1456	*----------------------------------------------------------------------
	1457	*
	1458	* Tcl_UniCharIsPrint --
	1459	*
	1460	* Test if a character is a Unicode print character.
	1461	*
	1462	* Results:
	1463	* Returns non-zero if character is printable.
	1464	*
	1465	* Side effects:
	1466	* None.
	1467	*
	1468	*----------------------------------------------------------------------
	1469	*/
	1470
	1471	int
	1472	Tcl_UniCharIsPrint(
	1473	int ch) /* Unicode character to test. */
	1474	{
	1475	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
	1476	return ((PRINT_BITS >> category) & 1);
	1477	}
	1478
	1479	/*
	1480	*----------------------------------------------------------------------
	1481	*
	1482	* Tcl_UniCharIsPunct --
	1483	*
	1484	* Test if a character is a Unicode punctuation character.
	1485	*
	1486	* Results:
	1487	* Returns non-zero if character is punct.
	1488	*
	1489	* Side effects:
	1490	* None.
	1491	*
	1492	*----------------------------------------------------------------------
	1493	*/
	1494
	1495	int
	1496	Tcl_UniCharIsPunct(
	1497	int ch) /* Unicode character to test. */
	1498	{
	1499	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
	1500	return ((PUNCT_BITS >> category) & 1);
	1501	}
	1502
	1503	/*
	1504	*----------------------------------------------------------------------
	1505	*
	1506	* Tcl_UniCharIsSpace --
	1507	*
	1508	* Test if a character is a whitespace Unicode character.
	1509	*
	1510	* Results:
	1511	* Returns non-zero if character is a space.
	1512	*
	1513	* Side effects:
	1514	* None.
	1515	*
	1516	*----------------------------------------------------------------------
	1517	*/
	1518
	1519	int
	1520	Tcl_UniCharIsSpace(
	1521	int ch) /* Unicode character to test. */
	1522	{
	1523	register int category;
	1524
	1525	/*
	1526	* If the character is within the first 127 characters, just use the
	1527	* standard C function, otherwise consult the Unicode table.
	1528	*/
	1529
	1530	if (ch < 0x80) {
	1531	return isspace(UCHAR(ch)); /* INTL: ISO space */
	1532	} else {
	1533	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
	1534	return ((SPACE_BITS >> category) & 1);
	1535	}
	1536	}
	1537
	1538	/*
	1539	*----------------------------------------------------------------------
	1540	*
	1541	* Tcl_UniCharIsUpper --
	1542	*
	1543	* Test if a character is a uppercase Unicode character.
	1544	*
	1545	* Results:
	1546	* Returns non-zero if character is uppercase.
	1547	*
	1548	* Side effects:
	1549	* None.
	1550	*
	1551	*----------------------------------------------------------------------
	1552	*/
	1553
	1554	int
	1555	Tcl_UniCharIsUpper(
	1556	int ch) /* Unicode character to test. */
	1557	{
	1558	return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
	1559	}
	1560
	1561	/*
	1562	*----------------------------------------------------------------------
	1563	*
	1564	* Tcl_UniCharIsWordChar --
	1565	*
	1566	* Test if a character is alphanumeric or a connector punctuation mark.
	1567	*
	1568	* Results:
	1569	* Returns 1 if character is a word character.
	1570	*
	1571	* Side effects:
	1572	* None.
	1573	*
	1574	*----------------------------------------------------------------------
	1575	*/
	1576
	1577	int
	1578	Tcl_UniCharIsWordChar(
	1579	int ch) /* Unicode character to test. */
	1580	{
	1581	register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
	1582
	1583	return (((ALPHA_BITS \| DIGIT_BITS \| CONNECTOR_BITS) >> category) & 1);
	1584	}
	1585
	1586	/*
	1587	*----------------------------------------------------------------------
	1588	*
	1589	* Tcl_UniCharCaseMatch --
	1590	*
	1591	* See if a particular Unicode string matches a particular pattern.
	1592	* Allows case insensitivity. This is the Unicode equivalent of the char*
	1593	* Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated.
	1594	* This has no provision for counted UniChar strings, thus should not be
	1595	* used where NULLs are expected in the UniChar string. Use
	1596	* TclUniCharMatch where possible.
	1597	*
	1598	* Results:
	1599	* The return value is 1 if string matches pattern, and 0 otherwise. The
	1600	* matching operation permits the following special characters in the
	1601	* pattern: *?\[] (see the manual entry for details on what these mean).
	1602	*
	1603	* Side effects:
	1604	* None.
	1605	*
	1606	*----------------------------------------------------------------------
	1607	*/
	1608
	1609	int
	1610	Tcl_UniCharCaseMatch(
	1611	CONST Tcl_UniChar uniStr, / Unicode String. */
	1612	CONST Tcl_UniChar *uniPattern,
	1613	/* Pattern, which may contain special
	1614	* characters. */
	1615	int nocase) /* 0 for case sensitive, 1 for insensitive */
	1616	{
	1617	Tcl_UniChar ch1, p;
	1618
	1619	while (1) {
	1620	p = *uniPattern;
	1621
	1622	/*
	1623	* See if we're at the end of both the pattern and the string. If so,
	1624	* we succeeded. If we're at the end of the pattern but not at the end
	1625	* of the string, we failed.
	1626	*/
	1627
	1628	if (p == 0) {
	1629	return (*uniStr == 0);
	1630	}
	1631	if ((uniStr == 0) && (p != '')) {
	1632	return 0;
	1633	}
	1634
	1635	/*
	1636	* Check for a "*" as the next pattern character. It matches any
	1637	* substring. We handle this by skipping all the characters up to the
	1638	* next matching one in the pattern, and then calling ourselves
	1639	* recursively for each postfix of string, until either we match or we
	1640	* reach the end of the string.
	1641	*/
	1642
	1643	if (p == '*') {
	1644	/*
	1645	* Skip all successive *'s in the pattern
	1646	*/
	1647
	1648	while ((++uniPattern) == '') {
	1649	/* empty body */
	1650	}
	1651	p = *uniPattern;
	1652	if (p == 0) {
	1653	return 1;
	1654	}
	1655	if (nocase) {
	1656	p = Tcl_UniCharToLower(p);
	1657	}
	1658	while (1) {
	1659	/*
	1660	* Optimization for matching - cruise through the string
	1661	* quickly if the next char in the pattern isn't a special
	1662	* character
	1663	*/
	1664
	1665	if ((p != '[') && (p != '?') && (p != '\\')) {
	1666	if (nocase) {
	1667	while (uniStr && (p != uniStr)
	1668	&& (p != Tcl_UniCharToLower(*uniStr))) {
	1669	uniStr++;
	1670	}
	1671	} else {
	1672	while (uniStr && (p != uniStr)) {
	1673	uniStr++;
	1674	}
	1675	}
	1676	}
	1677	if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) {
	1678	return 1;
	1679	}
	1680	if (*uniStr == 0) {
	1681	return 0;
	1682	}
	1683	uniStr++;
	1684	}
	1685	}
	1686
	1687	/*
	1688	* Check for a "?" as the next pattern character. It matches any
	1689	* single character.
	1690	*/
	1691
	1692	if (p == '?') {
	1693	uniPattern++;
	1694	uniStr++;
	1695	continue;
	1696	}
	1697
	1698	/*
	1699	* Check for a "[" as the next pattern character. It is followed by a
	1700	* list of characters that are acceptable, or by a range (two
	1701	* characters separated by "-").
	1702	*/
	1703
	1704	if (p == '[') {
	1705	Tcl_UniChar startChar, endChar;
	1706
	1707	uniPattern++;
	1708	ch1 = (nocase ? Tcl_UniCharToLower(uniStr) : uniStr);
	1709	uniStr++;
	1710	while (1) {
	1711	if ((uniPattern == ']') \|\| (uniPattern == 0)) {
	1712	return 0;
	1713	}
	1714	startChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
	1715	: *uniPattern);
	1716	uniPattern++;
	1717	if (*uniPattern == '-') {
	1718	uniPattern++;
	1719	if (*uniPattern == 0) {
	1720	return 0;
	1721	}
	1722	endChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
	1723	: *uniPattern);
	1724	uniPattern++;
	1725	if (((startChar <= ch1) && (ch1 <= endChar))
	1726	\|\| ((endChar <= ch1) && (ch1 <= startChar))) {
	1727	/*
	1728	* Matches ranges of form [a-z] or [z-a].
	1729	*/
	1730	break;
	1731	}
	1732	} else if (startChar == ch1) {
	1733	break;
	1734	}
	1735	}
	1736	while (*uniPattern != ']') {
	1737	if (*uniPattern == 0) {
	1738	uniPattern--;
	1739	break;
	1740	}
	1741	uniPattern++;
	1742	}
	1743	uniPattern++;
	1744	continue;
	1745	}
	1746
	1747	/*
	1748	* If the next pattern character is '\', just strip off the '\' so we
	1749	* do exact matching on the character that follows.
	1750	*/
	1751
	1752	if (p == '\\') {
	1753	if (*(++uniPattern) == '\0') {
	1754	return 0;
	1755	}
	1756	}
	1757
	1758	/*
	1759	* There's no special character. Just make sure that the next bytes of
	1760	* each string match.
	1761	*/
	1762
	1763	if (nocase) {
	1764	if (Tcl_UniCharToLower(*uniStr) !=
	1765	Tcl_UniCharToLower(*uniPattern)) {
	1766	return 0;
	1767	}
	1768	} else if (uniStr != uniPattern) {
	1769	return 0;
	1770	}
	1771	uniStr++;
	1772	uniPattern++;
	1773	}
	1774	}
	1775
	1776	/*
	1777	*----------------------------------------------------------------------
	1778	*
	1779	* TclUniCharMatch --
	1780	*
	1781	* See if a particular Unicode string matches a particular pattern.
	1782	* Allows case insensitivity. This is the Unicode equivalent of the char*
	1783	* Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted
	1784	* Strings, so embedded NULLs are allowed.
	1785	*
	1786	* Results:
	1787	* The return value is 1 if string matches pattern, and 0 otherwise. The
	1788	* matching operation permits the following special characters in the
	1789	* pattern: *?\[] (see the manual entry for details on what these mean).
	1790	*
	1791	* Side effects:
	1792	* None.
	1793	*
	1794	*----------------------------------------------------------------------
	1795	*/
	1796
	1797	int
	1798	TclUniCharMatch(
	1799	CONST Tcl_UniChar string, / Unicode String. */
	1800	int strLen, /* Length of String */
	1801	CONST Tcl_UniChar pattern, / Pattern, which may contain special
	1802	* characters. */
	1803	int ptnLen, /* Length of Pattern */
	1804	int nocase) /* 0 for case sensitive, 1 for insensitive */
	1805	{
	1806	CONST Tcl_UniChar stringEnd, patternEnd;
	1807	Tcl_UniChar p;
	1808
	1809	stringEnd = string + strLen;
	1810	patternEnd = pattern + ptnLen;
	1811
	1812	while (1) {
	1813	/*
	1814	* See if we're at the end of both the pattern and the string. If so,
	1815	* we succeeded. If we're at the end of the pattern but not at the end
	1816	* of the string, we failed.
	1817	*/
	1818
	1819	if (pattern == patternEnd) {
	1820	return (string == stringEnd);
	1821	}
	1822	p = *pattern;
	1823	if ((string == stringEnd) && (p != '*')) {
	1824	return 0;
	1825	}
	1826
	1827	/*
	1828	* Check for a "*" as the next pattern character. It matches any
	1829	* substring. We handle this by skipping all the characters up to the
	1830	* next matching one in the pattern, and then calling ourselves
	1831	* recursively for each postfix of string, until either we match or we
	1832	* reach the end of the string.
	1833	*/
	1834
	1835	if (p == '*') {
	1836	/*
	1837	* Skip all successive *'s in the pattern.
	1838	*/
	1839
	1840	while ((++pattern) == '') {
	1841	/* empty body */
	1842	}
	1843	if (pattern == patternEnd) {
	1844	return 1;
	1845	}
	1846	p = *pattern;
	1847	if (nocase) {
	1848	p = Tcl_UniCharToLower(p);
	1849	}
	1850	while (1) {
	1851	/*
	1852	* Optimization for matching - cruise through the string
	1853	* quickly if the next char in the pattern isn't a special
	1854	* character.
	1855	*/
	1856
	1857	if ((p != '[') && (p != '?') && (p != '\\')) {
	1858	if (nocase) {
	1859	while ((string < stringEnd) && (p != *string)
	1860	&& (p != Tcl_UniCharToLower(*string))) {
	1861	string++;
	1862	}
	1863	} else {
	1864	while ((string < stringEnd) && (p != *string)) {
	1865	string++;
	1866	}
	1867	}
	1868	}
	1869	if (TclUniCharMatch(string, stringEnd - string,
	1870	pattern, patternEnd - pattern, nocase)) {
	1871	return 1;
	1872	}
	1873	if (string == stringEnd) {
	1874	return 0;
	1875	}
	1876	string++;
	1877	}
	1878	}
	1879
	1880	/*
	1881	* Check for a "?" as the next pattern character. It matches any
	1882	* single character.
	1883	*/
	1884
	1885	if (p == '?') {
	1886	pattern++;
	1887	string++;
	1888	continue;
	1889	}
	1890
	1891	/*
	1892	* Check for a "[" as the next pattern character. It is followed by a
	1893	* list of characters that are acceptable, or by a range (two
	1894	* characters separated by "-").
	1895	*/
	1896
	1897	if (p == '[') {
	1898	Tcl_UniChar ch1, startChar, endChar;
	1899
	1900	pattern++;
	1901	ch1 = (nocase ? Tcl_UniCharToLower(string) : string);
	1902	string++;
	1903	while (1) {
	1904	if ((*pattern == ']') \|\| (pattern == patternEnd)) {
	1905	return 0;
	1906	}
	1907	startChar = (nocase ? Tcl_UniCharToLower(pattern) : pattern);
	1908	pattern++;
	1909	if (*pattern == '-') {
	1910	pattern++;
	1911	if (pattern == patternEnd) {
	1912	return 0;
	1913	}
	1914	endChar = (nocase ? Tcl_UniCharToLower(*pattern)
	1915	: *pattern);
	1916	pattern++;
	1917	if (((startChar <= ch1) && (ch1 <= endChar))
	1918	\|\| ((endChar <= ch1) && (ch1 <= startChar))) {
	1919	/*
	1920	* Matches ranges of form [a-z] or [z-a].
	1921	*/
	1922	break;
	1923	}
	1924	} else if (startChar == ch1) {
	1925	break;
	1926	}
	1927	}
	1928	while (*pattern != ']') {
	1929	if (pattern == patternEnd) {
	1930	pattern--;
	1931	break;
	1932	}
	1933	pattern++;
	1934	}
	1935	pattern++;
	1936	continue;
	1937	}
	1938
	1939	/*
	1940	* If the next pattern character is '\', just strip off the '\' so we
	1941	* do exact matching on the character that follows.
	1942	*/
	1943
	1944	if (p == '\\') {
	1945	if (++pattern == patternEnd) {
	1946	return 0;
	1947	}
	1948	}
	1949
	1950	/*
	1951	* There's no special character. Just make sure that the next bytes of
	1952	* each string match.
	1953	*/
	1954
	1955	if (nocase) {
	1956	if (Tcl_UniCharToLower(string) != Tcl_UniCharToLower(pattern)) {
	1957	return 0;
	1958	}
	1959	} else if (string != pattern) {
	1960	return 0;
	1961	}
	1962	string++;
	1963	pattern++;
	1964	}
	1965	}
	1966
	1967	/*
	1968	* Local Variables:
	1969	* mode: c
	1970	* c-basic-offset: 4
	1971	* fill-column: 78
	1972	* End:
	1973	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: