Context Navigation

Utf.3 @ 43

Last change on this file since 43 was 25, checked in by landauf, 16 years ago
added tcl to libs
File size: 10.7 KB

Line
1	'\"
2	'\" Copyright (c) 1997 Sun Microsystems, Inc.
3	'\"
4	'\" See the file "license.terms" for information on usage and redistribution
5	'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
6	'\"
7	'\" RCS: @(#) $Id: Utf.3,v 1.25 2007/12/13 15:22:32 dgp Exp $
8	'\"
9	.so man.macros
10	.TH Utf 3 "8.1" Tcl "Tcl Library Procedures"
11	.BS
12	.SH NAME
13	Tcl_UniChar, Tcl_UniCharCaseMatch, Tcl_UniCharNcasecmp, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings
14	.SH SYNOPSIS
15	.nf
16	\fB#include <tcl.h>\fR
17	.sp
18	typedef ... Tcl_UniChar;
19	.sp
20	int
21	\fBTcl_UniCharToUtf\fR(\fIch, buf\fR)
22	.sp
23	int
24	\fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR)
25	.sp
26	char *
27	\fBTcl_UniCharToUtfDString\fR(\fIuniStr, uniLength, dsPtr\fR)
28	.sp
29	Tcl_UniChar *
30	\fBTcl_UtfToUniCharDString\fR(\fIsrc, length, dsPtr\fR)
31	.sp
32	int
33	\fBTcl_UniCharLen\fR(\fIuniStr\fR)
34	.sp
35	int
36	\fBTcl_UniCharNcmp\fR(\fIucs, uct, numChars\fR)
37	.sp
38	int
39	\fBTcl_UniCharNcasecmp\fR(\fIucs, uct, numChars\fR)
40	.sp
41	int
42	\fBTcl_UniCharCaseMatch\fR(\fIuniStr, uniPattern, nocase\fR)
43	.sp
44	int
45	\fBTcl_UtfNcmp\fR(\fIcs, ct, numChars\fR)
46	.sp
47	int
48	\fBTcl_UtfNcasecmp\fR(\fIcs, ct, numChars\fR)
49	.sp
50	int
51	\fBTcl_UtfCharComplete\fR(\fIsrc, length\fR)
52	.sp
53	int
54	\fBTcl_NumUtfChars\fR(\fIsrc, length\fR)
55	.sp
56	const char *
57	\fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR)
58	.sp
59	const char *
60	\fBTcl_UtfFindLast\fR(\fIsrc, ch\fR)
61	.sp
62	const char *
63	\fBTcl_UtfNext\fR(\fIsrc\fR)
64	.sp
65	const char *
66	\fBTcl_UtfPrev\fR(\fIsrc, start\fR)
67	.sp
68	Tcl_UniChar
69	\fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR)
70	.sp
71	const char *
72	\fBTcl_UtfAtIndex\fR(\fIsrc, index\fR)
73	.sp
74	int
75	\fBTcl_UtfBackslash\fR(\fIsrc, readPtr, dst\fR)
76	.SH ARGUMENTS
77	.AS "const Tcl_UniChar" *uniPattern in/out
78	.AP char *buf out
79	Buffer in which the UTF-8 representation of the Tcl_UniChar is stored. At most
80	\fBTCL_UTF_MAX\fR bytes are stored in the buffer.
81	.AP int ch in
82	The Tcl_UniChar to be converted or examined.
83	.AP Tcl_UniChar *chPtr out
84	Filled with the Tcl_UniChar represented by the head of the UTF-8 string.
85	.AP "const char" *src in
86	Pointer to a UTF-8 string.
87	.AP "const char" *cs in
88	Pointer to a UTF-8 string.
89	.AP "const char" *ct in
90	Pointer to a UTF-8 string.
91	.AP "const Tcl_UniChar" *uniStr in
92	A null-terminated Unicode string.
93	.AP "const Tcl_UniChar" *ucs in
94	A null-terminated Unicode string.
95	.AP "const Tcl_UniChar" *uct in
96	A null-terminated Unicode string.
97	.AP "const Tcl_UniChar" *uniPattern in
98	A null-terminated Unicode string.
99	.AP int length in
100	The length of the UTF-8 string in bytes (not UTF-8 characters). If
101	negative, all bytes up to the first null byte are used.
102	.AP int uniLength in
103	The length of the Unicode string in characters. Must be greater than or
104	equal to 0.
105	.AP "Tcl_DString" *dsPtr in/out
106	A pointer to a previously initialized \fBTcl_DString\fR.
107	.AP "unsigned long" numChars in
108	The number of characters to compare.
109	.AP "const char" *start in
110	Pointer to the beginning of a UTF-8 string.
111	.AP int index in
112	The index of a character (not byte) in the UTF-8 string.
113	.AP int *readPtr out
114	If non-NULL, filled with the number of bytes in the backslash sequence,
115	including the backslash character.
116	.AP char *dst out
117	Buffer in which the bytes represented by the backslash sequence are stored.
118	At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
119	.AP int nocase in
120	Specifies whether the match should be case-sensitive (0) or
121	case-insensitive (1).
122	.BE
123
124	.SH DESCRIPTION
125	.PP
126	These routines convert between UTF-8 strings and Tcl_UniChars. A
127	Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size
128	quantity. A UTF-8 character is a Unicode character represented as
129	a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes. A multibyte UTF-8
130	sequence consists of a lead byte followed by some number of trail bytes.
131	.PP
132	\fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to
133	represent one Unicode character in the UTF-8 representation.
134	.PP
135	\fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string
136	in the buffer starting at \fIbuf\fR. The return value is the number of
137	bytes stored in \fIbuf\fR.
138	.PP
139	\fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
140	and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the
141	number of bytes read from \fIsrc\fR. The caller must ensure that the
142	source buffer is long enough such that this routine does not run off the
143	end and dereference non-existent or random memory; if the source buffer
144	is known to be null-terminated, this will not happen. If the input is
145	not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
146	byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and
147	0x00ff and return 1.
148	.PP
149	\fBTcl_UniCharToUtfDString\fR converts the given Unicode string
150	to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR.
151	You must specify \fIuniLength\fR, the length of the given Unicode string.
152	The return value is a pointer to the UTF-8 representation of the
153	Unicode string. Storage for the return value is appended to the
154	end of the \fBTcl_DString\fR.
155	.PP
156	\fBTcl_UtfToUniCharDString\fR converts the given UTF-8 string to Unicode,
157	storing the result in the previously initialized \fBTcl_DString\fR.
158	In the argument \fIlength\fR, you may either specify the length of
159	the given UTF-8 string in bytes or
160	.QW \-1 ,
161	in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to
162	calculate the length. The return value is a pointer to the Unicode
163	representation of the UTF-8 string. Storage for the return value
164	is appended to the end of the \fBTcl_DString\fR. The Unicode string
165	is terminated with a Unicode null character.
166	.PP
167	\fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode
168	characters. It accepts a null-terminated Unicode string and returns
169	the number of Unicode characters (not bytes) in that string.
170	.PP
171	\fBTcl_UniCharNcmp\fR and \fBTcl_UniCharNcasecmp\fR correspond to
172	\fBstrncmp\fR and \fBstrncasecmp\fR, respectively, for Unicode characters.
173	They accept two null-terminated Unicode strings and the number of characters
174	to compare. Both strings are assumed to be at least \fInumChars\fR characters
175	long. \fBTcl_UniCharNcmp\fR compares the two strings character-by-character
176	according to the Unicode character ordering. It returns an integer greater
177	than, equal to, or less than 0 if the first string is greater than, equal
178	to, or less than the second string respectively. \fBTcl_UniCharNcasecmp\fR
179	is the Unicode case insensitive version.
180	.PP
181	\fBTcl_UniCharCaseMatch\fR is the Unicode equivalent to
182	\fBTcl_StringCaseMatch\fR. It accepts a null-terminated Unicode string,
183	a Unicode pattern, and a boolean value \fInocase\fR specifying whether the
184	match should be case-insensitive, and returns whether the string matches the pattern.
185	.PP
186	\fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR for UTF-8 strings. It
187	accepts two null-terminated UTF-8 strings and the number of characters
188	to compare. (Both strings are assumed to be at least \fInumChars\fR
189	characters long.) \fBTcl_UtfNcmp\fR compares the two strings
190	character-by-character according to the Unicode character ordering.
191	It returns an integer greater than, equal to, or less than 0 if the
192	first string is greater than, equal to, or less than the second string
193	respectively.
194	.PP
195	\fBTcl_UtfNcasecmp\fR corresponds to \fBstrncasecmp\fR for UTF-8
196	strings. It is similar to \fBTcl_UtfNcmp\fR except comparisons ignore
197	differences in case when comparing upper, lower or title case
198	characters.
199	.PP
200	\fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR
201	of \fIlength\fR bytes is long enough to be decoded by
202	\fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee
203	that the UTF-8 string is properly formed. This routine is used by
204	procedures that are operating on a byte at a time and need to know if a
205	full Tcl_UniChar has been seen.
206	.PP
207	\fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It
208	returns the number of Tcl_UniChars that are represented by the UTF-8 string
209	\fIsrc\fR. The length of the source string is \fIlength\fR bytes. If the
210	length is negative, all bytes up to the first null byte are used.
211	.PP
212	\fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It
213	returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR
214	in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is
215	considered part of the UTF-8 string.
216	.PP
217	\fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It
218	returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR
219	in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is
220	considered part of the UTF-8 string.
221	.PP
222	Given \fIsrc\fR, a pointer to some location in a UTF-8 string,
223	\fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the
224	string. The caller must not ask for the next character after the last
225	character in the string if the string is not terminated by a null
226	character.
227	.PP
228	Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a
229	null byte immediately following such a string), \fBTcl_UtfPrev\fR
230	returns a pointer to the closest preceding byte that starts a UTF-8
231	character.
232	This function will not back up to a position before \fIstart\fR,
233	the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the
234	return value will be \fIstart\fR.
235	.PP
236	\fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the
237	Pascal Ord() function. It returns the Tcl_UniChar represented at the
238	specified character (not byte) \fIindex\fR in the UTF-8 string
239	\fIsrc\fR. The source string must contain at least \fIindex\fR
240	characters. Behavior is undefined if a negative \fIindex\fR is given.
241	.PP
242	\fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not
243	byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must
244	contain at least \fIindex\fR characters. This is equivalent to calling
245	\fBTcl_UtfNext\fR \fIindex\fR times. If a negative \fIindex\fR is given,
246	the return pointer points to the first character in the source string.
247	.PP
248	\fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl
249	commands. It parses a backslash sequence and stores the properly formed
250	UTF-8 character represented by the backslash sequence in the output
251	buffer \fIdst\fR. At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
252	If \fIreadPtr\fR is non-NULL, \fBTcl_UtfBackslash\fR stores in \fI*readPtr\fR the number
253	of bytes in the backslash sequence, including the backslash character.
254	The return value is the number of bytes stored in the output buffer.
255	.PP
256	See the \fBTcl\fR manual entry for information on the valid backslash
257	sequences. All of the sequences described in the Tcl manual entry are
258	supported by \fBTcl_UtfBackslash\fR.
259
260	.SH KEYWORDS
261	utf, unicode, backslash

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format