1 | '\" |
---|
2 | '\" Copyright (c) 1997 Sun Microsystems, Inc. |
---|
3 | '\" |
---|
4 | '\" See the file "license.terms" for information on usage and redistribution |
---|
5 | '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. |
---|
6 | '\" |
---|
7 | '\" RCS: @(#) $Id: Utf.3,v 1.25 2007/12/13 15:22:32 dgp Exp $ |
---|
8 | '\" |
---|
9 | .so man.macros |
---|
10 | .TH Utf 3 "8.1" Tcl "Tcl Library Procedures" |
---|
11 | .BS |
---|
12 | .SH NAME |
---|
13 | Tcl_UniChar, Tcl_UniCharCaseMatch, Tcl_UniCharNcasecmp, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings |
---|
14 | .SH SYNOPSIS |
---|
15 | .nf |
---|
16 | \fB#include <tcl.h>\fR |
---|
17 | .sp |
---|
18 | typedef ... Tcl_UniChar; |
---|
19 | .sp |
---|
20 | int |
---|
21 | \fBTcl_UniCharToUtf\fR(\fIch, buf\fR) |
---|
22 | .sp |
---|
23 | int |
---|
24 | \fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR) |
---|
25 | .sp |
---|
26 | char * |
---|
27 | \fBTcl_UniCharToUtfDString\fR(\fIuniStr, uniLength, dsPtr\fR) |
---|
28 | .sp |
---|
29 | Tcl_UniChar * |
---|
30 | \fBTcl_UtfToUniCharDString\fR(\fIsrc, length, dsPtr\fR) |
---|
31 | .sp |
---|
32 | int |
---|
33 | \fBTcl_UniCharLen\fR(\fIuniStr\fR) |
---|
34 | .sp |
---|
35 | int |
---|
36 | \fBTcl_UniCharNcmp\fR(\fIucs, uct, numChars\fR) |
---|
37 | .sp |
---|
38 | int |
---|
39 | \fBTcl_UniCharNcasecmp\fR(\fIucs, uct, numChars\fR) |
---|
40 | .sp |
---|
41 | int |
---|
42 | \fBTcl_UniCharCaseMatch\fR(\fIuniStr, uniPattern, nocase\fR) |
---|
43 | .sp |
---|
44 | int |
---|
45 | \fBTcl_UtfNcmp\fR(\fIcs, ct, numChars\fR) |
---|
46 | .sp |
---|
47 | int |
---|
48 | \fBTcl_UtfNcasecmp\fR(\fIcs, ct, numChars\fR) |
---|
49 | .sp |
---|
50 | int |
---|
51 | \fBTcl_UtfCharComplete\fR(\fIsrc, length\fR) |
---|
52 | .sp |
---|
53 | int |
---|
54 | \fBTcl_NumUtfChars\fR(\fIsrc, length\fR) |
---|
55 | .sp |
---|
56 | const char * |
---|
57 | \fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR) |
---|
58 | .sp |
---|
59 | const char * |
---|
60 | \fBTcl_UtfFindLast\fR(\fIsrc, ch\fR) |
---|
61 | .sp |
---|
62 | const char * |
---|
63 | \fBTcl_UtfNext\fR(\fIsrc\fR) |
---|
64 | .sp |
---|
65 | const char * |
---|
66 | \fBTcl_UtfPrev\fR(\fIsrc, start\fR) |
---|
67 | .sp |
---|
68 | Tcl_UniChar |
---|
69 | \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR) |
---|
70 | .sp |
---|
71 | const char * |
---|
72 | \fBTcl_UtfAtIndex\fR(\fIsrc, index\fR) |
---|
73 | .sp |
---|
74 | int |
---|
75 | \fBTcl_UtfBackslash\fR(\fIsrc, readPtr, dst\fR) |
---|
76 | .SH ARGUMENTS |
---|
77 | .AS "const Tcl_UniChar" *uniPattern in/out |
---|
78 | .AP char *buf out |
---|
79 | Buffer in which the UTF-8 representation of the Tcl_UniChar is stored. At most |
---|
80 | \fBTCL_UTF_MAX\fR bytes are stored in the buffer. |
---|
81 | .AP int ch in |
---|
82 | The Tcl_UniChar to be converted or examined. |
---|
83 | .AP Tcl_UniChar *chPtr out |
---|
84 | Filled with the Tcl_UniChar represented by the head of the UTF-8 string. |
---|
85 | .AP "const char" *src in |
---|
86 | Pointer to a UTF-8 string. |
---|
87 | .AP "const char" *cs in |
---|
88 | Pointer to a UTF-8 string. |
---|
89 | .AP "const char" *ct in |
---|
90 | Pointer to a UTF-8 string. |
---|
91 | .AP "const Tcl_UniChar" *uniStr in |
---|
92 | A null-terminated Unicode string. |
---|
93 | .AP "const Tcl_UniChar" *ucs in |
---|
94 | A null-terminated Unicode string. |
---|
95 | .AP "const Tcl_UniChar" *uct in |
---|
96 | A null-terminated Unicode string. |
---|
97 | .AP "const Tcl_UniChar" *uniPattern in |
---|
98 | A null-terminated Unicode string. |
---|
99 | .AP int length in |
---|
100 | The length of the UTF-8 string in bytes (not UTF-8 characters). If |
---|
101 | negative, all bytes up to the first null byte are used. |
---|
102 | .AP int uniLength in |
---|
103 | The length of the Unicode string in characters. Must be greater than or |
---|
104 | equal to 0. |
---|
105 | .AP "Tcl_DString" *dsPtr in/out |
---|
106 | A pointer to a previously initialized \fBTcl_DString\fR. |
---|
107 | .AP "unsigned long" numChars in |
---|
108 | The number of characters to compare. |
---|
109 | .AP "const char" *start in |
---|
110 | Pointer to the beginning of a UTF-8 string. |
---|
111 | .AP int index in |
---|
112 | The index of a character (not byte) in the UTF-8 string. |
---|
113 | .AP int *readPtr out |
---|
114 | If non-NULL, filled with the number of bytes in the backslash sequence, |
---|
115 | including the backslash character. |
---|
116 | .AP char *dst out |
---|
117 | Buffer in which the bytes represented by the backslash sequence are stored. |
---|
118 | At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer. |
---|
119 | .AP int nocase in |
---|
120 | Specifies whether the match should be done case-sensitive (0) or |
---|
121 | case-insensitive (1). |
---|
122 | .BE |
---|
123 | |
---|
124 | .SH DESCRIPTION |
---|
125 | .PP |
---|
126 | These routines convert between UTF-8 strings and Tcl_UniChars. A |
---|
127 | Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size |
---|
128 | quantity. A UTF-8 character is a Unicode character represented as |
---|
129 | a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes. A multibyte UTF-8 |
---|
130 | sequence consists of a lead byte followed by some number of trail bytes. |
---|
131 | .PP |
---|
132 | \fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to |
---|
133 | represent one Unicode character in the UTF-8 representation. |
---|
134 | .PP |
---|
135 | \fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string |
---|
136 | starting at \fIbuf\fR. The return value is the number of bytes stored |
---|
137 | in \fIbuf\fR. |
---|
138 | .PP |
---|
139 | \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR |
---|
140 | and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the |
---|
141 | number of bytes read from \fIsrc\fR. The caller must ensure that the |
---|
142 | source buffer is long enough such that this routine does not run off the |
---|
143 | end and dereference non-existent or random memory; if the source buffer |
---|
144 | is known to be null-terminated, this will not happen. If the input is |
---|
145 | not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first |
---|
146 | byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and |
---|
147 | 0x00ff and return 1. |
---|
148 | .PP |
---|
149 | \fBTcl_UniCharToUtfDString\fR converts the given Unicode string |
---|
150 | to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR. |
---|
151 | You must specify \fIuniLength\fR, the length of the given Unicode string. |
---|
152 | The return value is a pointer to the UTF-8 representation of the |
---|
153 | Unicode string. Storage for the return value is appended to the |
---|
154 | end of the \fBTcl_DString\fR. |
---|
155 | .PP |
---|
156 | \fBTcl_UtfToUniCharDString\fR converts the given UTF-8 string to Unicode, |
---|
157 | storing the result in the previously initialized \fBTcl_DString\fR. |
---|
158 | In the argument \fIlength\fR, you may either specify the length of |
---|
159 | the given UTF-8 string in bytes or |
---|
160 | .QW \-1 , |
---|
161 | in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to |
---|
162 | calculate the length. The return value is a pointer to the Unicode |
---|
163 | representation of the UTF-8 string. Storage for the return value |
---|
164 | is appended to the end of the \fBTcl_DString\fR. The Unicode string |
---|
165 | is terminated with a Unicode null character. |
---|
166 | .PP |
---|
167 | \fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode |
---|
168 | characters. It accepts a null-terminated Unicode string and returns |
---|
169 | the number of Unicode characters (not bytes) in that string. |
---|
170 | .PP |
---|
171 | \fBTcl_UniCharNcmp\fR and \fBTcl_UniCharNcasecmp\fR correspond to |
---|
172 | \fBstrncmp\fR and \fBstrncasecmp\fR, respectively, for Unicode characters. |
---|
173 | They accept two null-terminated Unicode strings and the number of characters |
---|
174 | to compare. Both strings are assumed to be at least \fInumChars\fR characters |
---|
175 | long. \fBTcl_UniCharNcmp\fR compares the two strings character-by-character |
---|
176 | according to the Unicode character ordering. It returns an integer greater |
---|
177 | than, equal to, or less than 0 if the first string is greater than, equal |
---|
178 | to, or less than the second string respectively. \fBTcl_UniCharNcasecmp\fR |
---|
179 | is the case-insensitive version of \fBTcl_UniCharNcmp\fR. |
---|
180 | .PP |
---|
181 | \fBTcl_UniCharCaseMatch\fR is the Unicode equivalent to |
---|
182 | \fBTcl_StringCaseMatch\fR. It accepts a null-terminated Unicode string, |
---|
183 | a Unicode pattern, and a boolean value specifying whether the match should |
---|
184 | be case-insensitive, and returns whether the string matches the pattern. |
---|
185 | .PP |
---|
186 | \fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR for UTF-8 strings. It |
---|
187 | accepts two null-terminated UTF-8 strings and the number of characters |
---|
188 | to compare. (Both strings are assumed to be at least \fInumChars\fR |
---|
189 | characters long.) \fBTcl_UtfNcmp\fR compares the two strings |
---|
190 | character-by-character according to the Unicode character ordering. |
---|
191 | It returns an integer greater than, equal to, or less than 0 if the |
---|
192 | first string is greater than, equal to, or less than the second string |
---|
193 | respectively. |
---|
194 | .PP |
---|
195 | \fBTcl_UtfNcasecmp\fR corresponds to \fBstrncasecmp\fR for UTF-8 |
---|
196 | strings. It is similar to \fBTcl_UtfNcmp\fR except comparisons ignore |
---|
197 | differences in case when comparing upper, lower or title case |
---|
198 | characters. |
---|
199 | .PP |
---|
200 | \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR |
---|
201 | of \fIlength\fR bytes is long enough to be decoded by |
---|
202 | \fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee |
---|
203 | that the UTF-8 string is properly formed. This routine is used by |
---|
204 | procedures that are operating on a byte at a time and need to know if a |
---|
205 | full Tcl_UniChar has been seen. |
---|
206 | .PP |
---|
207 | \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It |
---|
208 | returns the number of Tcl_UniChars that are represented by the UTF-8 string |
---|
209 | \fIsrc\fR. The length of the source string is \fIlength\fR bytes. If the |
---|
210 | length is negative, all bytes up to the first null byte are used. |
---|
211 | .PP |
---|
212 | \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It |
---|
213 | returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR |
---|
214 | in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is |
---|
215 | considered part of the UTF-8 string. |
---|
216 | .PP |
---|
217 | \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It |
---|
218 | returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR |
---|
219 | in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is |
---|
220 | considered part of the UTF-8 string. |
---|
221 | .PP |
---|
222 | Given \fIsrc\fR, a pointer to some location in a UTF-8 string, |
---|
223 | \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the |
---|
224 | string. The caller must not ask for the next character after the last |
---|
225 | character in the string if the string is not terminated by a null |
---|
226 | character. |
---|
227 | .PP |
---|
228 | Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a |
---|
229 | null byte immediately following such a string), \fBTcl_UtfPrev\fR |
---|
230 | returns a pointer to the closest preceding byte that starts a UTF-8 |
---|
231 | character. |
---|
232 | This function will not back up to a position before \fIstart\fR, |
---|
233 | the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the |
---|
234 | return value will be \fIstart\fR. |
---|
235 | .PP |
---|
236 | \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the |
---|
237 | Pascal Ord() function. It returns the Tcl_UniChar represented at the |
---|
238 | specified character (not byte) \fIindex\fR in the UTF-8 string |
---|
239 | \fIsrc\fR. The source string must contain at least \fIindex\fR |
---|
240 | characters. Behavior is undefined if a negative \fIindex\fR is given. |
---|
241 | .PP |
---|
242 | \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not |
---|
243 | byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must |
---|
244 | contain at least \fIindex\fR characters. This is equivalent to calling |
---|
245 | \fBTcl_UtfNext\fR \fIindex\fR times. If a negative \fIindex\fR is given, |
---|
246 | the return pointer points to the first character in the source string. |
---|
247 | .PP |
---|
248 | \fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl |
---|
249 | commands. It parses a backslash sequence and stores the properly formed |
---|
250 | UTF-8 character represented by the backslash sequence in the output |
---|
251 | buffer \fIdst\fR. At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer. |
---|
252 | If \fIreadPtr\fR is non-NULL, \fBTcl_UtfBackslash\fR sets \fI*readPtr\fR to the number |
---|
253 | of bytes in the backslash sequence, including the backslash character. |
---|
254 | The return value is the number of bytes stored in the output buffer. |
---|
255 | .PP |
---|
256 | See the \fBTcl\fR manual entry for information on the valid backslash |
---|
257 | sequences. All of the sequences described in the Tcl manual entry are |
---|
258 | supported by \fBTcl_UtfBackslash\fR. |
---|
259 | |
---|
260 | .SH KEYWORDS |
---|
261 | utf, unicode, backslash |
---|