Planet
navi: home | PPS | about | screenshots | download | development | forum

source: downloads/tcl8.5.2/doc/Utf.3 @ 25

Last change on this file since 25 was 25, checked in by landauf, 16 years ago

added tcl to libs

File size: 10.7 KB
Line 
1'\"
2'\" Copyright (c) 1997 Sun Microsystems, Inc.
3'\"
4'\" See the file "license.terms" for information on usage and redistribution
5'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
6'\"
7'\" RCS: @(#) $Id: Utf.3,v 1.25 2007/12/13 15:22:32 dgp Exp $
8'\"
9.so man.macros
10.TH Utf 3 "8.1" Tcl "Tcl Library Procedures"
11.BS
12.SH NAME
13Tcl_UniChar, Tcl_UniCharCaseMatch, Tcl_UniCharNcasecmp, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings
14.SH SYNOPSIS
15.nf
16\fB#include <tcl.h>\fR
17.sp
18typedef ... Tcl_UniChar;
19.sp
20int
21\fBTcl_UniCharToUtf\fR(\fIch, buf\fR)
22.sp
23int
24\fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR)
25.sp
26char *
27\fBTcl_UniCharToUtfDString\fR(\fIuniStr, uniLength, dsPtr\fR)
28.sp
29Tcl_UniChar *
30\fBTcl_UtfToUniCharDString\fR(\fIsrc, length, dsPtr\fR)
31.sp
32int
33\fBTcl_UniCharLen\fR(\fIuniStr\fR)
34.sp
35int
36\fBTcl_UniCharNcmp\fR(\fIucs, uct, numChars\fR)
37.sp
38int
39\fBTcl_UniCharNcasecmp\fR(\fIucs, uct, numChars\fR)
40.sp
41int
42\fBTcl_UniCharCaseMatch\fR(\fIuniStr, uniPattern, nocase\fR)
43.sp
44int
45\fBTcl_UtfNcmp\fR(\fIcs, ct, numChars\fR)
46.sp
47int
48\fBTcl_UtfNcasecmp\fR(\fIcs, ct, numChars\fR)
49.sp
50int
51\fBTcl_UtfCharComplete\fR(\fIsrc, length\fR)
52.sp
53int
54\fBTcl_NumUtfChars\fR(\fIsrc, length\fR)
55.sp
56const char *
57\fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR)
58.sp
59const char *
60\fBTcl_UtfFindLast\fR(\fIsrc, ch\fR)
61.sp
62const char *
63\fBTcl_UtfNext\fR(\fIsrc\fR)
64.sp
65const char *
66\fBTcl_UtfPrev\fR(\fIsrc, start\fR)
67.sp
68Tcl_UniChar
69\fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR)
70.sp
71const char *
72\fBTcl_UtfAtIndex\fR(\fIsrc, index\fR)
73.sp
74int
75\fBTcl_UtfBackslash\fR(\fIsrc, readPtr, dst\fR)
76.SH ARGUMENTS
77.AS "const Tcl_UniChar" *uniPattern in/out
78.AP char *buf out
79Buffer in which the UTF-8 representation of the Tcl_UniChar is stored.  At most
80\fBTCL_UTF_MAX\fR bytes are stored in the buffer.
81.AP int ch in
82The Tcl_UniChar to be converted or examined.
83.AP Tcl_UniChar *chPtr out
84Filled with the Tcl_UniChar represented by the head of the UTF-8 string.
85.AP "const char" *src in
86Pointer to a UTF-8 string.
87.AP "const char" *cs in
88Pointer to a UTF-8 string.
89.AP "const char" *ct in
90Pointer to a UTF-8 string.
91.AP "const Tcl_UniChar" *uniStr in
92A null-terminated Unicode string.
93.AP "const Tcl_UniChar" *ucs in
94A null-terminated Unicode string.
95.AP "const Tcl_UniChar" *uct in
96A null-terminated Unicode string.
97.AP "const Tcl_UniChar" *uniPattern in
98A null-terminated Unicode string.
99.AP int length in
100The length of the UTF-8 string in bytes (not UTF-8 characters).  If
101negative, all bytes up to the first null byte are used.
102.AP int uniLength in
103The length of the Unicode string in characters.  Must be greater than or
104equal to 0.
105.AP "Tcl_DString" *dsPtr in/out
106A pointer to a previously initialized \fBTcl_DString\fR.
107.AP "unsigned long" numChars in
108The number of characters to compare.
109.AP "const char" *start in
110Pointer to the beginning of a UTF-8 string.
111.AP int index in
112The index of a character (not byte) in the UTF-8 string.
113.AP int *readPtr out
114If non-NULL, filled with the number of bytes in the backslash sequence,
115including the backslash character.
116.AP char *dst out
117Buffer in which the bytes represented by the backslash sequence are stored.
118At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
119.AP int nocase in
120Specifies whether the match should be done case-sensitive (0) or
121case-insensitive (1).
122.BE
123
124.SH DESCRIPTION
125.PP
126These routines convert between UTF-8 strings and Tcl_UniChars.  A
127Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size
128quantity.  A UTF-8 character is a Unicode character represented as
129a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes.  A multibyte UTF-8
130sequence consists of a lead byte followed by some number of trail bytes.
131.PP
132\fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to
133represent one Unicode character in the UTF-8 representation.
134.PP
135\fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string
136in the buffer starting at \fIbuf\fR.  The return value is the number of bytes
137stored in \fIbuf\fR.
138.PP
139\fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR
140and stores it as a Tcl_UniChar in \fI*chPtr\fR.  The return value is the
141number of bytes read from \fIsrc\fR.  The caller must ensure that the
142source buffer is long enough such that this routine does not run off the
143end and dereference non-existent or random memory; if the source buffer
144is known to be null-terminated, this will not happen.  If the input is
145not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
146byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and
1470x00ff and return 1. 
148.PP
149\fBTcl_UniCharToUtfDString\fR converts the given Unicode string
150to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR.
151You must specify \fIuniLength\fR, the length of the given Unicode string.
152The return value is a pointer to the UTF-8 representation of the
153Unicode string.  Storage for the return value is appended to the
154end of the \fBTcl_DString\fR.
155.PP
156\fBTcl_UtfToUniCharDString\fR converts the given UTF-8 string to Unicode,
157storing the result in the previously initialized \fBTcl_DString\fR.
158In the argument \fIlength\fR, you may either specify the length of
159the given UTF-8 string in bytes or
160.QW \-1 ,
161in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to
162calculate the length.  The return value is a pointer to the Unicode
163representation of the UTF-8 string.  Storage for the return value
164is appended to the end of the \fBTcl_DString\fR.  The Unicode string
165is terminated with a Unicode null character.
166.PP
167\fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode
168characters.  It accepts a null-terminated Unicode string and returns
169the number of Unicode characters (not bytes) in that string.
170.PP
171\fBTcl_UniCharNcmp\fR and \fBTcl_UniCharNcasecmp\fR correspond to
172\fBstrncmp\fR and \fBstrncasecmp\fR, respectively, for Unicode characters.
173They accept two null-terminated Unicode strings and the number of characters
174to compare.  Both strings are assumed to be at least \fInumChars\fR characters
175long.  \fBTcl_UniCharNcmp\fR compares the two strings character-by-character
176according to the Unicode character ordering.  It returns an integer greater
177than, equal to, or less than 0 if the first string is greater than, equal
178to, or less than the second string respectively.  \fBTcl_UniCharNcasecmp\fR
179is the Unicode case insensitive version.
180.PP
181\fBTcl_UniCharCaseMatch\fR is the Unicode equivalent to
182\fBTcl_StringCaseMatch\fR.  It accepts a null-terminated Unicode string,
183a Unicode pattern, and a boolean value \fInocase\fR specifying whether the match
184should be case insensitive, and returns whether the string matches the pattern.
185.PP
186\fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR for UTF-8 strings. It
187accepts two null-terminated UTF-8 strings and the number of characters
188to compare.  (Both strings are assumed to be at least \fInumChars\fR
189characters long.)  \fBTcl_UtfNcmp\fR compares the two strings
190character-by-character according to the Unicode character ordering.
191It returns an integer greater than, equal to, or less than 0 if the
192first string is greater than, equal to, or less than the second string
193respectively.
194.PP
195\fBTcl_UtfNcasecmp\fR corresponds to \fBstrncasecmp\fR for UTF-8
196strings.  It is similar to \fBTcl_UtfNcmp\fR except comparisons ignore
197differences in case when comparing upper, lower or title case
198characters.
199.PP
200\fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR
201of \fIlength\fR bytes is long enough to be decoded by
202\fBTcl_UtfToUniChar\fR, or 0 otherwise.  This function does not guarantee
203that the UTF-8 string is properly formed.  This routine is used by
204procedures that are operating on a byte at a time and need to know if a
205full Tcl_UniChar has been seen.
206.PP
207\fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings.  It
208returns the number of Tcl_UniChars that are represented by the UTF-8 string
209\fIsrc\fR.  The length of the source string is \fIlength\fR bytes.  If the
210length is negative, all bytes up to the first null byte are used.
211.PP
212\fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings.  It
213returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR
214in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is
215considered part of the UTF-8 string. 
216.PP
217\fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings.  It
218returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR
219in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is
220considered part of the UTF-8 string. 
221.PP
222Given \fIsrc\fR, a pointer to some location in a UTF-8 string,
223\fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the
224string.  The caller must not ask for the next character after the last
225character in the string if the string is not terminated by a null
226character.
227.PP
228Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a
229null byte immediately following such a string), \fBTcl_UtfPrev\fR
230returns a pointer to the closest preceding byte that starts a UTF-8
231character.
232This function will not back up to a position before \fIstart\fR,
233the start of the UTF-8 string.  If \fIsrc\fR was already at \fIstart\fR, the
234return value will be \fIstart\fR.
235.PP
236\fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the
237Pascal Ord() function.  It returns the Tcl_UniChar represented at the
238specified character (not byte) \fIindex\fR in the UTF-8 string
239\fIsrc\fR.  The source string must contain at least \fIindex\fR
240characters.  Behavior is undefined if a negative \fIindex\fR is given.
241.PP
242\fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not
243byte) \fIindex\fR in the UTF-8 string \fIsrc\fR.  The source string must
244contain at least \fIindex\fR characters.  This is equivalent to calling
245\fBTcl_UtfNext\fR \fIindex\fR times.  If a negative \fIindex\fR is given,
246the return pointer points to the first character in the source string.
247.PP
248\fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl
249commands.  It parses a backslash sequence and stores the properly formed
250UTF-8 character represented by the backslash sequence in the output
251buffer \fIdst\fR.  At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
252If \fIreadPtr\fR is non-NULL, \fBTcl_UtfBackslash\fR sets \fI*readPtr\fR to the number
253of bytes in the backslash sequence, including the backslash character.
254The return value is the number of bytes stored in the output buffer.
255.PP
256See the \fBTcl\fR manual entry for information on the valid backslash
257sequences.  All of the sequences described in the Tcl manual entry are
258supported by \fBTcl_UtfBackslash\fR.
259
260.SH KEYWORDS
261utf, unicode, backslash
Note: See TracBrowser for help on using the repository browser.