| [25] | 1 | '\" | 
|---|
 | 2 | '\" Copyright (c) 1997 Sun Microsystems, Inc. | 
|---|
 | 3 | '\" | 
|---|
 | 4 | '\" See the file "license.terms" for information on usage and redistribution | 
|---|
 | 5 | '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. | 
|---|
 | 6 | '\"  | 
|---|
 | 7 | '\" RCS: @(#) $Id: Utf.3,v 1.25 2007/12/13 15:22:32 dgp Exp $ | 
|---|
 | 8 | '\"  | 
|---|
 | 9 | .so man.macros | 
|---|
 | 10 | .TH Utf 3 "8.1" Tcl "Tcl Library Procedures" | 
|---|
 | 11 | .BS | 
|---|
 | 12 | .SH NAME | 
|---|
 | 13 | Tcl_UniChar, Tcl_UniCharCaseMatch, Tcl_UniCharNcasecmp, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings | 
|---|
 | 14 | .SH SYNOPSIS | 
|---|
 | 15 | .nf | 
|---|
 | 16 | \fB#include <tcl.h>\fR | 
|---|
 | 17 | .sp | 
|---|
 | 18 | typedef ... Tcl_UniChar; | 
|---|
 | 19 | .sp | 
|---|
 | 20 | int | 
|---|
 | 21 | \fBTcl_UniCharToUtf\fR(\fIch, buf\fR) | 
|---|
 | 22 | .sp | 
|---|
 | 23 | int | 
|---|
 | 24 | \fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR) | 
|---|
 | 25 | .sp | 
|---|
 | 26 | char * | 
|---|
 | 27 | \fBTcl_UniCharToUtfDString\fR(\fIuniStr, uniLength, dsPtr\fR) | 
|---|
 | 28 | .sp | 
|---|
 | 29 | Tcl_UniChar * | 
|---|
 | 30 | \fBTcl_UtfToUniCharDString\fR(\fIsrc, length, dsPtr\fR) | 
|---|
 | 31 | .sp | 
|---|
 | 32 | int | 
|---|
 | 33 | \fBTcl_UniCharLen\fR(\fIuniStr\fR) | 
|---|
 | 34 | .sp | 
|---|
 | 35 | int | 
|---|
 | 36 | \fBTcl_UniCharNcmp\fR(\fIucs, uct, numChars\fR) | 
|---|
 | 37 | .sp | 
|---|
 | 38 | int | 
|---|
 | 39 | \fBTcl_UniCharNcasecmp\fR(\fIucs, uct, numChars\fR) | 
|---|
 | 40 | .sp | 
|---|
 | 41 | int | 
|---|
 | 42 | \fBTcl_UniCharCaseMatch\fR(\fIuniStr, uniPattern, nocase\fR) | 
|---|
 | 43 | .sp | 
|---|
 | 44 | int | 
|---|
 | 45 | \fBTcl_UtfNcmp\fR(\fIcs, ct, numChars\fR) | 
|---|
 | 46 | .sp | 
|---|
 | 47 | int | 
|---|
 | 48 | \fBTcl_UtfNcasecmp\fR(\fIcs, ct, numChars\fR) | 
|---|
 | 49 | .sp | 
|---|
 | 50 | int | 
|---|
 | 51 | \fBTcl_UtfCharComplete\fR(\fIsrc, length\fR) | 
|---|
 | 52 | .sp | 
|---|
 | 53 | int  | 
|---|
 | 54 | \fBTcl_NumUtfChars\fR(\fIsrc, length\fR) | 
|---|
 | 55 | .sp | 
|---|
 | 56 | const char * | 
|---|
 | 57 | \fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR) | 
|---|
 | 58 | .sp | 
|---|
 | 59 | const char * | 
|---|
 | 60 | \fBTcl_UtfFindLast\fR(\fIsrc, ch\fR) | 
|---|
 | 61 | .sp | 
|---|
 | 62 | const char * | 
|---|
 | 63 | \fBTcl_UtfNext\fR(\fIsrc\fR) | 
|---|
 | 64 | .sp | 
|---|
 | 65 | const char * | 
|---|
 | 66 | \fBTcl_UtfPrev\fR(\fIsrc, start\fR) | 
|---|
 | 67 | .sp | 
|---|
 | 68 | Tcl_UniChar | 
|---|
 | 69 | \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR) | 
|---|
 | 70 | .sp | 
|---|
 | 71 | const char * | 
|---|
 | 72 | \fBTcl_UtfAtIndex\fR(\fIsrc, index\fR) | 
|---|
 | 73 | .sp | 
|---|
 | 74 | int | 
|---|
 | 75 | \fBTcl_UtfBackslash\fR(\fIsrc, readPtr, dst\fR) | 
|---|
 | 76 | .SH ARGUMENTS | 
|---|
 | 77 | .AS "const Tcl_UniChar" *uniPattern in/out | 
|---|
 | 78 | .AP char *buf out | 
|---|
 | 79 | Buffer in which the UTF-8 representation of the Tcl_UniChar is stored.  At most | 
|---|
 | 80 | \fBTCL_UTF_MAX\fR bytes are stored in the buffer. | 
|---|
 | 81 | .AP int ch in | 
|---|
 | 82 | The Tcl_UniChar to be converted or examined. | 
|---|
 | 83 | .AP Tcl_UniChar *chPtr out | 
|---|
 | 84 | Filled with the Tcl_UniChar represented by the head of the UTF-8 string. | 
|---|
 | 85 | .AP "const char" *src in | 
|---|
 | 86 | Pointer to a UTF-8 string. | 
|---|
 | 87 | .AP "const char" *cs in | 
|---|
 | 88 | Pointer to a UTF-8 string. | 
|---|
 | 89 | .AP "const char" *ct in | 
|---|
 | 90 | Pointer to a UTF-8 string. | 
|---|
 | 91 | .AP "const Tcl_UniChar" *uniStr in | 
|---|
 | 92 | A null-terminated Unicode string. | 
|---|
 | 93 | .AP "const Tcl_UniChar" *ucs in | 
|---|
 | 94 | A null-terminated Unicode string. | 
|---|
 | 95 | .AP "const Tcl_UniChar" *uct in | 
|---|
 | 96 | A null-terminated Unicode string. | 
|---|
 | 97 | .AP "const Tcl_UniChar" *uniPattern in | 
|---|
 | 98 | A null-terminated Unicode string. | 
|---|
 | 99 | .AP int length in | 
|---|
 | 100 | The length of the UTF-8 string in bytes (not UTF-8 characters).  If | 
|---|
 | 101 | negative, all bytes up to the first null byte are used. | 
|---|
 | 102 | .AP int uniLength in | 
|---|
 | 103 | The length of the Unicode string in characters.  Must be greater than or | 
|---|
 | 104 | equal to 0. | 
|---|
 | 105 | .AP "Tcl_DString" *dsPtr in/out | 
|---|
 | 106 | A pointer to a previously initialized \fBTcl_DString\fR. | 
|---|
 | 107 | .AP "unsigned long" numChars in | 
|---|
 | 108 | The number of characters to compare. | 
|---|
 | 109 | .AP "const char" *start in | 
|---|
 | 110 | Pointer to the beginning of a UTF-8 string. | 
|---|
 | 111 | .AP int index in | 
|---|
 | 112 | The index of a character (not byte) in the UTF-8 string. | 
|---|
 | 113 | .AP int *readPtr out | 
|---|
 | 114 | If non-NULL, filled with the number of bytes in the backslash sequence,  | 
|---|
 | 115 | including the backslash character. | 
|---|
 | 116 | .AP char *dst out | 
|---|
 | 117 | Buffer in which the bytes represented by the backslash sequence are stored. | 
|---|
 | 118 | At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer. | 
|---|
 | 119 | .AP int nocase in | 
|---|
 | 120 | Specifies whether the match should be done case-sensitive (0) or | 
|---|
 | 121 | case-insensitive (1). | 
|---|
 | 122 | .BE | 
|---|
 | 123 |  | 
|---|
 | 124 | .SH DESCRIPTION | 
|---|
 | 125 | .PP | 
|---|
 | 126 | These routines convert between UTF-8 strings and Tcl_UniChars.  A | 
|---|
 | 127 | Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size | 
|---|
 | 128 | quantity.  A UTF-8 character is a Unicode character represented as | 
|---|
 | 129 | a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes.  A multibyte UTF-8 | 
|---|
 | 130 | sequence consists of a lead byte followed by some number of trail bytes. | 
|---|
 | 131 | .PP | 
|---|
 | 132 | \fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to | 
|---|
 | 133 | represent one Unicode character in the UTF-8 representation. | 
|---|
 | 134 | .PP | 
|---|
 | 135 | \fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string | 
|---|
 | 136 | starting at \fIbuf\fR.  The return value is the number of bytes stored | 
|---|
 | 137 | in \fIbuf\fR. | 
|---|
 | 138 | .PP | 
|---|
 | 139 | \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR | 
|---|
 | 140 | and stores it as a Tcl_UniChar in \fI*chPtr\fR.  The return value is the | 
|---|
 | 141 | number of bytes read from \fIsrc\fR.  The caller must ensure that the | 
|---|
 | 142 | source buffer is long enough such that this routine does not run off the | 
|---|
 | 143 | end and dereference non-existent or random memory; if the source buffer | 
|---|
 | 144 | is known to be null-terminated, this will not happen.  If the input is | 
|---|
 | 145 | not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first | 
|---|
 | 146 | byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and | 
|---|
 | 147 | 0x00ff and return 1.   | 
|---|
 | 148 | .PP | 
|---|
 | 149 | \fBTcl_UniCharToUtfDString\fR converts the given Unicode string | 
|---|
 | 150 | to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR. | 
|---|
 | 151 | You must specify \fIuniLength\fR, the length of the given Unicode string. | 
|---|
 | 152 | The return value is a pointer to the UTF-8 representation of the | 
|---|
 | 153 | Unicode string.  Storage for the return value is appended to the | 
|---|
 | 154 | end of the \fBTcl_DString\fR. | 
|---|
 | 155 | .PP | 
|---|
 | 156 | \fBTcl_UtfToUniCharDString\fR converts the given UTF-8 string to Unicode, | 
|---|
 | 157 | storing the result in the previously initialized \fBTcl_DString\fR. | 
|---|
 | 158 | In the argument \fIlength\fR, you may either specify the length of | 
|---|
 | 159 | the given UTF-8 string in bytes or | 
|---|
 | 160 | .QW \-1 , | 
|---|
 | 161 | in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to | 
|---|
 | 162 | calculate the length.  The return value is a pointer to the Unicode | 
|---|
 | 163 | representation of the UTF-8 string.  Storage for the return value | 
|---|
 | 164 | is appended to the end of the \fBTcl_DString\fR.  The Unicode string | 
|---|
 | 165 | is terminated with a Unicode null character. | 
|---|
 | 166 | .PP | 
|---|
 | 167 | \fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode | 
|---|
 | 168 | characters.  It accepts a null-terminated Unicode string and returns | 
|---|
 | 169 | the number of Unicode characters (not bytes) in that string. | 
|---|
 | 170 | .PP | 
|---|
 | 171 | \fBTcl_UniCharNcmp\fR and \fBTcl_UniCharNcasecmp\fR correspond to | 
|---|
 | 172 | \fBstrncmp\fR and \fBstrncasecmp\fR, respectively, for Unicode characters. | 
|---|
 | 173 | They accept two null-terminated Unicode strings and the number of characters | 
|---|
 | 174 | to compare.  Both strings are assumed to be at least \fInumChars\fR characters | 
|---|
 | 175 | long.  \fBTcl_UniCharNcmp\fR compares the two strings character-by-character | 
|---|
 | 176 | according to the Unicode character ordering.  It returns an integer greater | 
|---|
 | 177 | than, equal to, or less than 0 if the first string is greater than, equal | 
|---|
 | 178 | to, or less than the second string respectively.  \fBTcl_UniCharNcasecmp\fR | 
|---|
 | 179 | is the Unicode case-insensitive version. | 
|---|
 | 180 | .PP | 
|---|
 | 181 | \fBTcl_UniCharCaseMatch\fR is the Unicode equivalent to | 
|---|
 | 182 | \fBTcl_StringCaseMatch\fR.  It accepts a null-terminated Unicode string, | 
|---|
 | 183 | a Unicode pattern, and a boolean value specifying whether the match should | 
|---|
 | 184 | be case-insensitive, and returns whether the string matches the pattern. | 
|---|
 | 185 | .PP | 
|---|
 | 186 | \fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR for UTF-8 strings. It | 
|---|
 | 187 | accepts two null-terminated UTF-8 strings and the number of characters | 
|---|
 | 188 | to compare.  (Both strings are assumed to be at least \fInumChars\fR | 
|---|
 | 189 | characters long.)  \fBTcl_UtfNcmp\fR compares the two strings | 
|---|
 | 190 | character-by-character according to the Unicode character ordering. | 
|---|
 | 191 | It returns an integer greater than, equal to, or less than 0 if the | 
|---|
 | 192 | first string is greater than, equal to, or less than the second string | 
|---|
 | 193 | respectively. | 
|---|
 | 194 | .PP | 
|---|
 | 195 | \fBTcl_UtfNcasecmp\fR corresponds to \fBstrncasecmp\fR for UTF-8 | 
|---|
 | 196 | strings.  It is similar to \fBTcl_UtfNcmp\fR except comparisons ignore | 
|---|
 | 197 | differences in case when comparing upper, lower or title case | 
|---|
 | 198 | characters. | 
|---|
 | 199 | .PP | 
|---|
 | 200 | \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR | 
|---|
 | 201 | of \fIlength\fR bytes is long enough to be decoded by | 
|---|
 | 202 | \fBTcl_UtfToUniChar\fR, or 0 otherwise.  This function does not guarantee | 
|---|
 | 203 | that the UTF-8 string is properly formed.  This routine is used by | 
|---|
 | 204 | procedures that are operating on a byte at a time and need to know if a | 
|---|
 | 205 | full Tcl_UniChar has been seen. | 
|---|
 | 206 | .PP | 
|---|
 | 207 | \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings.  It | 
|---|
 | 208 | returns the number of Tcl_UniChars that are represented by the UTF-8 string | 
|---|
 | 209 | \fIsrc\fR.  The length of the source string is \fIlength\fR bytes.  If the | 
|---|
 | 210 | length is negative, all bytes up to the first null byte are used. | 
|---|
 | 211 | .PP | 
|---|
 | 212 | \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings.  It | 
|---|
 | 213 | returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR | 
|---|
 | 214 | in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is | 
|---|
 | 215 | considered part of the UTF-8 string.   | 
|---|
 | 216 | .PP | 
|---|
 | 217 | \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings.  It | 
|---|
 | 218 | returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR | 
|---|
 | 219 | in the null-terminated UTF-8 string \fIsrc\fR.  The null terminator is | 
|---|
 | 220 | considered part of the UTF-8 string.   | 
|---|
 | 221 | .PP | 
|---|
 | 222 | Given \fIsrc\fR, a pointer to some location in a UTF-8 string, | 
|---|
 | 223 | \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the | 
|---|
 | 224 | string.  The caller must not ask for the next character after the last | 
|---|
 | 225 | character in the string if the string is not terminated by a null | 
|---|
 | 226 | character. | 
|---|
 | 227 | .PP | 
|---|
 | 228 | Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a | 
|---|
 | 229 | null byte immediately following such a string), \fBTcl_UtfPrev\fR | 
|---|
 | 230 | returns a pointer to the closest preceding byte that starts a UTF-8 | 
|---|
 | 231 | character. | 
|---|
 | 232 | This function will not back up to a position before \fIstart\fR, | 
|---|
 | 233 | the start of the UTF-8 string.  If \fIsrc\fR was already at \fIstart\fR, the | 
|---|
 | 234 | return value will be \fIstart\fR. | 
|---|
 | 235 | .PP | 
|---|
 | 236 | \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the | 
|---|
 | 237 | Pascal Ord() function.  It returns the Tcl_UniChar represented at the | 
|---|
 | 238 | specified character (not byte) \fIindex\fR in the UTF-8 string | 
|---|
 | 239 | \fIsrc\fR.  The source string must contain at least \fIindex\fR | 
|---|
 | 240 | characters.  Behavior is undefined if a negative \fIindex\fR is given. | 
|---|
 | 241 | .PP | 
|---|
 | 242 | \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not | 
|---|
 | 243 | byte) \fIindex\fR in the UTF-8 string \fIsrc\fR.  The source string must | 
|---|
 | 244 | contain at least \fIindex\fR characters.  This is equivalent to calling  | 
|---|
 | 245 | \fBTcl_UtfNext\fR \fIindex\fR times.  If a negative \fIindex\fR is given, | 
|---|
 | 246 | the return pointer points to the first character in the source string. | 
|---|
 | 247 | .PP | 
|---|
 | 248 | \fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl | 
|---|
 | 249 | commands.  It parses a backslash sequence and stores the properly formed | 
|---|
 | 250 | UTF-8 character represented by the backslash sequence in the output | 
|---|
 | 251 | buffer \fIdst\fR.  At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer. | 
|---|
 | 252 | If \fIreadPtr\fR is non-NULL, \fI*readPtr\fR is set to the number | 
|---|
 | 253 | of bytes in the backslash sequence, including the backslash character. | 
|---|
 | 254 | The return value is the number of bytes stored in the output buffer. | 
|---|
 | 255 | .PP | 
|---|
 | 256 | See the \fBTcl\fR manual entry for information on the valid backslash | 
|---|
 | 257 | sequences.  All of the sequences described in the Tcl manual entry are | 
|---|
 | 258 | supported by \fBTcl_UtfBackslash\fR. | 
|---|
 | 259 |  | 
|---|
 | 260 | .SH KEYWORDS | 
|---|
 | 261 | utf, unicode, backslash | 
|---|