[25] | 1 | '\" |
---|
| 2 | '\" Copyright (c) 1997 Sun Microsystems, Inc. |
---|
| 3 | '\" |
---|
| 4 | '\" See the file "license.terms" for information on usage and redistribution |
---|
| 5 | '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. |
---|
| 6 | '\" |
---|
| 7 | '\" RCS: @(#) $Id: Utf.3,v 1.25 2007/12/13 15:22:32 dgp Exp $ |
---|
| 8 | '\" |
---|
| 9 | .so man.macros |
---|
| 10 | .TH Utf 3 "8.1" Tcl "Tcl Library Procedures" |
---|
| 11 | .BS |
---|
| 12 | .SH NAME |
---|
| 13 | Tcl_UniChar, Tcl_UniCharCaseMatch, Tcl_UniCharNcasecmp, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_UniCharLen, Tcl_UniCharNcmp, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating UTF-8 strings |
---|
| 14 | .SH SYNOPSIS |
---|
| 15 | .nf |
---|
| 16 | \fB#include <tcl.h>\fR |
---|
| 17 | .sp |
---|
| 18 | typedef ... Tcl_UniChar; |
---|
| 19 | .sp |
---|
| 20 | int |
---|
| 21 | \fBTcl_UniCharToUtf\fR(\fIch, buf\fR) |
---|
| 22 | .sp |
---|
| 23 | int |
---|
| 24 | \fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR) |
---|
| 25 | .sp |
---|
| 26 | char * |
---|
| 27 | \fBTcl_UniCharToUtfDString\fR(\fIuniStr, uniLength, dsPtr\fR) |
---|
| 28 | .sp |
---|
| 29 | Tcl_UniChar * |
---|
| 30 | \fBTcl_UtfToUniCharDString\fR(\fIsrc, length, dsPtr\fR) |
---|
| 31 | .sp |
---|
| 32 | int |
---|
| 33 | \fBTcl_UniCharLen\fR(\fIuniStr\fR) |
---|
| 34 | .sp |
---|
| 35 | int |
---|
| 36 | \fBTcl_UniCharNcmp\fR(\fIucs, uct, numChars\fR) |
---|
| 37 | .sp |
---|
| 38 | int |
---|
| 39 | \fBTcl_UniCharNcasecmp\fR(\fIucs, uct, numChars\fR) |
---|
| 40 | .sp |
---|
| 41 | int |
---|
| 42 | \fBTcl_UniCharCaseMatch\fR(\fIuniStr, uniPattern, nocase\fR) |
---|
| 43 | .sp |
---|
| 44 | int |
---|
| 45 | \fBTcl_UtfNcmp\fR(\fIcs, ct, numChars\fR) |
---|
| 46 | .sp |
---|
| 47 | int |
---|
| 48 | \fBTcl_UtfNcasecmp\fR(\fIcs, ct, numChars\fR) |
---|
| 49 | .sp |
---|
| 50 | int |
---|
| 51 | \fBTcl_UtfCharComplete\fR(\fIsrc, length\fR) |
---|
| 52 | .sp |
---|
| 53 | int |
---|
| 54 | \fBTcl_NumUtfChars\fR(\fIsrc, length\fR) |
---|
| 55 | .sp |
---|
| 56 | const char * |
---|
| 57 | \fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR) |
---|
| 58 | .sp |
---|
| 59 | const char * |
---|
| 60 | \fBTcl_UtfFindLast\fR(\fIsrc, ch\fR) |
---|
| 61 | .sp |
---|
| 62 | const char * |
---|
| 63 | \fBTcl_UtfNext\fR(\fIsrc\fR) |
---|
| 64 | .sp |
---|
| 65 | const char * |
---|
| 66 | \fBTcl_UtfPrev\fR(\fIsrc, start\fR) |
---|
| 67 | .sp |
---|
| 68 | Tcl_UniChar |
---|
| 69 | \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR) |
---|
| 70 | .sp |
---|
| 71 | const char * |
---|
| 72 | \fBTcl_UtfAtIndex\fR(\fIsrc, index\fR) |
---|
| 73 | .sp |
---|
| 74 | int |
---|
| 75 | \fBTcl_UtfBackslash\fR(\fIsrc, readPtr, dst\fR) |
---|
| 76 | .SH ARGUMENTS |
---|
| 77 | .AS "const Tcl_UniChar" *uniPattern in/out |
---|
| 78 | .AP char *buf out |
---|
| 79 | Buffer in which the UTF-8 representation of the Tcl_UniChar is stored. At most |
---|
| 80 | \fBTCL_UTF_MAX\fR bytes are stored in the buffer. |
---|
| 81 | .AP int ch in |
---|
| 82 | The Tcl_UniChar to be converted or examined. |
---|
| 83 | .AP Tcl_UniChar *chPtr out |
---|
| 84 | Filled with the Tcl_UniChar represented by the head of the UTF-8 string. |
---|
| 85 | .AP "const char" *src in |
---|
| 86 | Pointer to a UTF-8 string. |
---|
| 87 | .AP "const char" *cs in |
---|
| 88 | Pointer to a UTF-8 string. |
---|
| 89 | .AP "const char" *ct in |
---|
| 90 | Pointer to a UTF-8 string. |
---|
| 91 | .AP "const Tcl_UniChar" *uniStr in |
---|
| 92 | A null-terminated Unicode string. |
---|
| 93 | .AP "const Tcl_UniChar" *ucs in |
---|
| 94 | A null-terminated Unicode string. |
---|
| 95 | .AP "const Tcl_UniChar" *uct in |
---|
| 96 | A null-terminated Unicode string. |
---|
| 97 | .AP "const Tcl_UniChar" *uniPattern in |
---|
| 98 | A null-terminated Unicode string. |
---|
| 99 | .AP int length in |
---|
| 100 | The length of the UTF-8 string in bytes (not UTF-8 characters). If |
---|
| 101 | negative, all bytes up to the first null byte are used. |
---|
| 102 | .AP int uniLength in |
---|
| 103 | The length of the Unicode string in characters. Must be greater than or |
---|
| 104 | equal to 0. |
---|
| 105 | .AP "Tcl_DString" *dsPtr in/out |
---|
| 106 | A pointer to a previously initialized \fBTcl_DString\fR. |
---|
| 107 | .AP "unsigned long" numChars in |
---|
| 108 | The number of characters to compare. |
---|
| 109 | .AP "const char" *start in |
---|
| 110 | Pointer to the beginning of a UTF-8 string. |
---|
| 111 | .AP int index in |
---|
| 112 | The index of a character (not byte) in the UTF-8 string. |
---|
| 113 | .AP int *readPtr out |
---|
| 114 | If non-NULL, filled with the number of bytes in the backslash sequence, |
---|
| 115 | including the backslash character. |
---|
| 116 | .AP char *dst out |
---|
| 117 | Buffer in which the bytes represented by the backslash sequence are stored. |
---|
| 118 | At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer. |
---|
| 119 | .AP int nocase in |
---|
| 120 | Specifies whether the match should be done case-sensitive (0) or |
---|
| 121 | case-insensitive (1). |
---|
| 122 | .BE |
---|
| 123 | |
---|
| 124 | .SH DESCRIPTION |
---|
| 125 | .PP |
---|
| 126 | These routines convert between UTF-8 strings and Tcl_UniChars. A |
---|
| 127 | Tcl_UniChar is a Unicode character represented as an unsigned, fixed-size |
---|
| 128 | quantity. A UTF-8 character is a Unicode character represented as |
---|
| 129 | a varying-length sequence of up to \fBTCL_UTF_MAX\fR bytes. A multibyte UTF-8 |
---|
| 130 | sequence consists of a lead byte followed by some number of trail bytes. |
---|
| 131 | .PP |
---|
| 132 | \fBTCL_UTF_MAX\fR is the maximum number of bytes that it takes to |
---|
| 133 | represent one Unicode character in the UTF-8 representation. |
---|
| 134 | .PP |
---|
| 135 | \fBTcl_UniCharToUtf\fR stores the Tcl_UniChar \fIch\fR as a UTF-8 string |
---|
| 136 | starting at \fIbuf\fR. The return value is the number of bytes stored |
---|
| 137 | in \fIbuf\fR. |
---|
| 138 | .PP |
---|
| 139 | \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR |
---|
| 140 | and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the |
---|
| 141 | number of bytes read from \fIsrc\fR. The caller must ensure that the |
---|
| 142 | source buffer is long enough such that this routine does not run off the |
---|
| 143 | end and dereference non-existent or random memory; if the source buffer |
---|
| 144 | is known to be null-terminated, this will not happen. If the input is |
---|
| 145 | not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first |
---|
| 146 | byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and |
---|
| 147 | 0x00ff and return 1. |
---|
| 148 | .PP |
---|
| 149 | \fBTcl_UniCharToUtfDString\fR converts the given Unicode string |
---|
| 150 | to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR. |
---|
| 151 | You must specify \fIuniLength\fR, the length of the given Unicode string. |
---|
| 152 | The return value is a pointer to the UTF-8 representation of the |
---|
| 153 | Unicode string. Storage for the return value is appended to the |
---|
| 154 | end of the \fBTcl_DString\fR. |
---|
| 155 | .PP |
---|
| 156 | \fBTcl_UtfToUniCharDString\fR converts the given UTF-8 string to Unicode, |
---|
| 157 | storing the result in the previously initialized \fBTcl_DString\fR. |
---|
| 158 | In the argument \fIlength\fR, you may either specify the length of |
---|
| 159 | the given UTF-8 string in bytes or |
---|
| 160 | .QW \-1 , |
---|
| 161 | in which case \fBTcl_UtfToUniCharDString\fR uses \fBstrlen\fR to |
---|
| 162 | calculate the length. The return value is a pointer to the Unicode |
---|
| 163 | representation of the UTF-8 string. Storage for the return value |
---|
| 164 | is appended to the end of the \fBTcl_DString\fR. The Unicode string |
---|
| 165 | is terminated with a Unicode null character. |
---|
| 166 | .PP |
---|
| 167 | \fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Unicode |
---|
| 168 | characters. It accepts a null-terminated Unicode string and returns |
---|
| 169 | the number of Unicode characters (not bytes) in that string. |
---|
| 170 | .PP |
---|
| 171 | \fBTcl_UniCharNcmp\fR and \fBTcl_UniCharNcasecmp\fR correspond to |
---|
| 172 | \fBstrncmp\fR and \fBstrncasecmp\fR, respectively, for Unicode characters. |
---|
| 173 | They accept two null-terminated Unicode strings and the number of characters |
---|
| 174 | to compare. Both strings are assumed to be at least \fInumChars\fR characters |
---|
| 175 | long. \fBTcl_UniCharNcmp\fR compares the two strings character-by-character |
---|
| 176 | according to the Unicode character ordering. It returns an integer greater |
---|
| 177 | than, equal to, or less than 0 if the first string is greater than, equal |
---|
| 178 | to, or less than the second string respectively. \fBTcl_UniCharNcasecmp\fR |
---|
| 179 | is the Unicode case-insensitive version. |
---|
| 180 | .PP |
---|
| 181 | \fBTcl_UniCharCaseMatch\fR is the Unicode equivalent to |
---|
| 182 | \fBTcl_StringCaseMatch\fR. It accepts a null-terminated Unicode string, |
---|
| 183 | a Unicode pattern, and a boolean value specifying whether the match should |
---|
| 184 | be case-insensitive, and returns whether the string matches the pattern. |
---|
| 185 | .PP |
---|
| 186 | \fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR for UTF-8 strings. It |
---|
| 187 | accepts two null-terminated UTF-8 strings and the number of characters |
---|
| 188 | to compare. (Both strings are assumed to be at least \fInumChars\fR |
---|
| 189 | characters long.) \fBTcl_UtfNcmp\fR compares the two strings |
---|
| 190 | character-by-character according to the Unicode character ordering. |
---|
| 191 | It returns an integer greater than, equal to, or less than 0 if the |
---|
| 192 | first string is greater than, equal to, or less than the second string |
---|
| 193 | respectively. |
---|
| 194 | .PP |
---|
| 195 | \fBTcl_UtfNcasecmp\fR corresponds to \fBstrncasecmp\fR for UTF-8 |
---|
| 196 | strings. It is similar to \fBTcl_UtfNcmp\fR except comparisons ignore |
---|
| 197 | differences in case when comparing upper, lower or title case |
---|
| 198 | characters. |
---|
| 199 | .PP |
---|
| 200 | \fBTcl_UtfCharComplete\fR returns 1 if the source UTF-8 string \fIsrc\fR |
---|
| 201 | of \fIlength\fR bytes is long enough to be decoded by |
---|
| 202 | \fBTcl_UtfToUniChar\fR, or 0 otherwise. This function does not guarantee |
---|
| 203 | that the UTF-8 string is properly formed. This routine is used by |
---|
| 204 | procedures that are operating on a byte at a time and need to know if a |
---|
| 205 | full Tcl_UniChar has been seen. |
---|
| 206 | .PP |
---|
| 207 | \fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for UTF-8 strings. It |
---|
| 208 | returns the number of Tcl_UniChars that are represented by the UTF-8 string |
---|
| 209 | \fIsrc\fR. The length of the source string is \fIlength\fR bytes. If the |
---|
| 210 | length is negative, all bytes up to the first null byte are used. |
---|
| 211 | .PP |
---|
| 212 | \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It |
---|
| 213 | returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR |
---|
| 214 | in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is |
---|
| 215 | considered part of the UTF-8 string. |
---|
| 216 | .PP |
---|
| 217 | \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It |
---|
| 218 | returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR |
---|
| 219 | in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is |
---|
| 220 | considered part of the UTF-8 string. |
---|
| 221 | .PP |
---|
| 222 | Given \fIsrc\fR, a pointer to some location in a UTF-8 string, |
---|
| 223 | \fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the |
---|
| 224 | string. The caller must not ask for the next character after the last |
---|
| 225 | character in the string if the string is not terminated by a null |
---|
| 226 | character. |
---|
| 227 | .PP |
---|
| 228 | Given \fIsrc\fR, a pointer to some location in a UTF-8 string (or to a |
---|
| 229 | null byte immediately following such a string), \fBTcl_UtfPrev\fR |
---|
| 230 | returns a pointer to the closest preceding byte that starts a UTF-8 |
---|
| 231 | character. |
---|
| 232 | This function will not back up to a position before \fIstart\fR, |
---|
| 233 | the start of the UTF-8 string. If \fIsrc\fR was already at \fIstart\fR, the |
---|
| 234 | return value will be \fIstart\fR. |
---|
| 235 | .PP |
---|
| 236 | \fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the |
---|
| 237 | Pascal Ord() function. It returns the Tcl_UniChar represented at the |
---|
| 238 | specified character (not byte) \fIindex\fR in the UTF-8 string |
---|
| 239 | \fIsrc\fR. The source string must contain at least \fIindex\fR |
---|
| 240 | characters. Behavior is undefined if a negative \fIindex\fR is given. |
---|
| 241 | .PP |
---|
| 242 | \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not |
---|
| 243 | byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must |
---|
| 244 | contain at least \fIindex\fR characters. This is equivalent to calling |
---|
| 245 | \fBTcl_UtfNext\fR \fIindex\fR times. If a negative \fIindex\fR is given, |
---|
| 246 | the return pointer points to the first character in the source string. |
---|
| 247 | .PP |
---|
| 248 | \fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl |
---|
| 249 | commands. It parses a backslash sequence and stores the properly formed |
---|
| 250 | UTF-8 character represented by the backslash sequence in the output |
---|
| 251 | buffer \fIdst\fR. At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer. |
---|
| 252 | If \fIreadPtr\fR is non-NULL, \fBTcl_UtfBackslash\fR sets \fI*readPtr\fR to the number |
---|
| 253 | of bytes in the backslash sequence, including the backslash character. |
---|
| 254 | The return value is the number of bytes stored in the output buffer. |
---|
| 255 | .PP |
---|
| 256 | See the \fBTcl\fR manual entry for information on the valid backslash |
---|
| 257 | sequences. All of the sequences described in the Tcl manual entry are |
---|
| 258 | supported by \fBTcl_UtfBackslash\fR. |
---|
| 259 | |
---|
| 260 | .SH KEYWORDS |
---|
| 261 | utf, unicode, backslash |
---|