Planet
navi homePPSaboutscreenshotsdownloaddevelopmentforum

source: downloads/tcl8.5.2/doc/Encoding.3 @ 25

Last change on this file since 25 was 25, checked in by landauf, 16 years ago

added tcl to libs

File size: 27.7 KB
Line 
1'\"
2'\" Copyright (c) 1997-1998 Sun Microsystems, Inc.
3'\"
4'\" See the file "license.terms" for information on usage and redistribution
5'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
6'\"
7'\" RCS: @(#) $Id: Encoding.3,v 1.29 2007/12/13 15:22:31 dgp Exp $
8'\"
9.so man.macros
10.TH Tcl_GetEncoding 3 "8.1" Tcl "Tcl Library Procedures"
11.BS
12.SH NAME
13Tcl_GetEncoding, Tcl_FreeEncoding, Tcl_GetEncodingFromObj, Tcl_ExternalToUtfDString, Tcl_ExternalToUtf, Tcl_UtfToExternalDString, Tcl_UtfToExternal, Tcl_WinTCharToUtf, Tcl_WinUtfToTChar, Tcl_GetEncodingName, Tcl_SetSystemEncoding, Tcl_GetEncodingNameFromEnvironment, Tcl_GetEncodingNames, Tcl_CreateEncoding, Tcl_GetEncodingSearchPath, Tcl_SetEncodingSearchPath, Tcl_GetDefaultEncodingDir, Tcl_SetDefaultEncodingDir \- procedures for creating and using encodings
14.SH SYNOPSIS
15.nf
16\fB#include <tcl.h>\fR
17.sp
18Tcl_Encoding
19\fBTcl_GetEncoding\fR(\fIinterp, name\fR)
20.sp
21void
22\fBTcl_FreeEncoding\fR(\fIencoding\fR)
23.sp
24.VS 8.5
25int
26\fBTcl_GetEncodingFromObj\fR(\fIinterp, objPtr, encodingPtr\fR)
27.VE 8.5
28.sp
29char *
30\fBTcl_ExternalToUtfDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
31.sp
32char *
33\fBTcl_UtfToExternalDString\fR(\fIencoding, src, srcLen, dstPtr\fR)
34.sp
35int
36\fBTcl_ExternalToUtf\fR(\fIinterp, encoding, src, srcLen, flags, statePtr,
37                  dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr\fR)
38.sp
39int
40\fBTcl_UtfToExternal\fR(\fIinterp, encoding, src, srcLen, flags, statePtr,
41                  dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr\fR)
42.sp
43char *
44\fBTcl_WinTCharToUtf\fR(\fItsrc, srcLen, dstPtr\fR)
45.sp
46TCHAR *
47\fBTcl_WinUtfToTChar\fR(\fIsrc, srcLen, dstPtr\fR)
48.sp
49const char *
50\fBTcl_GetEncodingName\fR(\fIencoding\fR)
51.sp
52int
53\fBTcl_SetSystemEncoding\fR(\fIinterp, name\fR)
54.sp
55.VS 8.5
56const char *
57\fBTcl_GetEncodingNameFromEnvironment\fR(\fIbufPtr\fR)
58.VE 8.5
59.sp
60void
61\fBTcl_GetEncodingNames\fR(\fIinterp\fR)
62.sp
63Tcl_Encoding
64\fBTcl_CreateEncoding\fR(\fItypePtr\fR)
65.sp
66.VS 8.5
67Tcl_Obj *
68\fBTcl_GetEncodingSearchPath\fR()
69.sp
70int
71\fBTcl_SetEncodingSearchPath\fR(\fIsearchPath\fR)
72.VE 8.5
73.sp
74const char *
75\fBTcl_GetDefaultEncodingDir\fR(\fIvoid\fR)
76.sp
77void
78\fBTcl_SetDefaultEncodingDir\fR(\fIpath\fR)
79.SH ARGUMENTS
80.AS "const Tcl_EncodingType" *dstWrotePtr in/out
81.AP Tcl_Interp *interp in
82Interpreter to use for error reporting, or NULL if no error reporting is
83desired.
84.AP "const char" *name in
85Name of encoding to load.
86.AP Tcl_Encoding encoding in
87The encoding to query, free, or use for converting text.  If \fIencoding\fR is
88NULL, the current system encoding is used.
89.AP Tcl_Obj *objPtr in
90.VS 8.5
91Name of encoding to get token for.
92.VE 8.5
93.AP Tcl_Encoding *encodingPtr out
94.VS 8.5
95Points to storage where encoding token is to be written.
96.VE 8.5
97.AP "const char" *src in
98For the \fBTcl_ExternalToUtf\fR functions, an array of bytes in the
99specified encoding that are to be converted to UTF-8.  For the
100\fBTcl_UtfToExternal\fR and \fBTcl_WinUtfToTChar\fR functions, an array of
101UTF-8 characters to be converted to the specified encoding. 
102.AP "const TCHAR" *tsrc in
103An array of Windows TCHAR characters to convert to UTF-8.
104.AP int srcLen in
105Length of \fIsrc\fR or \fItsrc\fR in bytes.  If the length is negative, the
106encoding-specific length of the string is used.
107.AP Tcl_DString *dstPtr out
108Pointer to an uninitialized or free \fBTcl_DString\fR in which the converted
109result will be stored.
110.AP int flags in
111Various flag bits OR-ed together. 
112\fBTCL_ENCODING_START\fR signifies that the
113source buffer is the first block in a (potentially multi-block) input
114stream, telling the conversion routine to reset to an initial state and
115perform any initialization that needs to occur before the first byte is
116converted. \fBTCL_ENCODING_END\fR signifies that the source buffer is the last
117block in a (potentially multi-block) input stream, telling the conversion
118routine to perform any finalization that needs to occur after the last
119byte is converted and then to reset to an initial state.
120\fBTCL_ENCODING_STOPONERROR\fR signifies that the conversion routine should
121return immediately upon reading a source character that does not exist in
122the target encoding; otherwise a default fallback character will
123automatically be substituted. 
124.AP Tcl_EncodingState *statePtr in/out
125Used when converting a (generally long or indefinite length) byte stream
126in a piece-by-piece fashion.  The conversion routine stores its current
127state in \fI*statePtr\fR after \fIsrc\fR (the buffer containing the
128current piece) has been converted; that state information must be passed
129back when converting the next piece of the stream so the conversion
130routine knows what state it was in when it left off at the end of the
131last piece.  May be NULL, in which case the value specified for \fIflags\fR
132is ignored and the source buffer is assumed to contain the complete string to
133convert.
134.AP char *dst out
135Buffer in which the converted result will be stored.  No more than
136\fIdstLen\fR bytes will be stored in \fIdst\fR.
137.AP int dstLen in
138The maximum length of the output buffer \fIdst\fR in bytes.
139.AP int *srcReadPtr out
140Filled with the number of bytes from \fIsrc\fR that were actually
141converted.  This may be less than the original source length if there was
142a problem converting some source characters.  May be NULL.
143.AP int *dstWrotePtr out
144Filled with the number of bytes that were actually stored in the output
145buffer as a result of the conversion.  May be NULL.
146.AP int *dstCharsPtr out
147Filled with the number of characters that correspond to the number of bytes
148stored in the output buffer.  May be NULL.
149.AP Tcl_DString *bufPtr out
150.VS 8.5
151Storage for the prescribed system encoding name.
152.VE 8.5
153.AP "const Tcl_EncodingType" *typePtr in
154Structure that defines a new type of encoding. 
155.AP Tcl_Obj *searchPath in
156.VS 8.5
157List of filesystem directories in which to search for encoding data files.
158.VE 8.5
159.AP "const char" *path in
160A path to the location of the encoding file. 
161.BE
162.SH INTRODUCTION
163.PP
164These routines convert between Tcl's internal character representation,
165UTF-8, and character representations used by various operating systems or
166file systems, such as Unicode, ASCII, or Shift-JIS.  When operating on
167strings, such as obtaining the names of files or displaying
168characters using international fonts, the strings must be translated into
169one or possibly multiple formats that the various system calls can use.  For
170instance, on a Japanese Unix workstation, a user might obtain a filename
171represented in the EUC-JP file encoding and then translate the characters to
172the jisx0208 font encoding in order to display the filename in a Tk widget.
173The purpose of the encoding package is to help bridge the translation gap.
174UTF-8 provides an intermediate staging ground for all the various
175encodings.  In the example above, text would be translated into UTF-8 from
176whatever file encoding the operating system is using.  Then it would be
177translated from UTF-8 into whatever font encoding the display routines
178require.
179.PP
180Some basic encodings are compiled into Tcl.  Others can be defined by the
181user or dynamically loaded from encoding files in a
182platform-independent manner.
183.SH DESCRIPTION
184.PP
185\fBTcl_GetEncoding\fR finds an encoding given its \fIname\fR.  The name may
186refer to a built-in Tcl encoding, a user-defined encoding registered by
187calling \fBTcl_CreateEncoding\fR, or a dynamically-loadable encoding
188file.  The return value is a token that represents the encoding and can be
189used in subsequent calls to procedures such as \fBTcl_GetEncodingName\fR,
190\fBTcl_FreeEncoding\fR, and \fBTcl_UtfToExternal\fR.  If the name did not
191refer to any known or loadable encoding, NULL is returned and an error
192message is returned in \fIinterp\fR.
193.PP
194The encoding package maintains a database of all encodings currently in use.
195The first time \fIname\fR is seen, \fBTcl_GetEncoding\fR returns an
196encoding with a reference count of 1.  If the same \fIname\fR is requested
197further times, then the reference count for that encoding is incremented
198without the overhead of allocating a new encoding and all its associated
199data structures. 
200.PP
201When an \fIencoding\fR is no longer needed, \fBTcl_FreeEncoding\fR
202should be called to release it.  When an \fIencoding\fR is no longer in use
203anywhere (i.e., it has been freed as many times as it has been gotten)
204\fBTcl_FreeEncoding\fR will release all storage the encoding was using
205and delete it from the database.
206.PP
207.VS 8.5
208\fBTcl_GetEncodingFromObj\fR treats the string representation of
209\fIobjPtr\fR as an encoding name, and finds an encoding with that
210name, just as \fBTcl_GetEncoding\fR does. When an encoding is found,
211it is cached within the \fIobjPtr\fR value for future reference, the
212\fBTcl_Encoding\fR token is written to the storage pointed to by
213\fIencodingPtr\fR, and the value \fBTCL_OK\fR is returned. If no such
214encoding is found, the value \fBTCL_ERROR\fR is returned, and no
215writing to \fB*\fR\fIencodingPtr\fR takes place. Just as with
216\fBTcl_GetEncoding\fR, the caller should call \fBTcl_FreeEncoding\fR
217on the resulting encoding token when that token will no longer be
218used.
219.VE 8.5
220.PP
221\fBTcl_ExternalToUtfDString\fR converts a source buffer \fIsrc\fR from the
222specified \fIencoding\fR into UTF-8.  The converted bytes are stored in
223\fIdstPtr\fR, which is then null-terminated.  The caller should eventually
224call \fBTcl_DStringFree\fR to free any information stored in \fIdstPtr\fR.
225When converting, if any of the characters in the source buffer cannot be
226represented in the target encoding, a default fallback character will be
227used.  The return value is a pointer to the value stored in the DString.
228.PP
229\fBTcl_ExternalToUtf\fR converts a source buffer \fIsrc\fR from the specified
230\fIencoding\fR into UTF-8.  Up to \fIsrcLen\fR bytes are converted from the
231source buffer and up to \fIdstLen\fR converted bytes are stored in \fIdst\fR.
232In all cases, \fI*srcReadPtr\fR is filled with the number of bytes that were
233successfully converted from \fIsrc\fR and \fI*dstWrotePtr\fR is filled with
234the corresponding number of bytes that were stored in \fIdst\fR.  The return
235value is one of the following:
236.RS
237.IP \fBTCL_OK\fR 29
238All bytes of \fIsrc\fR were converted.
239.IP \fBTCL_CONVERT_NOSPACE\fR 29
240The destination buffer was not large enough for all of the converted data; as
241many characters as could fit were converted though.
242.IP \fBTCL_CONVERT_MULTIBYTE\fR 29
243The last few bytes in the source buffer were the beginning of a multibyte
244sequence, but more bytes were needed to complete this sequence.  A
245subsequent call to the conversion routine should pass a buffer containing
246the unconverted bytes that remained in \fIsrc\fR plus some further bytes
247from the source stream to properly convert the formerly split-up multibyte
248sequence. 
249.IP \fBTCL_CONVERT_SYNTAX\fR 29
250The source buffer contained an invalid character sequence.  This may occur
251if the input stream has been damaged or if the input encoding method was
252misidentified.
253.IP \fBTCL_CONVERT_UNKNOWN\fR 29
254The source buffer contained a character that could not be represented in
255the target encoding and \fBTCL_ENCODING_STOPONERROR\fR was specified. 
256.RE
257.LP
258\fBTcl_UtfToExternalDString\fR converts a source buffer \fIsrc\fR from UTF-8
259into the specified \fIencoding\fR.  The converted bytes are stored in
260\fIdstPtr\fR, which is then terminated with the appropriate encoding-specific
261null.  The caller should eventually call \fBTcl_DStringFree\fR to free any
262information stored in \fIdstPtr\fR.  When converting, if any of the
263characters in the source buffer cannot be represented in the target
264encoding, a default fallback character will be used.  The return value is
265a pointer to the value stored in the DString.
266.PP
267\fBTcl_UtfToExternal\fR converts a source buffer \fIsrc\fR from UTF-8 into
268the specified \fIencoding\fR.  Up to \fIsrcLen\fR bytes are converted from
269the source buffer and up to \fIdstLen\fR converted bytes are stored in
270\fIdst\fR.  In all cases, \fI*srcReadPtr\fR is filled with the number of
271bytes that were successfully converted from \fIsrc\fR and \fI*dstWrotePtr\fR
272is filled with the corresponding number of bytes that were stored in
273\fIdst\fR.  The return values are the same as the return values for
274\fBTcl_ExternalToUtf\fR.
275.PP
276\fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR are
277Windows-only convenience
278functions for converting between UTF-8 and Windows strings.  On Windows 95
279(as with the Unix operating system),
280all strings exchanged between Tcl and the operating system are
281.QW "char"
282based.  On Windows NT, some strings exchanged between Tcl and the
283operating system are
284.QW "char"
285oriented while others are in Unicode.  By
286convention, in Windows a TCHAR is a character in the ANSI code page
287on Windows 95 and a Unicode character on Windows NT.
288.PP
289If you planned to use the same
290.QW "char"
291based interfaces on both Windows
29295 and Windows NT, you could use \fBTcl_UtfToExternal\fR and
293\fBTcl_ExternalToUtf\fR (or their \fBTcl_DString\fR equivalents) with an
294encoding of NULL (the current system encoding).  On the other hand,
295if you planned to use the Unicode interface when running on Windows NT
296and the
297.QW "char"
298interfaces when running on Windows 95, you would have
299to perform the following type of test over and over in your program
300(as represented in pseudo-code):
301.CS
302if (running NT) {
303    encoding <- Tcl_GetEncoding("unicode");
304    nativeBuffer <- Tcl_UtfToExternal(encoding, utfBuffer);
305    Tcl_FreeEncoding(encoding);
306} else {
307    nativeBuffer <- Tcl_UtfToExternal(NULL, utfBuffer);
308}
309.CE
310\fBTcl_WinUtfToTChar\fR and \fBTcl_WinTCharToUtf\fR automatically
311handle this test and use the proper encoding based on the current
312operating system.  \fBTcl_WinUtfToTChar\fR returns a pointer to
313a TCHAR string, and \fBTcl_WinTCharToUtf\fR expects a TCHAR string
314pointer as the \fItsrc\fR string.  Otherwise, these functions
315behave identically to \fBTcl_UtfToExternalDString\fR and
316\fBTcl_ExternalToUtfDString\fR.
317.PP
318\fBTcl_GetEncodingName\fR is roughly the inverse of \fBTcl_GetEncoding\fR.
319Given an \fIencoding\fR, the return value is the \fIname\fR argument that
320was used to create the encoding.  The string returned by
321\fBTcl_GetEncodingName\fR is only guaranteed to persist until the
322\fIencoding\fR is deleted.  The caller must not modify this string.
323.PP
324\fBTcl_SetSystemEncoding\fR sets the default encoding that should be used
325whenever the user passes a NULL value for the \fIencoding\fR argument to
326any of the other encoding functions.  If \fIname\fR is NULL, the system
327encoding is reset to the default system encoding, \fBbinary\fR.  If the
328name did not refer to any known or loadable encoding, \fBTCL_ERROR\fR is
329returned and an error message is left in \fIinterp\fR.  Otherwise, this
330procedure increments the reference count of the new system encoding,
331decrements the reference count of the old system encoding, and returns
332\fBTCL_OK\fR.
333.PP
334.VS 8.5
335\fBTcl_GetEncodingNameFromEnvironment\fR provides a means for the Tcl
336library to report the encoding name it believes to be the correct one
337to use as the system encoding, based on system calls and examination of
338the environment suitable for the platform.  It accepts \fIbufPtr\fR,
339a pointer to an uninitialized or freed \fBTcl_DString\fR and writes
340the encoding name to it.  The \fBTcl_DStringValue\fR is returned.
341.VE 8.5
342.PP
343\fBTcl_GetEncodingNames\fR sets the \fIinterp\fR result to a list
344consisting of the names of all the encodings that are currently defined
345or can be dynamically loaded, searching the encoding path specified by
346\fBTcl_SetDefaultEncodingDir\fR.  This procedure does not ensure that the
347dynamically-loadable encoding files contain valid data, but merely that they
348exist.
349.PP
350\fBTcl_CreateEncoding\fR defines a new encoding and registers the C
351procedures that are called back to convert between the encoding and
352UTF-8.  Encodings created by \fBTcl_CreateEncoding\fR are thereafter
353visible in the database used by \fBTcl_GetEncoding\fR.  Just as with the
354\fBTcl_GetEncoding\fR procedure, the return value is a token that
355represents the encoding and can be used in subsequent calls to other
356encoding functions.  \fBTcl_CreateEncoding\fR returns an encoding with a
357reference count of 1. If an encoding with the specified \fIname\fR
358already exists, then its entry in the database is replaced with the new
359encoding; the token for the old encoding will remain valid and continue
360to behave as before, but users of the new token will now call the new
361encoding procedures. 
362.PP
363The \fItypePtr\fR argument to \fBTcl_CreateEncoding\fR contains information
364about the name of the encoding and the procedures that will be called to
365convert between this encoding and UTF-8.  It is defined as follows:
366.PP
367.CS
368typedef struct Tcl_EncodingType {
369        const char *\fIencodingName\fR;
370        Tcl_EncodingConvertProc *\fItoUtfProc\fR;
371        Tcl_EncodingConvertProc *\fIfromUtfProc\fR;
372        Tcl_EncodingFreeProc *\fIfreeProc\fR;
373        ClientData \fIclientData\fR;
374        int \fInullSize\fR;
375} Tcl_EncodingType; 
376.CE
377.PP
378The \fIencodingName\fR provides a string name for the encoding, by
379which it can be referred to in other procedures such as
380\fBTcl_GetEncoding\fR.  The \fItoUtfProc\fR refers to a callback
381procedure to invoke to convert text from this encoding into UTF-8.
382The \fIfromUtfProc\fR refers to a callback procedure to invoke to
383convert text from UTF-8 into this encoding.  The \fIfreeProc\fR refers
384to a callback procedure to invoke when this encoding is deleted.  The
385\fIfreeProc\fR field may be NULL.  The \fIclientData\fR contains an
386arbitrary one-word value passed to \fItoUtfProc\fR, \fIfromUtfProc\fR,
387and \fIfreeProc\fR whenever they are called.  Typically, this is a
388pointer to a data structure containing encoding-specific information
389that can be used by the callback procedures.  For instance, two very
390similar encodings such as \fBascii\fR and \fBmacRoman\fR may use the
391same callback procedure, but use different values of \fIclientData\fR
392to control its behavior.  The \fInullSize\fR specifies the number of
393zero bytes that signify end-of-string in this encoding.  It must be
394\fB1\fR (for single-byte or multi-byte encodings like ASCII or
395Shift-JIS) or \fB2\fR (for double-byte encodings like Unicode).
396Constant-sized encodings with 3 or more bytes per character (such as
397CNS11643) are not accepted.
398.PP
399The callback procedures \fItoUtfProc\fR and \fIfromUtfProc\fR should match the
400type \fBTcl_EncodingConvertProc\fR:
401.PP
402.CS
403typedef int Tcl_EncodingConvertProc(
404        ClientData \fIclientData\fR,
405        const char *\fIsrc\fR,
406        int \fIsrcLen\fR,
407        int \fIflags\fR,
408        Tcl_EncodingState *\fIstatePtr\fR,
409        char *\fIdst\fR,
410        int \fIdstLen\fR,
411        int *\fIsrcReadPtr\fR,
412        int *\fIdstWrotePtr\fR,
413        int *\fIdstCharsPtr\fR);
414.CE
415.PP
416The \fItoUtfProc\fR and \fIfromUtfProc\fR procedures are called by the
417\fBTcl_ExternalToUtf\fR or \fBTcl_UtfToExternal\fR family of functions to
418perform the actual conversion.  The \fIclientData\fR parameter to these
419procedures is the same as the \fIclientData\fR field specified to
420\fBTcl_CreateEncoding\fR when the encoding was created.  The remaining
421arguments to the callback procedures are the same as the arguments,
422documented at the top, to \fBTcl_ExternalToUtf\fR or
423\fBTcl_UtfToExternal\fR, with the following exceptions.  If the
424\fIsrcLen\fR argument to one of those high-level functions is negative,
425the value passed to the callback procedure will be the appropriate
426encoding-specific string length of \fIsrc\fR.  If any of the \fIsrcReadPtr\fR,
427\fIdstWrotePtr\fR, or \fIdstCharsPtr\fR arguments to one of the high-level
428functions is NULL, the corresponding value passed to the callback
429procedure will be a non-NULL location.
430.PP
431The callback procedure \fIfreeProc\fR, if non-NULL, should match the type
432\fBTcl_EncodingFreeProc\fR:
433.CS
434typedef void Tcl_EncodingFreeProc(
435        ClientData \fIclientData\fR);
436.CE
437.PP
438This \fIfreeProc\fR function is called when the encoding is deleted.  The
439\fIclientData\fR parameter is the same as the \fIclientData\fR field
440specified to \fBTcl_CreateEncoding\fR when the encoding was created. 
441.PP
442.VS 8.5
443\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR
444are called to access and set the list of filesystem directories searched
445for encoding data files. 
446.PP
447The value returned by \fBTcl_GetEncodingSearchPath\fR
448is the value stored by the last successful call to
449\fBTcl_SetEncodingSearchPath\fR.  If no calls to
450\fBTcl_SetEncodingSearchPath\fR have occurred, Tcl will compute an initial
451value based on the environment.  There is one encoding search path for the
452entire process, shared by all threads in the process.
453.PP
454\fBTcl_SetEncodingSearchPath\fR stores \fIsearchPath\fR and returns
455\fBTCL_OK\fR, unless \fIsearchPath\fR is not a valid Tcl list, which
456causes \fBTCL_ERROR\fR to be returned.  The elements of \fIsearchPath\fR
457are not verified as existing readable filesystem directories.  When
458searching for encoding data files takes place, any non-existent or
459non-readable filesystem directories on the \fIsearchPath\fR are silently
460ignored.
461.PP
462\fBTcl_GetDefaultEncodingDir\fR and \fBTcl_SetDefaultEncodingDir\fR
463are obsolete interfaces best replaced with calls to
464\fBTcl_GetEncodingSearchPath\fR and \fBTcl_SetEncodingSearchPath\fR.
465They are called to access and set the first element of the \fIsearchPath\fR
466list.  Since Tcl searches \fIsearchPath\fR for encoding data files in
467list order, these routines establish the
468.QW default
469directory in which to find encoding data files.
470.VE 8.5
471.SH "ENCODING FILES"
472Space would prohibit precompiling into Tcl every possible encoding
473algorithm, so many encodings are stored on disk as dynamically-loadable
474encoding files.  This behavior also allows the user to create additional
475encoding files that can be loaded using the same mechanism.  These
476encoding files contain information about the tables and/or escape
477sequences used to map between an external encoding and Unicode.  The
478external encoding may consist of single-byte, multi-byte, or double-byte
479characters. 
480.PP
481Each dynamically-loadable encoding is represented as a text file.  The
482initial line of the file, beginning with a
483.QW #
484symbol, is a comment
485that provides a human-readable description of the file.  The next line
486identifies the type of encoding file.  It can be one of the following
487letters:
488.IP "[1] \fBS\fR"
489A single-byte encoding, where one character is always one byte long in the
490encoding.  An example is \fBiso8859-1\fR, used by many European languages.
491.IP "[2] \fBD\fR"
492A double-byte encoding, where one character is always two bytes long in the
493encoding.  An example is \fBbig5\fR, used for Chinese text.
494.IP "[3] \fBM\fR"
495A multi-byte encoding, where one character may be either one or two bytes long.
496Certain bytes are lead bytes, indicating that another byte must follow
497and that together the two bytes represent one character.  Other bytes are not
498lead bytes and represent themselves.  An example is \fBshiftjis\fR, used by
499many Japanese computers.
500.IP "[4] \fBE\fR"
501An escape-sequence encoding, specifying that certain sequences of bytes
502do not represent characters, but commands that describe how following bytes
503should be interpreted. 
504.PP
505The rest of the lines in the file depend on the type. 
506.PP
507Cases [1], [2], and [3] are collectively referred to as table-based encoding
508files.  The lines in a table-based encoding file are in the same
509format as this example taken from the \fBshiftjis\fR encoding (this is not
510the complete file):
511.CS
512# Encoding file: shiftjis, multi-byte
513M
514003F 0 40
51500
5160000000100020003000400050006000700080009000A000B000C000D000E000F
5170010001100120013001400150016001700180019001A001B001C001D001E001F
5180020002100220023002400250026002700280029002A002B002C002D002E002F
5190030003100320033003400350036003700380039003A003B003C003D003E003F
5200040004100420043004400450046004700480049004A004B004C004D004E004F
5210050005100520053005400550056005700580059005A005B005C005D005E005F
5220060006100620063006400650066006700680069006A006B006C006D006E006F
5230070007100720073007400750076007700780079007A007B007C007D203E007F
5240080000000000000000000000000000000000000000000000000000000000000
5250000000000000000000000000000000000000000000000000000000000000000
5260000FF61FF62FF63FF64FF65FF66FF67FF68FF69FF6AFF6BFF6CFF6DFF6EFF6F
527FF70FF71FF72FF73FF74FF75FF76FF77FF78FF79FF7AFF7BFF7CFF7DFF7EFF7F
528FF80FF81FF82FF83FF84FF85FF86FF87FF88FF89FF8AFF8BFF8CFF8DFF8EFF8F
529FF90FF91FF92FF93FF94FF95FF96FF97FF98FF99FF9AFF9BFF9CFF9DFF9EFF9F
5300000000000000000000000000000000000000000000000000000000000000000
5310000000000000000000000000000000000000000000000000000000000000000
53281
5330000000000000000000000000000000000000000000000000000000000000000
5340000000000000000000000000000000000000000000000000000000000000000
5350000000000000000000000000000000000000000000000000000000000000000
5360000000000000000000000000000000000000000000000000000000000000000
537300030013002FF0CFF0E30FBFF1AFF1BFF1FFF01309B309C00B4FF4000A8FF3E
538FFE3FF3F30FD30FE309D309E30034EDD30053006300730FC20152010FF0F005C
539301C2016FF5C2026202520182019201C201DFF08FF0930143015FF3BFF3DFF5B
540FF5D30083009300A300B300C300D300E300F30103011FF0B221200B100D70000
54100F7FF1D2260FF1CFF1E22662267221E22342642264000B0203220332103FFE5
542FF0400A200A3FF05FF03FF06FF0AFF2000A72606260525CB25CF25CE25C725C6
54325A125A025B325B225BD25BC203B301221922190219121933013000000000000
544000000000000000000000000000000002208220B2286228722822283222A2229
545000000000000000000000000000000002227222800AC21D221D4220022030000
5460000000000000000000000000000000000000000222022A52312220222072261
5472252226A226B221A223D221D2235222B222C0000000000000000000000000000
548212B2030266F266D266A2020202100B6000000000000000025EF000000000000
549.CE
550.PP
551The third line of the file is three numbers.  The first number is the
552fallback character (in base 16) to use when converting from UTF-8 to this
553encoding.  The second number is a \fB1\fR if this file represents the
554encoding for a symbol font, or \fB0\fR otherwise.  The last number (in base
55510) is how many pages of data follow. 
556.PP
557Subsequent lines in the example above are pages that describe how to map
558from the encoding into 2-byte Unicode.  The first line in a page identifies
559the page number.  Following it are 256 double-byte numbers, arranged as 16
560rows of 16 numbers.  Given a character in the encoding, the high byte of
561that character is used to select which page, and the low byte of that
562character is used as an index to select one of the double-byte numbers in
563that page \- the value obtained being the corresponding Unicode character.
564By examination of the example above, one can see that the characters 0x7E
565and 0x8163 in \fBshiftjis\fR map to 203E and 2026 in Unicode, respectively.
566.PP
567Following the first page will be all the other pages, each in the same
568format as the first: one number identifying the page followed by 256
569double-byte Unicode characters.  If a character in the encoding maps to the
570Unicode character 0000, it means that the character does not actually exist.
571If all characters on a page would map to 0000, that page can be omitted.
572.PP
573Case [4] is the escape-sequence encoding file.  The lines in this type of
574file are in the same format as this example taken from the \fBiso2022-jp\fR
575encoding:
576.CS
577.ta 1.5i
578# Encoding file: iso2022-jp, escape-driven
579E
580init            {}
581final           {}
582iso8859-1       \ex1b(B
583jis0201         \ex1b(J
584jis0208         \ex1b$@
585jis0208         \ex1b$B
586jis0212         \ex1b$(D
587gb2312          \ex1b$A
588ksc5601         \ex1b$(C
589.CE
590.PP
591In the file, the first column represents an option and the second column
592is the associated value.  \fBinit\fR is a string to emit or expect before
593the first character is converted, while \fBfinal\fR is a string to emit
594or expect after the last character.  All other options are names of
595table-based encodings; the associated value is the escape-sequence that
596marks that encoding.  Tcl syntax is used for the values; in the above
597example, for instance,
598.QW \fB{}\fR
599represents the empty string and
600.QW \fB\ex1b\fR
601represents character 27.
602.PP
603When \fBTcl_GetEncoding\fR encounters an encoding \fIname\fR that has not
604been loaded, it attempts to load an encoding file called \fIname\fB.enc\fR
605from the \fBencoding\fR subdirectory of each directory that Tcl searches
606for its script library.  If the encoding file exists, but is
607malformed, an error message will be left in \fIinterp\fR.
608.SH KEYWORDS
609utf, encoding, convert
Note: See TracBrowser for help on using the repository browser.