Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: downloads/tcl8.5.2/tools/uniParse.tcl @ 45

Last change on this file since 45 was 25, checked in by landauf, 18 years ago
added tcl to libs
File size: 9.6 KB

Line
1	# uniParse.tcl --
2	#
3	# This program parses the UnicodeData file and generates the
4	# corresponding tclUniData.c file with compressed character
5	# data tables. The input to this program should be the latest
6	# UnicodeData file from:
7	# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
8	#
9	# Copyright (c) 1998-1999 by Scriptics Corporation.
10	# All rights reserved.
11	#
12	# RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
13
14
15	namespace eval uni {
16	set shift 5; # number of bits of data within a page
17	# This value can be adjusted to find the
18	# best split to minimize table size
19
20	variable pMap; # map from page to page index, each entry is
21	# an index into the pages table, indexed by
22	# page number
23	variable pages; # map from page index to page info, each
24	# entry is a list of indices into the groups
25	# table, the list is indexed by the offset
26	variable groups; # list of character info values, indexed by
27	# group number, initialized with the
28	# unassigned character group
29
30	variable categories {
31	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
32	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
33	}; # Ordered list of character categories, must
34	# match the enumeration in the header file.
35
36	variable titleCount 0; # Count of the number of title case
37	# characters. This value is used in the
38	# regular expression code to allocate enough
39	# space for the title case variants.
40	}
41
42	proc uni::getValue {items index} {
43	variable categories
44	variable titleCount
45
46	# Extract character info
47
48	set category [lindex $items 2]
49	if {[scan [lindex $items 12] %4x toupper] == 1} {
50	set toupper [expr {$index - $toupper}]
51	} else {
52	set toupper {}
53	}
54	if {[scan [lindex $items 13] %4x tolower] == 1} {
55	set tolower [expr {$tolower - $index}]
56	} else {
57	set tolower {}
58	}
59	if {[scan [lindex $items 14] %4x totitle] == 1} {
60	set totitle [expr {$index - $totitle}]
61	} else {
62	set totitle {}
63	}
64
65	set categoryIndex [lsearch -exact $categories $category]
66	if {$categoryIndex < 0} {
67	puts "Unexpected character category: $index($category)"
68	set categoryIndex 0
69	} elseif {$category == "Lt"} {
70	incr titleCount
71	}
72
73	return "$categoryIndex,$toupper,$tolower,$totitle"
74	}
75
76	proc uni::getGroup {value} {
77	variable groups
78
79	set gIndex [lsearch -exact $groups $value]
80	if {$gIndex == -1} {
81	set gIndex [llength $groups]
82	lappend groups $value
83	}
84	return $gIndex
85	}
86
87	proc uni::addPage {info} {
88	variable pMap
89	variable pages
90
91	set pIndex [lsearch -exact $pages $info]
92	if {$pIndex == -1} {
93	set pIndex [llength $pages]
94	lappend pages $info
95	}
96	lappend pMap $pIndex
97	return
98	}
99
100	proc uni::buildTables {data} {
101	variable shift
102
103	variable pMap {}
104	variable pages {}
105	variable groups {{0,,,}}
106	set info {} ;# temporary page info
107
108	set mask [expr {(1 << $shift) - 1}]
109
110	set next 0
111
112	foreach line [split $data \n] {
113	if {$line == ""} {
114	set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
115	}
116
117	set items [split $line \;]
118
119	scan [lindex $items 0] %4x index
120	set index [format 0x%0.4x $index]
121
122	set gIndex [getGroup [getValue $items $index]]
123
124	# Since the input table omits unassigned characters, these will
125	# show up as gaps in the index sequence. There are a few special cases
126	# where the gaps correspond to a uniform block of assigned characters.
127	# These are indicated as such in the character name.
128
129	# Enter all unassigned characters up to the current character.
130	if {($index > $next) \
131	&& ![regexp "Last>$" [lindex $items 1]]} {
132	for {} {$next < $index} {incr next} {
133	lappend info 0
134	if {($next & $mask) == $mask} {
135	addPage $info
136	set info {}
137	}
138	}
139	}
140
141	# Enter all assigned characters up to the current character
142	for {set i $next} {$i <= $index} {incr i} {
143	# Split character index into offset and page number
144	set offset [expr {$i & $mask}]
145	set page [expr {($i >> $shift)}]
146
147	# Add the group index to the info for the current page
148	lappend info $gIndex
149
150	# If this is the last entry in the page, add the page
151	if {$offset == $mask} {
152	addPage $info
153	set info {}
154	}
155	}
156	set next [expr {$index + 1}]
157	}
158	return
159	}
160
161	proc uni::main {} {
162	global argc argv0 argv
163	variable pMap
164	variable pages
165	variable groups
166	variable shift
167	variable titleCount
168
169	if {$argc != 2} {
170	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
171	exit 1
172	}
173	set f [open [lindex $argv 0] r]
174	set data [read $f]
175	close $f
176
177	buildTables $data
178	puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
179	set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
180	puts "shift = 6, space = $size"
181	puts "title case count = $titleCount"
182
183	set f [open [file join [lindex $argv 1] tclUniData.c] w]
184	fconfigure $f -translation lf
185	puts $f "/*
186	* tclUniData.c --
187	*
188	* Declarations of Unicode character information tables. This file is
189	* automatically generated by the tools/uniParse.tcl script. Do not
190	* modify this file by hand.
191	*
192	* Copyright (c) 1998 by Scriptics Corporation.
193	* All rights reserved.
194	*
195	* RCS: @(#) \$Id\$
196	*/
197
198	/*
199	* A 16-bit Unicode character is split into two parts in order to index
200	* into the following tables. The lower OFFSET_BITS comprise an offset
201	* into a page of characters. The upper bits comprise the page number.
202	*/
203
204	#define OFFSET_BITS $shift
205
206	/*
207	* The pageMap is indexed by page number and returns an alternate page number
208	* that identifies a unique page of characters. Many Unicode characters map
209	* to the same alternate page number.
210	*/
211
212	static unsigned char pageMap\[\] = {"
213	set line " "
214	set last [expr {[llength $pMap] - 1}]
215	for {set i 0} {$i <= $last} {incr i} {
216	append line [lindex $pMap $i]
217	if {$i != $last} {
218	append line ", "
219	}
220	if {[string length $line] > 70} {
221	puts $f $line
222	set line " "
223	}
224	}
225	puts $f $line
226	puts $f "};
227
228	/*
229	* The groupMap is indexed by combining the alternate page number with
230	* the page offset and returns a group number that identifies a unique
231	* set of character attributes.
232	*/
233
234	static unsigned char groupMap\[\] = {"
235	set line " "
236	set lasti [expr {[llength $pages] - 1}]
237	for {set i 0} {$i <= $lasti} {incr i} {
238	set page [lindex $pages $i]
239	set lastj [expr {[llength $page] - 1}]
240	for {set j 0} {$j <= $lastj} {incr j} {
241	append line [lindex $page $j]
242	if {$j != $lastj \|\| $i != $lasti} {
243	append line ", "
244	}
245	if {[string length $line] > 70} {
246	puts $f $line
247	set line " "
248	}
249	}
250	}
251	puts $f $line
252	puts $f "};
253
254	/*
255	* Each group represents a unique set of character attributes. The attributes
256	* are encoded into a 32-bit value as follows:
257	*
258	* Bits 0-4 Character category: see the constants listed below.
259	*
260	* Bits 5-7 Case delta type: 000 = identity
261	* 010 = add delta for lower
262	* 011 = add delta for lower, add 1 for title
263	* 100 = sutract delta for title/upper
264	* 101 = sub delta for upper, sub 1 for title
265	* 110 = sub delta for upper, add delta for lower
266	*
267	* Bits 8-21 Reserved for future use.
268	*
269	* Bits 22-31 Case delta: delta for case conversions. This should be the
270	* highest field so we can easily sign extend.
271	*/
272
273	static int groups\[\] = {"
274	set line " "
275	set last [expr {[llength $groups] - 1}]
276	for {set i 0} {$i <= $last} {incr i} {
277	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
278
279	# Compute the case conversion type and delta
280
281	if {$totitle != ""} {
282	if {$totitle == $toupper} {
283	# subtract delta for title or upper
284	set case 4
285	set delta $toupper
286	} elseif {$toupper != ""} {
287	# subtract delta for upper, subtract 1 for title
288	set case 5
289	set delta $toupper
290	} else {
291	# add delta for lower, add 1 for title
292	set case 3
293	set delta $tolower
294	}
295	} elseif {$toupper != ""} {
296	# subtract delta for upper, add delta for lower
297	set case 6
298	set delta $toupper
299	} elseif {$tolower != ""} {
300	# add delta for lower
301	set case 2
302	set delta $tolower
303	} else {
304	# noop
305	set case 0
306	set delta 0
307	}
308
309	set val [expr {($delta << 22) \| ($case << 5) \| $type}]
310
311	append line [format "%d" $val]
312	if {$i != $last} {
313	append line ", "
314	}
315	if {[string length $line] > 65} {
316	puts $f $line
317	set line " "
318	}
319	}
320	puts $f $line
321	puts $f "};
322
323	/*
324	* The following constants are used to determine the category of a
325	* Unicode character.
326	*/
327
328	#define UNICODE_CATEGORY_MASK 0X1F
329
330	enum {
331	UNASSIGNED,
332	UPPERCASE_LETTER,
333	LOWERCASE_LETTER,
334	TITLECASE_LETTER,
335	MODIFIER_LETTER,
336	OTHER_LETTER,
337	NON_SPACING_MARK,
338	ENCLOSING_MARK,
339	COMBINING_SPACING_MARK,
340	DECIMAL_DIGIT_NUMBER,
341	LETTER_NUMBER,
342	OTHER_NUMBER,
343	SPACE_SEPARATOR,
344	LINE_SEPARATOR,
345	PARAGRAPH_SEPARATOR,
346	CONTROL,
347	FORMAT,
348	PRIVATE_USE,
349	SURROGATE,
350	CONNECTOR_PUNCTUATION,
351	DASH_PUNCTUATION,
352	OPEN_PUNCTUATION,
353	CLOSE_PUNCTUATION,
354	INITIAL_QUOTE_PUNCTUATION,
355	FINAL_QUOTE_PUNCTUATION,
356	OTHER_PUNCTUATION,
357	MATH_SYMBOL,
358	CURRENCY_SYMBOL,
359	MODIFIER_SYMBOL,
360	OTHER_SYMBOL
361	};
362
363	/*
364	* The following macros extract the fields of the character info. The
365	* GetDelta() macro is complicated because we can't rely on the C compiler
366	* to do sign extension on right shifts.
367	*/
368
369	#define GetCaseType(info) (((info) & 0xE0) >> 5)
370	#define GetCategory(info) ((info) & 0x1F)
371	#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
372
373	/*
374	* This macro extracts the information about a character from the
375	* Unicode character tables.
376	*/
377
378	#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) \| ((ch) & ((1 << OFFSET_BITS)-1))\]\])
379	"
380
381	close $f
382	}
383
384	uni::main
385
386	return

Note: See TracBrowser for help on using the repository browser.

Download in other formats: