Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: downloads/tcl8.5.2/tools/uniParse.tcl @ 45

Last change on this file since 45 was 25, checked in by landauf, 18 years ago
added tcl to libs
File size: 9.6 KB

Rev	Line
[25]	1	# uniParse.tcl --
	2	#
	3	# This program parses the UnicodeData file and generates the
	4	# corresponding tclUniData.c file with compressed character
	5	# data tables. The input to this program should be the latest
	6	# UnicodeData file from:
	7	# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
	8	#
	9	# Copyright (c) 1998-1999 by Scriptics Corporation.
	10	# All rights reserved.
	11	#
	12	# RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
	13
	14
	# Namespace holding the state shared by the table-building procedures.
	namespace eval uni {
		# Number of bits of data within a page.  This value can be adjusted
		# to find the best split to minimize table size.  It is declared
		# with [variable] rather than [set] so that it is always created in
		# this namespace, even when a global variable named "shift" exists.
		variable shift 5

		# Map from page to page index; each entry is an index into the
		# pages table, indexed by page number.
		variable pMap

		# Map from page index to page info; each entry is a list of indices
		# into the groups table, the list is indexed by the offset.
		variable pages

		# List of character info values, indexed by group number,
		# initialized with the unassigned character group.
		variable groups

		# Ordered list of character categories; must match the enumeration
		# in the header file.
		variable categories {
			Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
			Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
		}

		# Count of the number of title case characters.  This value is used
		# in the regular expression code to allocate enough space for the
		# title case variants.
		variable titleCount 0
	}
	41
	# uni::getValue --
	#
	#	Compute the attribute string for one character described by a
	#	line of the UnicodeData file.
	#
	# Arguments:
	#	items	List of the semicolon-separated fields of the line.
	#	index	Code point of the character.
	#
	# Results:
	#	Returns the string "categoryIndex,toupper,tolower,totitle".  The
	#	first element is the position of the character's category in the
	#	categories list (0 if the category is unknown).  The other three
	#	are case-conversion deltas relative to index, or empty when the
	#	corresponding field holds no hexadecimal mapping.
	#
	# Side effects:
	#	Increments titleCount for every character of category "Lt" and
	#	prints a diagnostic to stdout for an unknown category.

	proc uni::getValue {items index} {
		variable categories
		variable titleCount

		# Extract character info

		set category [lindex $items 2]

		# Upper and title deltas are stored as (index - mapping); the lower
		# delta is stored as (mapping - index).
		if {[scan [lindex $items 12] %4x toupper] == 1} {
			set toupper [expr {$index - $toupper}]
		} else {
			set toupper {}
		}
		if {[scan [lindex $items 13] %4x tolower] == 1} {
			set tolower [expr {$tolower - $index}]
		} else {
			set tolower {}
		}
		if {[scan [lindex $items 14] %4x totitle] == 1} {
			set totitle [expr {$index - $totitle}]
		} else {
			set totitle {}
		}

		set categoryIndex [lsearch -exact $categories $category]
		if {$categoryIndex < 0} {
			# ${index} must be braced: "$index(...)" would be parsed as an
			# array element reference and raise an error on the scalar.
			puts "Unexpected character category: ${index}($category)"
			set categoryIndex 0
		} elseif {$category eq "Lt"} {
			incr titleCount
		}

		return "$categoryIndex,$toupper,$tolower,$totitle"
	}
	75
	# uni::getGroup --
	#
	#	Look up a character info value in the groups list, appending it
	#	when it is not yet present.
	#
	# Arguments:
	#	value	Character info value to look up.
	#
	# Results:
	#	Returns the index of value within the groups list.

	proc uni::getGroup {value} {
		variable groups

		set position [lsearch -exact $groups $value]
		if {$position >= 0} {
			return $position
		}

		# Not seen before: the new entry goes at the end of the list.
		lappend groups $value
		return [expr {[llength $groups] - 1}]
	}
	86
	# uni::addPage --
	#
	#	Record one completed page.  Identical pages are stored only once
	#	in the pages list; pMap always receives one more entry giving the
	#	position of the page within that list.
	#
	# Arguments:
	#	info	List of group indices making up the page.
	#
	# Results:
	#	None.

	proc uni::addPage {info} {
		variable pMap
		variable pages

		set slot [lsearch -exact $pages $info]
		if {$slot < 0} {
			# First occurrence of this page: store it at the end.
			lappend pages $info
			set slot [expr {[llength $pages] - 1}]
		}
		lappend pMap $slot
		return
	}
	99
	# uni::buildTables --
	#
	#	Parse the contents of the UnicodeData file and fill the pMap,
	#	pages and groups tables.
	#
	# Arguments:
	#	data	Complete text of the UnicodeData file.  An empty line
	#		(such as the one produced by a trailing newline) is
	#		treated as an unassigned entry for FFFF, which fills the
	#		tables up to the end of the 16-bit range.
	#
	# Results:
	#	None.  The namespace variables pMap, pages and groups are
	#	reset and rebuilt; titleCount is updated through getValue.

	proc uni::buildTables {data} {
		variable shift

		variable pMap {}
		variable pages {}
		variable groups {{0,,,}}
		set info {} ;# temporary page info

		set mask [expr {(1 << $shift) - 1}]

		set next 0

		foreach line [split $data \n] {
			if {$line == ""} {
				set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
			}

			set items [split $line \;]

			# Read the whole code point.  A "%4x" conversion would truncate
			# entries above FFFF (e.g. 10000 would be read as 1000), which
			# would rewind the running position and corrupt the tables.
			scan [lindex $items 0] %x index
			if {$index > 0xFFFF} {
				# The generated tables only cover 16-bit characters.
				continue
			}
			set index [format 0x%0.4x $index]

			set gIndex [getGroup [getValue $items $index]]

			# Since the input table omits unassigned characters, these will
			# show up as gaps in the index sequence.  There are a few special
			# cases where the gaps correspond to a uniform block of assigned
			# characters.  These are indicated as such in the character name.

			# Enter all unassigned characters up to the current character.
			if {($index > $next) \
					&& ![regexp "Last>$" [lindex $items 1]]} {
				for {} {$next < $index} {incr next} {
					lappend info 0
					if {($next & $mask) == $mask} {
						addPage $info
						set info {}
					}
				}
			}

			# Enter all assigned characters up to the current character
			for {set i $next} {$i <= $index} {incr i} {
				# Add the group index to the info for the current page
				lappend info $gIndex

				# If this is the last entry in the page, add the page
				if {($i & $mask) == $mask} {
					addPage $info
					set info {}
				}
			}
			set next [expr {$index + 1}]
		}
		return
	}
	160
	# uni::main --
	#
	#	Entry point of the script.  Reads the data file named by the
	#	first command line argument, builds the character tables and
	#	writes tclUniData.c into the directory named by the second
	#	argument.
	#
	# Arguments:
	#	None.  The global argc, argv0 and argv variables are consulted.
	#
	# Results:
	#	None.  Prints a usage message to stderr and exits with status 1
	#	unless exactly two arguments were given.  Table statistics are
	#	printed to stdout.

	proc uni::main {} {
		global argc argv0 argv
		variable pMap
		variable pages
		variable groups
		variable shift
		variable titleCount

		if {$argc != 2} {
			puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
			exit 1
		}
		set f [open [lindex $argv 0] r]
		set data [read $f]
		close $f

		buildTables $data
		puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
		set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
		# Report the shift actually in use instead of a hard-coded number.
		puts "shift = $shift, space = $size"
		puts "title case count = $titleCount"

		set f [open [file join [lindex $argv 1] tclUniData.c] w]
		fconfigure $f -translation lf
		puts $f "/*
* tclUniData.c --
*
* Declarations of Unicode character information tables. This file is
* automatically generated by the tools/uniParse.tcl script. Do not
* modify this file by hand.
*
* Copyright (c) 1998 by Scriptics Corporation.
* All rights reserved.
*
* RCS: @(#) \$Id\$
*/

/*
* A 16-bit Unicode character is split into two parts in order to index
* into the following tables. The lower OFFSET_BITS comprise an offset
* into a page of characters. The upper bits comprise the page number.
*/

#define OFFSET_BITS $shift

/*
* The pageMap is indexed by page number and returns an alternate page number
* that identifies a unique page of characters. Many Unicode characters map
* to the same alternate page number.
*/

static unsigned char pageMap\[\] = {"

		# Emit the page map, starting a new output line once the current
		# one grows beyond 70 characters.
		set line " "
		set last [expr {[llength $pMap] - 1}]
		for {set i 0} {$i <= $last} {incr i} {
			append line [lindex $pMap $i]
			if {$i != $last} {
				append line ", "
			}
			if {[string length $line] > 70} {
				puts $f $line
				set line " "
			}
		}
		puts $f $line
		puts $f "};

/*
* The groupMap is indexed by combining the alternate page number with
* the page offset and returns a group number that identifies a unique
* set of character attributes.
*/

static unsigned char groupMap\[\] = {"

		# Emit every page back to back; only the very last entry of the
		# very last page is written without a trailing separator.
		set line " "
		set lasti [expr {[llength $pages] - 1}]
		for {set i 0} {$i <= $lasti} {incr i} {
			set page [lindex $pages $i]
			set lastj [expr {[llength $page] - 1}]
			for {set j 0} {$j <= $lastj} {incr j} {
				append line [lindex $page $j]
				if {$j != $lastj || $i != $lasti} {
					append line ", "
				}
				if {[string length $line] > 70} {
					puts $f $line
					set line " "
				}
			}
		}
		puts $f $line
		puts $f "};

/*
* Each group represents a unique set of character attributes. The attributes
* are encoded into a 32-bit value as follows:
*
* Bits 0-4 Character category: see the constants listed below.
*
* Bits 5-7 Case delta type: 000 = identity
* 010 = add delta for lower
* 011 = add delta for lower, add 1 for title
* 100 = sutract delta for title/upper
* 101 = sub delta for upper, sub 1 for title
* 110 = sub delta for upper, add delta for lower
*
* Bits 8-21 Reserved for future use.
*
* Bits 22-31 Case delta: delta for case conversions. This should be the
* highest field so we can easily sign extend.
*/

static int groups\[\] = {"

		set line " "
		set last [expr {[llength $groups] - 1}]
		for {set i 0} {$i <= $last} {incr i} {
			# Each group is stored as "category,toupper,tolower,totitle".
			foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}

			# Compute the case conversion type and delta

			if {$totitle != ""} {
				if {$totitle == $toupper} {
					# subtract delta for title or upper
					set case 4
					set delta $toupper
				} elseif {$toupper != ""} {
					# subtract delta for upper, subtract 1 for title
					set case 5
					set delta $toupper
				} else {
					# add delta for lower, add 1 for title
					set case 3
					set delta $tolower
				}
			} elseif {$toupper != ""} {
				# subtract delta for upper, add delta for lower
				set case 6
				set delta $toupper
			} elseif {$tolower != ""} {
				# add delta for lower
				set case 2
				set delta $tolower
			} else {
				# noop
				set case 0
				set delta 0
			}

			# Pack delta, case type and category into a single value.
			set val [expr {($delta << 22) | ($case << 5) | $type}]

			append line [format "%d" $val]
			if {$i != $last} {
				append line ", "
			}
			if {[string length $line] > 65} {
				puts $f $line
				set line " "
			}
		}
		puts $f $line
		puts $f "};

/*
* The following constants are used to determine the category of a
* Unicode character.
*/

#define UNICODE_CATEGORY_MASK 0X1F

enum {
UNASSIGNED,
UPPERCASE_LETTER,
LOWERCASE_LETTER,
TITLECASE_LETTER,
MODIFIER_LETTER,
OTHER_LETTER,
NON_SPACING_MARK,
ENCLOSING_MARK,
COMBINING_SPACING_MARK,
DECIMAL_DIGIT_NUMBER,
LETTER_NUMBER,
OTHER_NUMBER,
SPACE_SEPARATOR,
LINE_SEPARATOR,
PARAGRAPH_SEPARATOR,
CONTROL,
FORMAT,
PRIVATE_USE,
SURROGATE,
CONNECTOR_PUNCTUATION,
DASH_PUNCTUATION,
OPEN_PUNCTUATION,
CLOSE_PUNCTUATION,
INITIAL_QUOTE_PUNCTUATION,
FINAL_QUOTE_PUNCTUATION,
OTHER_PUNCTUATION,
MATH_SYMBOL,
CURRENCY_SYMBOL,
MODIFIER_SYMBOL,
OTHER_SYMBOL
};

/*
* The following macros extract the fields of the character info. The
* GetDelta() macro is complicated because we can't rely on the C compiler
* to do sign extension on right shifts.
*/

#define GetCaseType(info) (((info) & 0xE0) >> 5)
#define GetCategory(info) ((info) & 0x1F)
#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))

/*
* This macro extracts the information about a character from the
* Unicode character tables.
*/

#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
"

		close $f
	}
	383
	# Generate the tables, then stop evaluating this script.
	::uni::main

	return

Note: See TracBrowser for help on using the repository browser.

Download in other formats: