[25] | 1 | # uniParse.tcl -- |
---|
| 2 | # |
---|
| 3 | # This program parses the UnicodeData file and generates the |
---|
| 4 | # corresponding tclUniData.c file with compressed character |
---|
| 5 | # data tables. The input to this program should be the latest |
---|
| 6 | # UnicodeData file from: |
---|
| 7 | # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt |
---|
| 8 | # |
---|
| 9 | # Copyright (c) 1998-1999 by Scriptics Corporation. |
---|
| 10 | # All rights reserved. |
---|
| 11 | # |
---|
| 12 | # RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $ |
---|
| 13 | |
---|
| 14 | |
---|
| 15 | namespace eval uni { |
---|
| 16 | set shift 5; # number of bits of data within a page |
---|
| 17 | # This value can be adjusted to find the |
---|
| 18 | # best split to minimize table size |
---|
| 19 | |
---|
| 20 | variable pMap; # map from page to page index, each entry is |
---|
| 21 | # an index into the pages table, indexed by |
---|
| 22 | # page number |
---|
| 23 | variable pages; # map from page index to page info, each |
---|
| 24 | # entry is a list of indices into the groups |
---|
| 25 | # table, the list is indexed by the offset |
---|
| 26 | variable groups; # list of character info values, indexed by |
---|
| 27 | # group number, initialized with the |
---|
| 28 | # unassigned character group |
---|
| 29 | |
---|
| 30 | variable categories { |
---|
| 31 | Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp |
---|
| 32 | Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So |
---|
| 33 | }; # Ordered list of character categories, must |
---|
| 34 | # match the enumeration in the header file. |
---|
| 35 | |
---|
| 36 | variable titleCount 0; # Count of the number of title case |
---|
| 37 | # characters. This value is used in the |
---|
| 38 | # regular expression code to allocate enough |
---|
| 39 | # space for the title case variants. |
---|
| 40 | } |
---|
| 41 | |
---|
| 42 | proc uni::getValue {items index} { |
---|
| 43 | variable categories |
---|
| 44 | variable titleCount |
---|
| 45 | |
---|
| 46 | # Extract character info |
---|
| 47 | |
---|
| 48 | set category [lindex $items 2] |
---|
| 49 | if {[scan [lindex $items 12] %4x toupper] == 1} { |
---|
| 50 | set toupper [expr {$index - $toupper}] |
---|
| 51 | } else { |
---|
| 52 | set toupper {} |
---|
| 53 | } |
---|
| 54 | if {[scan [lindex $items 13] %4x tolower] == 1} { |
---|
| 55 | set tolower [expr {$tolower - $index}] |
---|
| 56 | } else { |
---|
| 57 | set tolower {} |
---|
| 58 | } |
---|
| 59 | if {[scan [lindex $items 14] %4x totitle] == 1} { |
---|
| 60 | set totitle [expr {$index - $totitle}] |
---|
| 61 | } else { |
---|
| 62 | set totitle {} |
---|
| 63 | } |
---|
| 64 | |
---|
| 65 | set categoryIndex [lsearch -exact $categories $category] |
---|
| 66 | if {$categoryIndex < 0} { |
---|
| 67 | puts "Unexpected character category: $index($category)" |
---|
| 68 | set categoryIndex 0 |
---|
| 69 | } elseif {$category == "Lt"} { |
---|
| 70 | incr titleCount |
---|
| 71 | } |
---|
| 72 | |
---|
| 73 | return "$categoryIndex,$toupper,$tolower,$totitle" |
---|
| 74 | } |
---|
| 75 | |
---|
| 76 | proc uni::getGroup {value} { |
---|
| 77 | variable groups |
---|
| 78 | |
---|
| 79 | set gIndex [lsearch -exact $groups $value] |
---|
| 80 | if {$gIndex == -1} { |
---|
| 81 | set gIndex [llength $groups] |
---|
| 82 | lappend groups $value |
---|
| 83 | } |
---|
| 84 | return $gIndex |
---|
| 85 | } |
---|
| 86 | |
---|
| 87 | proc uni::addPage {info} { |
---|
| 88 | variable pMap |
---|
| 89 | variable pages |
---|
| 90 | |
---|
| 91 | set pIndex [lsearch -exact $pages $info] |
---|
| 92 | if {$pIndex == -1} { |
---|
| 93 | set pIndex [llength $pages] |
---|
| 94 | lappend pages $info |
---|
| 95 | } |
---|
| 96 | lappend pMap $pIndex |
---|
| 97 | return |
---|
| 98 | } |
---|
| 99 | |
---|
| 100 | proc uni::buildTables {data} { |
---|
| 101 | variable shift |
---|
| 102 | |
---|
| 103 | variable pMap {} |
---|
| 104 | variable pages {} |
---|
| 105 | variable groups {{0,,,}} |
---|
| 106 | set info {} ;# temporary page info |
---|
| 107 | |
---|
| 108 | set mask [expr {(1 << $shift) - 1}] |
---|
| 109 | |
---|
| 110 | set next 0 |
---|
| 111 | |
---|
| 112 | foreach line [split $data \n] { |
---|
| 113 | if {$line == ""} { |
---|
| 114 | set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n" |
---|
| 115 | } |
---|
| 116 | |
---|
| 117 | set items [split $line \;] |
---|
| 118 | |
---|
| 119 | scan [lindex $items 0] %4x index |
---|
| 120 | set index [format 0x%0.4x $index] |
---|
| 121 | |
---|
| 122 | set gIndex [getGroup [getValue $items $index]] |
---|
| 123 | |
---|
| 124 | # Since the input table omits unassigned characters, these will |
---|
| 125 | # show up as gaps in the index sequence. There are a few special cases |
---|
| 126 | # where the gaps correspond to a uniform block of assigned characters. |
---|
| 127 | # These are indicated as such in the character name. |
---|
| 128 | |
---|
| 129 | # Enter all unassigned characters up to the current character. |
---|
| 130 | if {($index > $next) \ |
---|
| 131 | && ![regexp "Last>$" [lindex $items 1]]} { |
---|
| 132 | for {} {$next < $index} {incr next} { |
---|
| 133 | lappend info 0 |
---|
| 134 | if {($next & $mask) == $mask} { |
---|
| 135 | addPage $info |
---|
| 136 | set info {} |
---|
| 137 | } |
---|
| 138 | } |
---|
| 139 | } |
---|
| 140 | |
---|
| 141 | # Enter all assigned characters up to the current character |
---|
| 142 | for {set i $next} {$i <= $index} {incr i} { |
---|
| 143 | # Split character index into offset and page number |
---|
| 144 | set offset [expr {$i & $mask}] |
---|
| 145 | set page [expr {($i >> $shift)}] |
---|
| 146 | |
---|
| 147 | # Add the group index to the info for the current page |
---|
| 148 | lappend info $gIndex |
---|
| 149 | |
---|
| 150 | # If this is the last entry in the page, add the page |
---|
| 151 | if {$offset == $mask} { |
---|
| 152 | addPage $info |
---|
| 153 | set info {} |
---|
| 154 | } |
---|
| 155 | } |
---|
| 156 | set next [expr {$index + 1}] |
---|
| 157 | } |
---|
| 158 | return |
---|
| 159 | } |
---|
| 160 | |
---|
| 161 | proc uni::main {} { |
---|
| 162 | global argc argv0 argv |
---|
| 163 | variable pMap |
---|
| 164 | variable pages |
---|
| 165 | variable groups |
---|
| 166 | variable shift |
---|
| 167 | variable titleCount |
---|
| 168 | |
---|
| 169 | if {$argc != 2} { |
---|
| 170 | puts stderr "\nusage: $argv0 <datafile> <outdir>\n" |
---|
| 171 | exit 1 |
---|
| 172 | } |
---|
| 173 | set f [open [lindex $argv 0] r] |
---|
| 174 | set data [read $f] |
---|
| 175 | close $f |
---|
| 176 | |
---|
| 177 | buildTables $data |
---|
| 178 | puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" |
---|
| 179 | set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] |
---|
| 180 | puts "shift = 6, space = $size" |
---|
| 181 | puts "title case count = $titleCount" |
---|
| 182 | |
---|
| 183 | set f [open [file join [lindex $argv 1] tclUniData.c] w] |
---|
| 184 | fconfigure $f -translation lf |
---|
| 185 | puts $f "/* |
---|
| 186 | * tclUniData.c -- |
---|
| 187 | * |
---|
| 188 | * Declarations of Unicode character information tables. This file is |
---|
| 189 | * automatically generated by the tools/uniParse.tcl script. Do not |
---|
| 190 | * modify this file by hand. |
---|
| 191 | * |
---|
| 192 | * Copyright (c) 1998 by Scriptics Corporation. |
---|
| 193 | * All rights reserved. |
---|
| 194 | * |
---|
| 195 | * RCS: @(#) \$Id\$ |
---|
| 196 | */ |
---|
| 197 | |
---|
| 198 | /* |
---|
| 199 | * A 16-bit Unicode character is split into two parts in order to index |
---|
| 200 | * into the following tables. The lower OFFSET_BITS comprise an offset |
---|
| 201 | * into a page of characters. The upper bits comprise the page number. |
---|
| 202 | */ |
---|
| 203 | |
---|
| 204 | #define OFFSET_BITS $shift |
---|
| 205 | |
---|
| 206 | /* |
---|
| 207 | * The pageMap is indexed by page number and returns an alternate page number |
---|
| 208 | * that identifies a unique page of characters. Many Unicode characters map |
---|
| 209 | * to the same alternate page number. |
---|
| 210 | */ |
---|
| 211 | |
---|
| 212 | static unsigned char pageMap\[\] = {" |
---|
| 213 | set line " " |
---|
| 214 | set last [expr {[llength $pMap] - 1}] |
---|
| 215 | for {set i 0} {$i <= $last} {incr i} { |
---|
| 216 | append line [lindex $pMap $i] |
---|
| 217 | if {$i != $last} { |
---|
| 218 | append line ", " |
---|
| 219 | } |
---|
| 220 | if {[string length $line] > 70} { |
---|
| 221 | puts $f $line |
---|
| 222 | set line " " |
---|
| 223 | } |
---|
| 224 | } |
---|
| 225 | puts $f $line |
---|
| 226 | puts $f "}; |
---|
| 227 | |
---|
| 228 | /* |
---|
| 229 | * The groupMap is indexed by combining the alternate page number with |
---|
| 230 | * the page offset and returns a group number that identifies a unique |
---|
| 231 | * set of character attributes. |
---|
| 232 | */ |
---|
| 233 | |
---|
| 234 | static unsigned char groupMap\[\] = {" |
---|
| 235 | set line " " |
---|
| 236 | set lasti [expr {[llength $pages] - 1}] |
---|
| 237 | for {set i 0} {$i <= $lasti} {incr i} { |
---|
| 238 | set page [lindex $pages $i] |
---|
| 239 | set lastj [expr {[llength $page] - 1}] |
---|
| 240 | for {set j 0} {$j <= $lastj} {incr j} { |
---|
| 241 | append line [lindex $page $j] |
---|
| 242 | if {$j != $lastj || $i != $lasti} { |
---|
| 243 | append line ", " |
---|
| 244 | } |
---|
| 245 | if {[string length $line] > 70} { |
---|
| 246 | puts $f $line |
---|
| 247 | set line " " |
---|
| 248 | } |
---|
| 249 | } |
---|
| 250 | } |
---|
| 251 | puts $f $line |
---|
| 252 | puts $f "}; |
---|
| 253 | |
---|
| 254 | /* |
---|
| 255 | * Each group represents a unique set of character attributes. The attributes |
---|
| 256 | * are encoded into a 32-bit value as follows: |
---|
| 257 | * |
---|
| 258 | * Bits 0-4 Character category: see the constants listed below. |
---|
| 259 | * |
---|
| 260 | * Bits 5-7 Case delta type: 000 = identity |
---|
| 261 | * 010 = add delta for lower |
---|
| 262 | * 011 = add delta for lower, add 1 for title |
---|
| 263 | * 100 = sutract delta for title/upper |
---|
| 264 | * 101 = sub delta for upper, sub 1 for title |
---|
| 265 | * 110 = sub delta for upper, add delta for lower |
---|
| 266 | * |
---|
| 267 | * Bits 8-21 Reserved for future use. |
---|
| 268 | * |
---|
| 269 | * Bits 22-31 Case delta: delta for case conversions. This should be the |
---|
| 270 | * highest field so we can easily sign extend. |
---|
| 271 | */ |
---|
| 272 | |
---|
| 273 | static int groups\[\] = {" |
---|
| 274 | set line " " |
---|
| 275 | set last [expr {[llength $groups] - 1}] |
---|
| 276 | for {set i 0} {$i <= $last} {incr i} { |
---|
| 277 | foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {} |
---|
| 278 | |
---|
| 279 | # Compute the case conversion type and delta |
---|
| 280 | |
---|
| 281 | if {$totitle != ""} { |
---|
| 282 | if {$totitle == $toupper} { |
---|
| 283 | # subtract delta for title or upper |
---|
| 284 | set case 4 |
---|
| 285 | set delta $toupper |
---|
| 286 | } elseif {$toupper != ""} { |
---|
| 287 | # subtract delta for upper, subtract 1 for title |
---|
| 288 | set case 5 |
---|
| 289 | set delta $toupper |
---|
| 290 | } else { |
---|
| 291 | # add delta for lower, add 1 for title |
---|
| 292 | set case 3 |
---|
| 293 | set delta $tolower |
---|
| 294 | } |
---|
| 295 | } elseif {$toupper != ""} { |
---|
| 296 | # subtract delta for upper, add delta for lower |
---|
| 297 | set case 6 |
---|
| 298 | set delta $toupper |
---|
| 299 | } elseif {$tolower != ""} { |
---|
| 300 | # add delta for lower |
---|
| 301 | set case 2 |
---|
| 302 | set delta $tolower |
---|
| 303 | } else { |
---|
| 304 | # noop |
---|
| 305 | set case 0 |
---|
| 306 | set delta 0 |
---|
| 307 | } |
---|
| 308 | |
---|
| 309 | set val [expr {($delta << 22) | ($case << 5) | $type}] |
---|
| 310 | |
---|
| 311 | append line [format "%d" $val] |
---|
| 312 | if {$i != $last} { |
---|
| 313 | append line ", " |
---|
| 314 | } |
---|
| 315 | if {[string length $line] > 65} { |
---|
| 316 | puts $f $line |
---|
| 317 | set line " " |
---|
| 318 | } |
---|
| 319 | } |
---|
| 320 | puts $f $line |
---|
| 321 | puts $f "}; |
---|
| 322 | |
---|
| 323 | /* |
---|
| 324 | * The following constants are used to determine the category of a |
---|
| 325 | * Unicode character. |
---|
| 326 | */ |
---|
| 327 | |
---|
| 328 | #define UNICODE_CATEGORY_MASK 0X1F |
---|
| 329 | |
---|
| 330 | enum { |
---|
| 331 | UNASSIGNED, |
---|
| 332 | UPPERCASE_LETTER, |
---|
| 333 | LOWERCASE_LETTER, |
---|
| 334 | TITLECASE_LETTER, |
---|
| 335 | MODIFIER_LETTER, |
---|
| 336 | OTHER_LETTER, |
---|
| 337 | NON_SPACING_MARK, |
---|
| 338 | ENCLOSING_MARK, |
---|
| 339 | COMBINING_SPACING_MARK, |
---|
| 340 | DECIMAL_DIGIT_NUMBER, |
---|
| 341 | LETTER_NUMBER, |
---|
| 342 | OTHER_NUMBER, |
---|
| 343 | SPACE_SEPARATOR, |
---|
| 344 | LINE_SEPARATOR, |
---|
| 345 | PARAGRAPH_SEPARATOR, |
---|
| 346 | CONTROL, |
---|
| 347 | FORMAT, |
---|
| 348 | PRIVATE_USE, |
---|
| 349 | SURROGATE, |
---|
| 350 | CONNECTOR_PUNCTUATION, |
---|
| 351 | DASH_PUNCTUATION, |
---|
| 352 | OPEN_PUNCTUATION, |
---|
| 353 | CLOSE_PUNCTUATION, |
---|
| 354 | INITIAL_QUOTE_PUNCTUATION, |
---|
| 355 | FINAL_QUOTE_PUNCTUATION, |
---|
| 356 | OTHER_PUNCTUATION, |
---|
| 357 | MATH_SYMBOL, |
---|
| 358 | CURRENCY_SYMBOL, |
---|
| 359 | MODIFIER_SYMBOL, |
---|
| 360 | OTHER_SYMBOL |
---|
| 361 | }; |
---|
| 362 | |
---|
| 363 | /* |
---|
| 364 | * The following macros extract the fields of the character info. The |
---|
| 365 | * GetDelta() macro is complicated because we can't rely on the C compiler |
---|
| 366 | * to do sign extension on right shifts. |
---|
| 367 | */ |
---|
| 368 | |
---|
| 369 | #define GetCaseType(info) (((info) & 0xE0) >> 5) |
---|
| 370 | #define GetCategory(info) ((info) & 0x1F) |
---|
| 371 | #define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22))) |
---|
| 372 | |
---|
| 373 | /* |
---|
| 374 | * This macro extracts the information about a character from the |
---|
| 375 | * Unicode character tables. |
---|
| 376 | */ |
---|
| 377 | |
---|
| 378 | #define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) |
---|
| 379 | " |
---|
| 380 | |
---|
| 381 | close $f |
---|
| 382 | } |
---|
| 383 | |
---|
| 384 | uni::main |
---|
| 385 | |
---|
| 386 | return |
---|