| 1 | # uniParse.tcl -- |
|---|
| 2 | # |
|---|
| 3 | # This program parses the UnicodeData file and generates the |
|---|
| 4 | # corresponding tclUniData.c file with compressed character |
|---|
| 5 | # data tables. The input to this program should be the latest |
|---|
| 6 | # UnicodeData file from: |
|---|
| 7 | # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt |
|---|
| 8 | # |
|---|
| 9 | # Copyright (c) 1998-1999 by Scriptics Corporation. |
|---|
| 10 | # All rights reserved. |
|---|
| 11 | # |
|---|
| 12 | # RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $ |
|---|
| 13 | |
|---|
| 14 | |
|---|
| 15 | namespace eval uni { |
|---|
| 16 | set shift 5; # number of bits of data within a page |
|---|
| 17 | # This value can be adjusted to find the |
|---|
| 18 | # best split to minimize table size |
|---|
| 19 | |
|---|
| 20 | variable pMap; # map from page to page index, each entry is |
|---|
| 21 | # an index into the pages table, indexed by |
|---|
| 22 | # page number |
|---|
| 23 | variable pages; # map from page index to page info, each |
|---|
| 24 | # entry is a list of indices into the groups |
|---|
| 25 | # table, the list is indexed by the offset |
|---|
| 26 | variable groups; # list of character info values, indexed by |
|---|
| 27 | # group number, initialized with the |
|---|
| 28 | # unassigned character group |
|---|
| 29 | |
|---|
| 30 | variable categories { |
|---|
| 31 | Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp |
|---|
| 32 | Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So |
|---|
| 33 | }; # Ordered list of character categories, must |
|---|
| 34 | # match the enumeration in the header file. |
|---|
| 35 | |
|---|
| 36 | variable titleCount 0; # Count of the number of title case |
|---|
| 37 | # characters. This value is used in the |
|---|
| 38 | # regular expression code to allocate enough |
|---|
| 39 | # space for the title case variants. |
|---|
| 40 | } |
|---|
| 41 | |
|---|
| 42 | proc uni::getValue {items index} { |
|---|
| 43 | variable categories |
|---|
| 44 | variable titleCount |
|---|
| 45 | |
|---|
| 46 | # Extract character info |
|---|
| 47 | |
|---|
| 48 | set category [lindex $items 2] |
|---|
| 49 | if {[scan [lindex $items 12] %4x toupper] == 1} { |
|---|
| 50 | set toupper [expr {$index - $toupper}] |
|---|
| 51 | } else { |
|---|
| 52 | set toupper {} |
|---|
| 53 | } |
|---|
| 54 | if {[scan [lindex $items 13] %4x tolower] == 1} { |
|---|
| 55 | set tolower [expr {$tolower - $index}] |
|---|
| 56 | } else { |
|---|
| 57 | set tolower {} |
|---|
| 58 | } |
|---|
| 59 | if {[scan [lindex $items 14] %4x totitle] == 1} { |
|---|
| 60 | set totitle [expr {$index - $totitle}] |
|---|
| 61 | } else { |
|---|
| 62 | set totitle {} |
|---|
| 63 | } |
|---|
| 64 | |
|---|
| 65 | set categoryIndex [lsearch -exact $categories $category] |
|---|
| 66 | if {$categoryIndex < 0} { |
|---|
| 67 | puts "Unexpected character category: $index($category)" |
|---|
| 68 | set categoryIndex 0 |
|---|
| 69 | } elseif {$category == "Lt"} { |
|---|
| 70 | incr titleCount |
|---|
| 71 | } |
|---|
| 72 | |
|---|
| 73 | return "$categoryIndex,$toupper,$tolower,$totitle" |
|---|
| 74 | } |
|---|
| 75 | |
|---|
| 76 | proc uni::getGroup {value} { |
|---|
| 77 | variable groups |
|---|
| 78 | |
|---|
| 79 | set gIndex [lsearch -exact $groups $value] |
|---|
| 80 | if {$gIndex == -1} { |
|---|
| 81 | set gIndex [llength $groups] |
|---|
| 82 | lappend groups $value |
|---|
| 83 | } |
|---|
| 84 | return $gIndex |
|---|
| 85 | } |
|---|
| 86 | |
|---|
| 87 | proc uni::addPage {info} { |
|---|
| 88 | variable pMap |
|---|
| 89 | variable pages |
|---|
| 90 | |
|---|
| 91 | set pIndex [lsearch -exact $pages $info] |
|---|
| 92 | if {$pIndex == -1} { |
|---|
| 93 | set pIndex [llength $pages] |
|---|
| 94 | lappend pages $info |
|---|
| 95 | } |
|---|
| 96 | lappend pMap $pIndex |
|---|
| 97 | return |
|---|
| 98 | } |
|---|
| 99 | |
|---|
| 100 | proc uni::buildTables {data} { |
|---|
| 101 | variable shift |
|---|
| 102 | |
|---|
| 103 | variable pMap {} |
|---|
| 104 | variable pages {} |
|---|
| 105 | variable groups {{0,,,}} |
|---|
| 106 | set info {} ;# temporary page info |
|---|
| 107 | |
|---|
| 108 | set mask [expr {(1 << $shift) - 1}] |
|---|
| 109 | |
|---|
| 110 | set next 0 |
|---|
| 111 | |
|---|
| 112 | foreach line [split $data \n] { |
|---|
| 113 | if {$line == ""} { |
|---|
| 114 | set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n" |
|---|
| 115 | } |
|---|
| 116 | |
|---|
| 117 | set items [split $line \;] |
|---|
| 118 | |
|---|
| 119 | scan [lindex $items 0] %4x index |
|---|
| 120 | set index [format 0x%0.4x $index] |
|---|
| 121 | |
|---|
| 122 | set gIndex [getGroup [getValue $items $index]] |
|---|
| 123 | |
|---|
| 124 | # Since the input table omits unassigned characters, these will |
|---|
| 125 | # show up as gaps in the index sequence. There are a few special cases |
|---|
| 126 | # where the gaps correspond to a uniform block of assigned characters. |
|---|
| 127 | # These are indicated as such in the character name. |
|---|
| 128 | |
|---|
| 129 | # Enter all unassigned characters up to the current character. |
|---|
| 130 | if {($index > $next) \ |
|---|
| 131 | && ![regexp "Last>$" [lindex $items 1]]} { |
|---|
| 132 | for {} {$next < $index} {incr next} { |
|---|
| 133 | lappend info 0 |
|---|
| 134 | if {($next & $mask) == $mask} { |
|---|
| 135 | addPage $info |
|---|
| 136 | set info {} |
|---|
| 137 | } |
|---|
| 138 | } |
|---|
| 139 | } |
|---|
| 140 | |
|---|
| 141 | # Enter all assigned characters up to the current character |
|---|
| 142 | for {set i $next} {$i <= $index} {incr i} { |
|---|
| 143 | # Split character index into offset and page number |
|---|
| 144 | set offset [expr {$i & $mask}] |
|---|
| 145 | set page [expr {($i >> $shift)}] |
|---|
| 146 | |
|---|
| 147 | # Add the group index to the info for the current page |
|---|
| 148 | lappend info $gIndex |
|---|
| 149 | |
|---|
| 150 | # If this is the last entry in the page, add the page |
|---|
| 151 | if {$offset == $mask} { |
|---|
| 152 | addPage $info |
|---|
| 153 | set info {} |
|---|
| 154 | } |
|---|
| 155 | } |
|---|
| 156 | set next [expr {$index + 1}] |
|---|
| 157 | } |
|---|
| 158 | return |
|---|
| 159 | } |
|---|
| 160 | |
|---|
| 161 | proc uni::main {} { |
|---|
| 162 | global argc argv0 argv |
|---|
| 163 | variable pMap |
|---|
| 164 | variable pages |
|---|
| 165 | variable groups |
|---|
| 166 | variable shift |
|---|
| 167 | variable titleCount |
|---|
| 168 | |
|---|
| 169 | if {$argc != 2} { |
|---|
| 170 | puts stderr "\nusage: $argv0 <datafile> <outdir>\n" |
|---|
| 171 | exit 1 |
|---|
| 172 | } |
|---|
| 173 | set f [open [lindex $argv 0] r] |
|---|
| 174 | set data [read $f] |
|---|
| 175 | close $f |
|---|
| 176 | |
|---|
| 177 | buildTables $data |
|---|
| 178 | puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" |
|---|
| 179 | set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] |
|---|
| 180 | puts "shift = 6, space = $size" |
|---|
| 181 | puts "title case count = $titleCount" |
|---|
| 182 | |
|---|
| 183 | set f [open [file join [lindex $argv 1] tclUniData.c] w] |
|---|
| 184 | fconfigure $f -translation lf |
|---|
| 185 | puts $f "/* |
|---|
| 186 | * tclUniData.c -- |
|---|
| 187 | * |
|---|
| 188 | * Declarations of Unicode character information tables. This file is |
|---|
| 189 | * automatically generated by the tools/uniParse.tcl script. Do not |
|---|
| 190 | * modify this file by hand. |
|---|
| 191 | * |
|---|
| 192 | * Copyright (c) 1998 by Scriptics Corporation. |
|---|
| 193 | * All rights reserved. |
|---|
| 194 | * |
|---|
| 195 | * RCS: @(#) \$Id\$ |
|---|
| 196 | */ |
|---|
| 197 | |
|---|
| 198 | /* |
|---|
| 199 | * A 16-bit Unicode character is split into two parts in order to index |
|---|
| 200 | * into the following tables. The lower OFFSET_BITS comprise an offset |
|---|
| 201 | * into a page of characters. The upper bits comprise the page number. |
|---|
| 202 | */ |
|---|
| 203 | |
|---|
| 204 | #define OFFSET_BITS $shift |
|---|
| 205 | |
|---|
| 206 | /* |
|---|
| 207 | * The pageMap is indexed by page number and returns an alternate page number |
|---|
| 208 | * that identifies a unique page of characters. Many Unicode characters map |
|---|
| 209 | * to the same alternate page number. |
|---|
| 210 | */ |
|---|
| 211 | |
|---|
| 212 | static unsigned char pageMap\[\] = {" |
|---|
| 213 | set line " " |
|---|
| 214 | set last [expr {[llength $pMap] - 1}] |
|---|
| 215 | for {set i 0} {$i <= $last} {incr i} { |
|---|
| 216 | append line [lindex $pMap $i] |
|---|
| 217 | if {$i != $last} { |
|---|
| 218 | append line ", " |
|---|
| 219 | } |
|---|
| 220 | if {[string length $line] > 70} { |
|---|
| 221 | puts $f $line |
|---|
| 222 | set line " " |
|---|
| 223 | } |
|---|
| 224 | } |
|---|
| 225 | puts $f $line |
|---|
| 226 | puts $f "}; |
|---|
| 227 | |
|---|
| 228 | /* |
|---|
| 229 | * The groupMap is indexed by combining the alternate page number with |
|---|
| 230 | * the page offset and returns a group number that identifies a unique |
|---|
| 231 | * set of character attributes. |
|---|
| 232 | */ |
|---|
| 233 | |
|---|
| 234 | static unsigned char groupMap\[\] = {" |
|---|
| 235 | set line " " |
|---|
| 236 | set lasti [expr {[llength $pages] - 1}] |
|---|
| 237 | for {set i 0} {$i <= $lasti} {incr i} { |
|---|
| 238 | set page [lindex $pages $i] |
|---|
| 239 | set lastj [expr {[llength $page] - 1}] |
|---|
| 240 | for {set j 0} {$j <= $lastj} {incr j} { |
|---|
| 241 | append line [lindex $page $j] |
|---|
| 242 | if {$j != $lastj || $i != $lasti} { |
|---|
| 243 | append line ", " |
|---|
| 244 | } |
|---|
| 245 | if {[string length $line] > 70} { |
|---|
| 246 | puts $f $line |
|---|
| 247 | set line " " |
|---|
| 248 | } |
|---|
| 249 | } |
|---|
| 250 | } |
|---|
| 251 | puts $f $line |
|---|
| 252 | puts $f "}; |
|---|
| 253 | |
|---|
| 254 | /* |
|---|
| 255 | * Each group represents a unique set of character attributes. The attributes |
|---|
| 256 | * are encoded into a 32-bit value as follows: |
|---|
| 257 | * |
|---|
| 258 | * Bits 0-4 Character category: see the constants listed below. |
|---|
| 259 | * |
|---|
| 260 | * Bits 5-7 Case delta type: 000 = identity |
|---|
| 261 | * 010 = add delta for lower |
|---|
| 262 | * 011 = add delta for lower, add 1 for title |
|---|
| 263 | * 100 = sutract delta for title/upper |
|---|
| 264 | * 101 = sub delta for upper, sub 1 for title |
|---|
| 265 | * 110 = sub delta for upper, add delta for lower |
|---|
| 266 | * |
|---|
| 267 | * Bits 8-21 Reserved for future use. |
|---|
| 268 | * |
|---|
| 269 | * Bits 22-31 Case delta: delta for case conversions. This should be the |
|---|
| 270 | * highest field so we can easily sign extend. |
|---|
| 271 | */ |
|---|
| 272 | |
|---|
| 273 | static int groups\[\] = {" |
|---|
| 274 | set line " " |
|---|
| 275 | set last [expr {[llength $groups] - 1}] |
|---|
| 276 | for {set i 0} {$i <= $last} {incr i} { |
|---|
| 277 | foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {} |
|---|
| 278 | |
|---|
| 279 | # Compute the case conversion type and delta |
|---|
| 280 | |
|---|
| 281 | if {$totitle != ""} { |
|---|
| 282 | if {$totitle == $toupper} { |
|---|
| 283 | # subtract delta for title or upper |
|---|
| 284 | set case 4 |
|---|
| 285 | set delta $toupper |
|---|
| 286 | } elseif {$toupper != ""} { |
|---|
| 287 | # subtract delta for upper, subtract 1 for title |
|---|
| 288 | set case 5 |
|---|
| 289 | set delta $toupper |
|---|
| 290 | } else { |
|---|
| 291 | # add delta for lower, add 1 for title |
|---|
| 292 | set case 3 |
|---|
| 293 | set delta $tolower |
|---|
| 294 | } |
|---|
| 295 | } elseif {$toupper != ""} { |
|---|
| 296 | # subtract delta for upper, add delta for lower |
|---|
| 297 | set case 6 |
|---|
| 298 | set delta $toupper |
|---|
| 299 | } elseif {$tolower != ""} { |
|---|
| 300 | # add delta for lower |
|---|
| 301 | set case 2 |
|---|
| 302 | set delta $tolower |
|---|
| 303 | } else { |
|---|
| 304 | # noop |
|---|
| 305 | set case 0 |
|---|
| 306 | set delta 0 |
|---|
| 307 | } |
|---|
| 308 | |
|---|
| 309 | set val [expr {($delta << 22) | ($case << 5) | $type}] |
|---|
| 310 | |
|---|
| 311 | append line [format "%d" $val] |
|---|
| 312 | if {$i != $last} { |
|---|
| 313 | append line ", " |
|---|
| 314 | } |
|---|
| 315 | if {[string length $line] > 65} { |
|---|
| 316 | puts $f $line |
|---|
| 317 | set line " " |
|---|
| 318 | } |
|---|
| 319 | } |
|---|
| 320 | puts $f $line |
|---|
| 321 | puts $f "}; |
|---|
| 322 | |
|---|
| 323 | /* |
|---|
| 324 | * The following constants are used to determine the category of a |
|---|
| 325 | * Unicode character. |
|---|
| 326 | */ |
|---|
| 327 | |
|---|
| 328 | #define UNICODE_CATEGORY_MASK 0X1F |
|---|
| 329 | |
|---|
| 330 | enum { |
|---|
| 331 | UNASSIGNED, |
|---|
| 332 | UPPERCASE_LETTER, |
|---|
| 333 | LOWERCASE_LETTER, |
|---|
| 334 | TITLECASE_LETTER, |
|---|
| 335 | MODIFIER_LETTER, |
|---|
| 336 | OTHER_LETTER, |
|---|
| 337 | NON_SPACING_MARK, |
|---|
| 338 | ENCLOSING_MARK, |
|---|
| 339 | COMBINING_SPACING_MARK, |
|---|
| 340 | DECIMAL_DIGIT_NUMBER, |
|---|
| 341 | LETTER_NUMBER, |
|---|
| 342 | OTHER_NUMBER, |
|---|
| 343 | SPACE_SEPARATOR, |
|---|
| 344 | LINE_SEPARATOR, |
|---|
| 345 | PARAGRAPH_SEPARATOR, |
|---|
| 346 | CONTROL, |
|---|
| 347 | FORMAT, |
|---|
| 348 | PRIVATE_USE, |
|---|
| 349 | SURROGATE, |
|---|
| 350 | CONNECTOR_PUNCTUATION, |
|---|
| 351 | DASH_PUNCTUATION, |
|---|
| 352 | OPEN_PUNCTUATION, |
|---|
| 353 | CLOSE_PUNCTUATION, |
|---|
| 354 | INITIAL_QUOTE_PUNCTUATION, |
|---|
| 355 | FINAL_QUOTE_PUNCTUATION, |
|---|
| 356 | OTHER_PUNCTUATION, |
|---|
| 357 | MATH_SYMBOL, |
|---|
| 358 | CURRENCY_SYMBOL, |
|---|
| 359 | MODIFIER_SYMBOL, |
|---|
| 360 | OTHER_SYMBOL |
|---|
| 361 | }; |
|---|
| 362 | |
|---|
| 363 | /* |
|---|
| 364 | * The following macros extract the fields of the character info. The |
|---|
| 365 | * GetDelta() macro is complicated because we can't rely on the C compiler |
|---|
| 366 | * to do sign extension on right shifts. |
|---|
| 367 | */ |
|---|
| 368 | |
|---|
| 369 | #define GetCaseType(info) (((info) & 0xE0) >> 5) |
|---|
| 370 | #define GetCategory(info) ((info) & 0x1F) |
|---|
| 371 | #define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22))) |
|---|
| 372 | |
|---|
| 373 | /* |
|---|
| 374 | * This macro extracts the information about a character from the |
|---|
| 375 | * Unicode character tables. |
|---|
| 376 | */ |
|---|
| 377 | |
|---|
| 378 | #define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) |
|---|
| 379 | " |
|---|
| 380 | |
|---|
| 381 | close $f |
|---|
| 382 | } |
|---|
| 383 | |
|---|
| 384 | uni::main |
|---|
| 385 | |
|---|
| 386 | return |
|---|