[25] | 1 | '\" |
---|
| 2 | '\" Copyright (c) 1994 The Regents of the University of California. |
---|
| 3 | '\" Copyright (c) 1994-1996 Sun Microsystems, Inc. |
---|
| 4 | '\" Copyright (c) 1998-1999 Scriptics Corporation |
---|
| 5 | '\" |
---|
| 6 | '\" See the file "license.terms" for information on usage and redistribution |
---|
| 7 | '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. |
---|
| 8 | '\" |
---|
| 9 | '\" RCS: @(#) $Id: RegExp.3,v 1.28 2007/12/13 15:22:31 dgp Exp $ |
---|
| 10 | '\" |
---|
| 11 | .so man.macros |
---|
| 12 | .TH Tcl_RegExpMatch 3 8.1 Tcl "Tcl Library Procedures" |
---|
| 13 | .BS |
---|
| 14 | .SH NAME |
---|
| 15 | Tcl_RegExpMatch, Tcl_RegExpCompile, Tcl_RegExpExec, Tcl_RegExpRange, Tcl_GetRegExpFromObj, Tcl_RegExpMatchObj, Tcl_RegExpExecObj, Tcl_RegExpGetInfo \- Pattern matching with regular expressions |
---|
| 16 | .SH SYNOPSIS |
---|
| 17 | .nf |
---|
| 18 | \fB#include <tcl.h>\fR |
---|
| 19 | .sp |
---|
| 20 | int |
---|
| 21 | \fBTcl_RegExpMatchObj\fR(\fIinterp\fR, \fItextObj\fR, \fIpatObj\fR) |
---|
| 22 | .sp |
---|
| 23 | int |
---|
| 24 | \fBTcl_RegExpMatch\fR(\fIinterp\fR, \fItext\fR, \fIpattern\fR) |
---|
| 25 | .sp |
---|
| 26 | Tcl_RegExp |
---|
| 27 | \fBTcl_RegExpCompile\fR(\fIinterp\fR, \fIpattern\fR) |
---|
| 28 | .sp |
---|
| 29 | int |
---|
| 30 | \fBTcl_RegExpExec\fR(\fIinterp\fR, \fIregexp\fR, \fItext\fR, \fIstart\fR) |
---|
| 31 | .sp |
---|
| 32 | void |
---|
| 33 | \fBTcl_RegExpRange\fR(\fIregexp\fR, \fIindex\fR, \fIstartPtr\fR, \fIendPtr\fR) |
---|
| 34 | .sp |
---|
| 35 | Tcl_RegExp |
---|
| 36 | \fBTcl_GetRegExpFromObj\fR(\fIinterp\fR, \fIpatObj\fR, \fIcflags\fR) |
---|
| 37 | .sp |
---|
| 38 | int |
---|
| 39 | \fBTcl_RegExpExecObj\fR(\fIinterp\fR, \fIregexp\fR, \fItextObj\fR, \fIoffset\fR, \fInmatches\fR, \fIeflags\fR) |
---|
| 40 | .sp |
---|
| 41 | void |
---|
| 42 | \fBTcl_RegExpGetInfo\fR(\fIregexp\fR, \fIinfoPtr\fR) |
---|
| 43 | .fi |
---|
| 44 | .SH ARGUMENTS |
---|
| 45 | .AS Tcl_RegExpInfo *interp in/out |
---|
| 46 | .AP Tcl_Interp *interp in |
---|
| 47 | Tcl interpreter to use for error reporting. The interpreter may be |
---|
| 48 | NULL if no error reporting is desired. |
---|
| 49 | .AP Tcl_Obj *textObj in/out |
---|
| 50 | Refers to the object from which to get the text to search. The |
---|
| 51 | internal representation of the object may be converted to a form that |
---|
| 52 | can be efficiently searched. |
---|
| 53 | .AP Tcl_Obj *patObj in/out |
---|
| 54 | Refers to the object from which to get a regular expression. The |
---|
| 55 | compiled regular expression is cached in the object. |
---|
| 56 | .AP char *text in |
---|
| 57 | Text to search for a match with a regular expression. |
---|
| 58 | .AP "const char" *pattern in |
---|
| 59 | String in the form of a regular expression pattern. |
---|
| 60 | .AP Tcl_RegExp regexp in |
---|
| 61 | Compiled regular expression. Must have been returned previously |
---|
| 62 | by \fBTcl_GetRegExpFromObj\fR or \fBTcl_RegExpCompile\fR. |
---|
| 63 | .AP char *start in |
---|
| 64 | If \fItext\fR is just a portion of some other string, this argument |
---|
| 65 | identifies the beginning of the larger string. |
---|
| 66 | If it is not the same as \fItext\fR, then no |
---|
| 67 | .QW \fB^\fR |
---|
| 68 | matches will be allowed. |
---|
| 69 | .AP int index in |
---|
| 70 | Specifies which range is desired: 0 means the range of the entire |
---|
| 71 | match, 1 or greater means the range that matched a parenthesized |
---|
| 72 | sub-expression. |
---|
| 73 | .AP "const char" **startPtr out |
---|
| 74 | The address of the first character in the range is stored here, or |
---|
| 75 | NULL if there is no such range. |
---|
| 76 | .AP "const char" **endPtr out |
---|
| 77 | The address of the character just after the last one in the range |
---|
| 78 | is stored here, or NULL if there is no such range. |
---|
| 79 | .AP int cflags in |
---|
| 80 | OR-ed combination of the compilation flags \fBTCL_REG_ADVANCED\fR, |
---|
| 81 | \fBTCL_REG_EXTENDED\fR, \fBTCL_REG_BASIC\fR, \fBTCL_REG_EXPANDED\fR, |
---|
| 82 | \fBTCL_REG_QUOTE\fR, \fBTCL_REG_NOCASE\fR, \fBTCL_REG_NEWLINE\fR, |
---|
| 83 | \fBTCL_REG_NLSTOP\fR, \fBTCL_REG_NLANCH\fR, \fBTCL_REG_NOSUB\fR, and |
---|
| 84 | \fBTCL_REG_CANMATCH\fR. See below for more information. |
---|
| 85 | .AP int offset in |
---|
| 86 | The character offset into the text where matching should begin. |
---|
| 87 | The value of the offset has no impact on \fB^\fR matches. This |
---|
| 88 | behavior is controlled by \fIeflags\fR. |
---|
| 89 | .AP int nmatches in |
---|
| 90 | The number of matching subexpressions that should be remembered for |
---|
| 91 | later use. If this value is 0, then no subexpression match |
---|
| 92 | information will be computed. If the value is \-1, then |
---|
| 93 | all of the matching subexpressions will be remembered. Any other |
---|
| 94 | value will be taken as the maximum number of subexpressions to |
---|
| 95 | remember. |
---|
| 96 | .AP int eflags in |
---|
| 97 | OR-ed combination of the execution flags \fBTCL_REG_NOTBOL\fR and |
---|
| 98 | \fBTCL_REG_NOTEOL\fR. See below for more information. |
---|
| 99 | .AP Tcl_RegExpInfo *infoPtr out |
---|
| 100 | The address of the location where information about a previous match |
---|
| 101 | should be stored by \fBTcl_RegExpGetInfo\fR. |
---|
| 102 | .BE |
---|
| 103 | .SH DESCRIPTION |
---|
| 104 | .PP |
---|
| 105 | \fBTcl_RegExpMatch\fR determines whether its \fItext\fR argument |
---|
| 106 | matches \fIpattern\fR, where \fIpattern\fR is interpreted |
---|
| 107 | as a regular expression using the rules in the \fBre_syntax\fR |
---|
| 108 | reference page. |
---|
| 109 | If there is a match then \fBTcl_RegExpMatch\fR returns 1. |
---|
| 110 | If there is no match then \fBTcl_RegExpMatch\fR returns 0. |
---|
| 111 | If an error occurs in the matching process (e.g. \fIpattern\fR |
---|
| 112 | is not a valid regular expression) then \fBTcl_RegExpMatch\fR |
---|
| 113 | returns \-1 and leaves an error message in the interpreter result. |
---|
| 114 | \fBTcl_RegExpMatchObj\fR is similar to \fBTcl_RegExpMatch\fR except it |
---|
| 115 | operates on the Tcl objects \fItextObj\fR and \fIpatObj\fR instead of |
---|
| 116 | UTF strings. |
---|
| 117 | \fBTcl_RegExpMatchObj\fR is generally more efficient than |
---|
| 118 | \fBTcl_RegExpMatch\fR, so it is the preferred interface. |
---|
| 119 | .PP |
---|
| 120 | \fBTcl_RegExpCompile\fR, \fBTcl_RegExpExec\fR, and \fBTcl_RegExpRange\fR |
---|
| 121 | provide lower-level access to the regular expression pattern matcher. |
---|
| 122 | \fBTcl_RegExpCompile\fR compiles a regular expression string into |
---|
| 123 | the internal form used for efficient pattern matching. |
---|
| 124 | The return value is a token for this compiled form, which can be |
---|
| 125 | used in subsequent calls to \fBTcl_RegExpExec\fR or \fBTcl_RegExpRange\fR. |
---|
| 126 | If an error occurs while compiling the regular expression then |
---|
| 127 | \fBTcl_RegExpCompile\fR returns NULL and leaves an error message |
---|
| 128 | in the interpreter result. |
---|
| 129 | Note: the return value from \fBTcl_RegExpCompile\fR is only valid |
---|
| 130 | up to the next call to \fBTcl_RegExpCompile\fR; it is not safe to |
---|
| 131 | retain these values for long periods of time. |
---|
| 132 | .PP |
---|
| 133 | \fBTcl_RegExpExec\fR executes the regular expression pattern matcher. |
---|
| 134 | It returns 1 if \fItext\fR contains a range of characters that |
---|
| 135 | match \fIregexp\fR, 0 if no match is found, and |
---|
| 136 | \-1 if an error occurs. |
---|
| 137 | In the case of an error, \fBTcl_RegExpExec\fR leaves an error |
---|
| 138 | message in the interpreter result. |
---|
| 139 | When searching a string for multiple matches of a pattern, |
---|
| 140 | it is important to distinguish between the start of the original |
---|
| 141 | string and the start of the current search. |
---|
| 142 | For example, when searching for the second occurrence of a |
---|
| 143 | match, the \fItext\fR argument might point to the character |
---|
| 144 | just after the first match; however, it is important for the |
---|
| 145 | pattern matcher to know that this is not the start of the entire string, |
---|
| 146 | so that it does not allow |
---|
| 147 | .QW \fB^\fR |
---|
| 148 | atoms in the pattern to match. |
---|
| 149 | The \fIstart\fR argument provides this information by pointing |
---|
| 150 | to the start of the overall string containing \fItext\fR. |
---|
| 151 | \fIStart\fR will be less than or equal to \fItext\fR; if it |
---|
| 152 | is less than \fItext\fR then no \fB^\fR matches will be allowed. |
---|
| 153 | .PP |
---|
| 154 | \fBTcl_RegExpRange\fR may be invoked after \fBTcl_RegExpExec\fR |
---|
| 155 | returns; it provides detailed information about what ranges of |
---|
| 156 | the string matched what parts of the pattern. |
---|
| 157 | \fBTcl_RegExpRange\fR returns a pair of pointers in \fI*startPtr\fR |
---|
| 158 | and \fI*endPtr\fR that identify a range of characters in |
---|
| 159 | the source string for the most recent call to \fBTcl_RegExpExec\fR. |
---|
| 160 | \fIIndex\fR indicates which of several ranges is desired: |
---|
| 161 | if \fIindex\fR is 0, information is returned about the overall range |
---|
| 162 | of characters that matched the entire pattern; otherwise, |
---|
| 163 | information is returned about the range of characters that matched the |
---|
| 164 | \fIindex\fR'th parenthesized subexpression within the pattern. |
---|
| 165 | If there is no range corresponding to \fIindex\fR then NULL |
---|
| 166 | is stored in \fI*startPtr\fR and \fI*endPtr\fR. |
---|
| 167 | .PP |
---|
| 168 | \fBTcl_GetRegExpFromObj\fR, \fBTcl_RegExpExecObj\fR, and |
---|
| 169 | \fBTcl_RegExpGetInfo\fR are object interfaces that provide the most |
---|
| 170 | direct control of Henry Spencer's regular expression library. |
---|
| 171 | Users who need to modify compilation and execution options directly |
---|
| 172 | should use these interfaces instead of calling the |
---|
| 173 | internal regexp functions. These interfaces handle the details of UTF |
---|
| 174 | to Unicode translations as well as providing improved performance |
---|
| 175 | through caching in the pattern and string objects. |
---|
| 176 | .PP |
---|
| 177 | \fBTcl_GetRegExpFromObj\fR attempts to return a compiled regular |
---|
| 178 | expression from the \fIpatObj\fR. If the object does not already |
---|
| 179 | contain a compiled regular expression it will attempt to create one |
---|
| 180 | from the string in the object and assign it to the internal |
---|
| 181 | representation of the \fIpatObj\fR. The return value of this function |
---|
| 182 | is of type \fBTcl_RegExp\fR. The return value is a token for this |
---|
| 183 | compiled form, which can be used in subsequent calls to |
---|
| 184 | \fBTcl_RegExpExecObj\fR or \fBTcl_RegExpGetInfo\fR. If an error |
---|
| 185 | occurs while compiling the regular expression then |
---|
| 186 | \fBTcl_GetRegExpFromObj\fR returns NULL and leaves an error message in |
---|
| 187 | the interpreter result. The regular expression token can be used as |
---|
| 188 | long as the internal representation of \fIpatObj\fR refers to the |
---|
| 189 | compiled form. The \fIcflags\fR argument is a bit-wise OR of |
---|
| 190 | zero or more of the following flags that control the compilation of |
---|
| 191 | \fIpatObj\fR: |
---|
| 192 | .RS 2 |
---|
| 193 | .TP |
---|
| 194 | \fBTCL_REG_ADVANCED\fR |
---|
| 195 | Compile advanced regular expressions |
---|
| 196 | .PQ ARE s . |
---|
| 197 | This mode corresponds to |
---|
| 198 | the normal regular expression syntax accepted by the Tcl \fBregexp\fR and |
---|
| 199 | \fBregsub\fR commands. |
---|
| 200 | .TP |
---|
| 201 | \fBTCL_REG_EXTENDED\fR |
---|
| 202 | Compile extended regular expressions |
---|
| 203 | .PQ ERE s . |
---|
| 204 | This mode corresponds |
---|
| 205 | to the regular expression syntax recognized by Tcl 8.0 and earlier |
---|
| 206 | versions. |
---|
| 207 | .TP |
---|
| 208 | \fBTCL_REG_BASIC\fR |
---|
| 209 | Compile basic regular expressions |
---|
| 210 | .PQ BRE s . |
---|
| 211 | This mode corresponds |
---|
| 212 | to the regular expression syntax recognized by common Unix utilities |
---|
| 213 | like \fBsed\fR and \fBgrep\fR. This is the default if no flags are |
---|
| 214 | specified. |
---|
| 215 | .TP |
---|
| 216 | \fBTCL_REG_EXPANDED\fR |
---|
| 217 | Compile the regular expression (basic, extended, or advanced) using an |
---|
| 218 | expanded syntax that allows comments and whitespace. This mode causes |
---|
| 219 | non-backslashed non-bracket-expression white |
---|
| 220 | space and #-to-end-of-line comments to be ignored. |
---|
| 221 | .TP |
---|
| 222 | \fBTCL_REG_QUOTE\fR |
---|
| 223 | Compile a literal string, with all characters treated as ordinary characters. |
---|
| 224 | .TP |
---|
| 225 | \fBTCL_REG_NOCASE\fR |
---|
| 226 | Compile for matching that ignores upper/lower case distinctions. |
---|
| 227 | .TP |
---|
| 228 | \fBTCL_REG_NEWLINE\fR |
---|
| 229 | Compile for newline-sensitive matching. By default, newline is a |
---|
| 230 | completely ordinary character with no special meaning in either |
---|
| 231 | regular expressions or strings. With this flag, |
---|
| 232 | .QW [^ |
---|
| 233 | bracket expressions and |
---|
| 234 | .QW . |
---|
| 235 | never match newline, |
---|
| 236 | .QW ^ |
---|
| 237 | matches an empty string |
---|
| 238 | after any newline in addition to its normal function, and |
---|
| 239 | .QW $ |
---|
| 240 | matches |
---|
| 241 | an empty string before any newline in addition to its normal function. |
---|
| 242 | \fBTCL_REG_NEWLINE\fR is the bit-wise OR of \fBTCL_REG_NLSTOP\fR and |
---|
| 243 | \fBTCL_REG_NLANCH\fR. |
---|
| 244 | .TP |
---|
| 245 | \fBTCL_REG_NLSTOP\fR |
---|
| 246 | Compile for partial newline-sensitive matching, |
---|
| 247 | with the behavior of |
---|
| 248 | .QW [^ |
---|
| 249 | bracket expressions and |
---|
| 250 | .QW . |
---|
| 251 | affected, but not the behavior of |
---|
| 252 | .QW ^ |
---|
| 253 | and |
---|
| 254 | .QW $ . |
---|
| 255 | In this mode, |
---|
| 256 | .QW [^ |
---|
| 257 | bracket expressions and |
---|
| 258 | .QW . |
---|
| 259 | never match newline. |
---|
| 260 | .TP |
---|
| 261 | \fBTCL_REG_NLANCH\fR |
---|
| 262 | Compile for inverse partial newline-sensitive matching, |
---|
| 263 | with the behavior of |
---|
| 264 | .QW ^ |
---|
| 265 | and |
---|
| 266 | .QW $ |
---|
| 267 | (the |
---|
| 268 | .QW anchors ) |
---|
| 269 | affected, but not the behavior of |
---|
| 270 | .QW [^ |
---|
| 271 | bracket expressions and |
---|
| 272 | .QW . . |
---|
| 273 | In this mode |
---|
| 274 | .QW ^ |
---|
| 275 | matches an empty string |
---|
| 276 | after any newline in addition to its normal function, and |
---|
| 277 | .QW $ |
---|
| 278 | matches |
---|
| 279 | an empty string before any newline in addition to its normal function. |
---|
| 280 | .TP |
---|
| 281 | \fBTCL_REG_NOSUB\fR |
---|
| 282 | Compile for matching that reports only success or failure, |
---|
| 283 | not what was matched. This reduces compile overhead and may improve |
---|
| 284 | performance. Subsequent calls to \fBTcl_RegExpGetInfo\fR or |
---|
| 285 | \fBTcl_RegExpRange\fR will not report any match information. |
---|
| 286 | .TP |
---|
| 287 | \fBTCL_REG_CANMATCH\fR |
---|
| 288 | Compile for matching that reports the potential to complete a partial |
---|
| 289 | match given more text (see below). |
---|
| 290 | .RE |
---|
| 291 | .PP |
---|
| 292 | Only one of |
---|
| 293 | \fBTCL_REG_EXTENDED\fR, |
---|
| 294 | \fBTCL_REG_ADVANCED\fR, |
---|
| 295 | \fBTCL_REG_BASIC\fR, and |
---|
| 296 | \fBTCL_REG_QUOTE\fR may be specified. |
---|
| 297 | .PP |
---|
| 298 | \fBTcl_RegExpExecObj\fR executes the regular expression pattern |
---|
| 299 | matcher. It returns 1 if \fItextObj\fR contains a range of characters |
---|
| 300 | that match \fIregexp\fR, 0 if no match is found, and \-1 if an error |
---|
| 301 | occurs. In the case of an error, \fBTcl_RegExpExecObj\fR leaves an |
---|
| 302 | error message in the interpreter result. The \fInmatches\fR value |
---|
| 303 | indicates to the matcher how many subexpressions are of interest. If |
---|
| 304 | \fInmatches\fR is 0, then no subexpression match information is |
---|
| 305 | recorded, which may allow the matcher to make various optimizations. |
---|
| 306 | If the value is \-1, then all of the subexpressions in the pattern are |
---|
| 307 | remembered. If the value is a positive integer, then only that number |
---|
| 308 | of subexpressions will be remembered. Matching begins at the |
---|
| 309 | specified Unicode character index given by \fIoffset\fR. Unlike |
---|
| 310 | \fBTcl_RegExpExec\fR, the behavior of anchors is not affected by the |
---|
| 311 | offset value. Instead the behavior of the anchors is explicitly |
---|
| 312 | controlled by the \fIeflags\fR argument, which is a bit-wise OR of |
---|
| 313 | zero or more of the following flags: |
---|
| 314 | .RS 2 |
---|
| 315 | .TP |
---|
| 316 | \fBTCL_REG_NOTBOL\fR |
---|
| 317 | The starting character will not be treated as the beginning of a |
---|
| 318 | line or the beginning of the string, so |
---|
| 319 | .QW ^ |
---|
| 320 | will not match there. |
---|
| 321 | Note that this flag has no effect on how |
---|
| 322 | .QW \fB\eA\fR |
---|
| 323 | matches. |
---|
| 324 | .TP |
---|
| 325 | \fBTCL_REG_NOTEOL\fR |
---|
| 326 | The last character in the string will not be treated as the end of a |
---|
| 327 | line or the end of the string, so |
---|
| 328 | .QW $ |
---|
| 329 | will not match there. |
---|
| 330 | Note that this flag has no effect on how |
---|
| 331 | .QW \fB\eZ\fR |
---|
| 332 | matches. |
---|
| 333 | .RE |
---|
| 334 | .PP |
---|
| 335 | \fBTcl_RegExpGetInfo\fR retrieves information about the last match |
---|
| 336 | performed with a given regular expression \fIregexp\fR. The |
---|
| 337 | \fIinfoPtr\fR argument contains a pointer to a structure that is |
---|
| 338 | defined as follows: |
---|
| 339 | .PP |
---|
| 340 | .CS |
---|
| 341 | typedef struct Tcl_RegExpInfo { |
---|
| 342 | int \fInsubs\fR; |
---|
| 343 | Tcl_RegExpIndices *\fImatches\fR; |
---|
| 344 | long \fIextendStart\fR; |
---|
| 345 | } Tcl_RegExpInfo; |
---|
| 346 | .CE |
---|
| 347 | .PP |
---|
| 348 | The \fInsubs\fR field contains a count of the number of parenthesized |
---|
| 349 | subexpressions within the regular expression. If the \fBTCL_REG_NOSUB\fR |
---|
| 350 | flag was used, then this value will be zero. The \fImatches\fR field |
---|
| 351 | points to an array of \fInsubs\fR+1 values that indicate the bounds of each |
---|
| 352 | subexpression matched. The first element in the array refers to the |
---|
| 353 | range matched by the entire regular expression, and subsequent elements |
---|
| 354 | refer to the parenthesized subexpressions in the order that they |
---|
| 355 | appear in the pattern. Each element is a structure that is defined as |
---|
| 356 | follows: |
---|
| 357 | .PP |
---|
| 358 | .CS |
---|
| 359 | typedef struct Tcl_RegExpIndices { |
---|
| 360 | long \fIstart\fR; |
---|
| 361 | long \fIend\fR; |
---|
| 362 | } Tcl_RegExpIndices; |
---|
| 363 | .CE |
---|
| 364 | .PP |
---|
| 365 | The \fIstart\fR and \fIend\fR values are Unicode character indices |
---|
| 366 | relative to the offset location within \fItextObj\fR where matching began. |
---|
| 367 | The \fIstart\fR index identifies the first character of the matched |
---|
| 368 | subexpression. The \fIend\fR index identifies the first character |
---|
| 369 | after the matched subexpression. If the subexpression matched the |
---|
| 370 | empty string, then \fIstart\fR and \fIend\fR will be equal. If the |
---|
| 371 | subexpression did not participate in the match, then \fIstart\fR and |
---|
| 372 | \fIend\fR will be set to \-1. |
---|
| 373 | .PP |
---|
| 374 | The \fIextendStart\fR field in \fBTcl_RegExpInfo\fR is only set if the |
---|
| 375 | \fBTCL_REG_CANMATCH\fR flag was used. It indicates the first |
---|
| 376 | character in the string where a match could occur. If a match was |
---|
| 377 | found, this will be the same as the beginning of the current match. |
---|
| 378 | If no match was found, then it indicates the earliest point at which a |
---|
| 379 | match might occur if additional text is appended to the string. If |
---|
| 380 | no match is possible even with further text, this field will be set |
---|
| 381 | to \-1. |
---|
| 382 | .SH "SEE ALSO" |
---|
| 383 | re_syntax(n) |
---|
| 384 | .SH KEYWORDS |
---|
| 385 | match, pattern, regular expression, string, subexpression, Tcl_RegExpIndices, Tcl_RegExpInfo |
---|