1 | '\" |
---|
2 | '\" Copyright (c) 1994 The Regents of the University of California. |
---|
3 | '\" Copyright (c) 1994-1996 Sun Microsystems, Inc. |
---|
4 | '\" Copyright (c) 1998-1999 Scriptics Corporation |
---|
5 | '\" |
---|
6 | '\" See the file "license.terms" for information on usage and redistribution |
---|
7 | '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. |
---|
8 | '\" |
---|
9 | '\" RCS: @(#) $Id: RegExp.3,v 1.28 2007/12/13 15:22:31 dgp Exp $ |
---|
10 | '\" |
---|
11 | .so man.macros |
---|
12 | .TH Tcl_RegExpMatch 3 8.1 Tcl "Tcl Library Procedures" |
---|
13 | .BS |
---|
14 | .SH NAME |
---|
15 | Tcl_RegExpMatch, Tcl_RegExpCompile, Tcl_RegExpExec, Tcl_RegExpRange, Tcl_GetRegExpFromObj, Tcl_RegExpMatchObj, Tcl_RegExpExecObj, Tcl_RegExpGetInfo \- Pattern matching with regular expressions |
---|
16 | .SH SYNOPSIS |
---|
17 | .nf |
---|
18 | \fB#include <tcl.h>\fR |
---|
19 | .sp |
---|
20 | int |
---|
21 | \fBTcl_RegExpMatchObj\fR(\fIinterp\fR, \fItextObj\fR, \fIpatObj\fR) |
---|
22 | .sp |
---|
23 | int |
---|
24 | \fBTcl_RegExpMatch\fR(\fIinterp\fR, \fItext\fR, \fIpattern\fR) |
---|
25 | .sp |
---|
26 | Tcl_RegExp |
---|
27 | \fBTcl_RegExpCompile\fR(\fIinterp\fR, \fIpattern\fR) |
---|
28 | .sp |
---|
29 | int |
---|
30 | \fBTcl_RegExpExec\fR(\fIinterp\fR, \fIregexp\fR, \fItext\fR, \fIstart\fR) |
---|
31 | .sp |
---|
32 | void |
---|
33 | \fBTcl_RegExpRange\fR(\fIregexp\fR, \fIindex\fR, \fIstartPtr\fR, \fIendPtr\fR) |
---|
34 | .sp |
---|
35 | Tcl_RegExp |
---|
36 | \fBTcl_GetRegExpFromObj\fR(\fIinterp\fR, \fIpatObj\fR, \fIcflags\fR) |
---|
37 | .sp |
---|
38 | int |
---|
39 | \fBTcl_RegExpExecObj\fR(\fIinterp\fR, \fIregexp\fR, \fItextObj\fR, \fIoffset\fR, \fInmatches\fR, \fIeflags\fR) |
---|
40 | .sp |
---|
41 | void |
---|
42 | \fBTcl_RegExpGetInfo\fR(\fIregexp\fR, \fIinfoPtr\fR) |
---|
43 | .fi |
---|
44 | .SH ARGUMENTS |
---|
45 | .AS Tcl_RegExpInfo *interp in/out |
---|
46 | .AP Tcl_Interp *interp in |
---|
47 | Tcl interpreter to use for error reporting. The interpreter may be |
---|
48 | NULL if no error reporting is desired. |
---|
49 | .AP Tcl_Obj *textObj in/out |
---|
50 | Refers to the object from which to get the text to search. The |
---|
51 | internal representation of the object may be converted to a form that |
---|
52 | can be efficiently searched. |
---|
53 | .AP Tcl_Obj *patObj in/out |
---|
54 | Refers to the object from which to get a regular expression. The |
---|
55 | compiled regular expression is cached in the object. |
---|
56 | .AP char *text in |
---|
57 | Text to search for a match with a regular expression. |
---|
58 | .AP "const char" *pattern in |
---|
59 | String in the form of a regular expression pattern. |
---|
60 | .AP Tcl_RegExp regexp in |
---|
61 | Compiled regular expression. Must have been returned previously |
---|
62 | by \fBTcl_GetRegExpFromObj\fR or \fBTcl_RegExpCompile\fR. |
---|
63 | .AP char *start in |
---|
64 | If \fItext\fR is just a portion of some other string, this argument |
---|
65 | identifies the beginning of the larger string. |
---|
66 | If it is not the same as \fItext\fR, then no |
---|
67 | .QW \fB^\fR |
---|
68 | matches will be allowed. |
---|
69 | .AP int index in |
---|
70 | Specifies which range is desired: 0 means the range of the entire |
---|
71 | match, 1 or greater means the range that matched a parenthesized |
---|
72 | sub-expression. |
---|
73 | .AP "const char" **startPtr out |
---|
74 | The address of the first character in the range is stored here, or |
---|
75 | NULL if there is no such range. |
---|
76 | .AP "const char" **endPtr out |
---|
77 | The address of the character just after the last one in the range |
---|
78 | is stored here, or NULL if there is no such range. |
---|
79 | .AP int cflags in |
---|
80 | OR-ed combination of the compilation flags \fBTCL_REG_ADVANCED\fR, |
---|
81 | \fBTCL_REG_EXTENDED\fR, \fBTCL_REG_BASIC\fR, \fBTCL_REG_EXPANDED\fR, |
---|
82 | \fBTCL_REG_QUOTE\fR, \fBTCL_REG_NOCASE\fR, \fBTCL_REG_NEWLINE\fR, |
---|
83 | \fBTCL_REG_NLSTOP\fR, \fBTCL_REG_NLANCH\fR, \fBTCL_REG_NOSUB\fR, and |
---|
84 | \fBTCL_REG_CANMATCH\fR. See below for more information. |
---|
85 | .AP int offset in |
---|
86 | The character offset into the text where matching should begin. |
---|
87 | The value of the offset has no impact on \fB^\fR matches. This |
---|
88 | behavior is controlled by \fIeflags\fR. |
---|
89 | .AP int nmatches in |
---|
90 | The number of matching subexpressions that should be remembered for |
---|
91 | later use. If this value is 0, then no subexpression match |
---|
92 | information will be computed. If the value is \-1, then |
---|
93 | all of the matching subexpressions will be remembered. Any other |
---|
94 | value will be taken as the maximum number of subexpressions to |
---|
95 | remember. |
---|
96 | .AP int eflags in |
---|
97 | OR-ed combination of the execution flags \fBTCL_REG_NOTBOL\fR and |
---|
98 | \fBTCL_REG_NOTEOL\fR. See below for more information. |
---|
99 | .AP Tcl_RegExpInfo *infoPtr out |
---|
100 | The address of the location where information about a previous match |
---|
101 | should be stored by \fBTcl_RegExpGetInfo\fR. |
---|
102 | .BE |
---|
103 | .SH DESCRIPTION |
---|
104 | .PP |
---|
105 | \fBTcl_RegExpMatch\fR determines whether its \fItext\fR argument |
---|
106 | matches \fIpattern\fR, where \fIpattern\fR is interpreted |
---|
107 | as a regular expression using the rules in the \fBre_syntax\fR |
---|
108 | reference page. |
---|
109 | If there is a match then \fBTcl_RegExpMatch\fR returns 1. |
---|
110 | If there is no match then \fBTcl_RegExpMatch\fR returns 0. |
---|
111 | If an error occurs in the matching process (e.g. \fIpattern\fR |
---|
112 | is not a valid regular expression) then \fBTcl_RegExpMatch\fR |
---|
113 | returns \-1 and leaves an error message in the interpreter result. |
---|
114 | \fBTcl_RegExpMatchObj\fR is similar to \fBTcl_RegExpMatch\fR except it |
---|
115 | operates on the Tcl objects \fItextObj\fR and \fIpatObj\fR instead of |
---|
116 | UTF strings. |
---|
117 | \fBTcl_RegExpMatchObj\fR is generally more efficient than |
---|
118 | \fBTcl_RegExpMatch\fR, so it is the preferred interface. |
---|
119 | .PP |
---|
120 | \fBTcl_RegExpCompile\fR, \fBTcl_RegExpExec\fR, and \fBTcl_RegExpRange\fR |
---|
121 | provide lower-level access to the regular expression pattern matcher. |
---|
122 | \fBTcl_RegExpCompile\fR compiles a regular expression string into |
---|
123 | the internal form used for efficient pattern matching. |
---|
124 | The return value is a token for this compiled form, which can be |
---|
125 | used in subsequent calls to \fBTcl_RegExpExec\fR or \fBTcl_RegExpRange\fR. |
---|
126 | If an error occurs while compiling the regular expression then |
---|
127 | \fBTcl_RegExpCompile\fR returns NULL and leaves an error message |
---|
128 | in the interpreter result. |
---|
129 | Note: the return value from \fBTcl_RegExpCompile\fR is only valid |
---|
130 | up to the next call to \fBTcl_RegExpCompile\fR; it is not safe to |
---|
131 | retain these values for long periods of time. |
---|
132 | .PP |
---|
133 | \fBTcl_RegExpExec\fR executes the regular expression pattern matcher. |
---|
134 | It returns 1 if \fItext\fR contains a range of characters that |
---|
135 | match \fIregexp\fR, 0 if no match is found, and |
---|
136 | \-1 if an error occurs. |
---|
137 | In the case of an error, \fBTcl_RegExpExec\fR leaves an error |
---|
138 | message in the interpreter result. |
---|
139 | When searching a string for multiple matches of a pattern, |
---|
140 | it is important to distinguish between the start of the original |
---|
141 | string and the start of the current search. |
---|
142 | For example, when searching for the second occurrence of a |
---|
143 | match, the \fItext\fR argument might point to the character |
---|
144 | just after the first match; however, it is important for the |
---|
145 | pattern matcher to know that this is not the start of the entire string, |
---|
146 | so that it does not allow |
---|
147 | .QW \fB^\fR |
---|
148 | atoms in the pattern to match. |
---|
149 | The \fIstart\fR argument provides this information by pointing |
---|
150 | to the start of the overall string containing \fItext\fR. |
---|
151 | \fIStart\fR will be less than or equal to \fItext\fR; if it |
---|
152 | is less than \fItext\fR then no \fB^\fR matches will be allowed. |
---|
153 | .PP |
---|
154 | \fBTcl_RegExpRange\fR may be invoked after \fBTcl_RegExpExec\fR |
---|
155 | returns; it provides detailed information about what ranges of |
---|
156 | the string matched what parts of the pattern. |
---|
157 | \fBTcl_RegExpRange\fR returns a pair of pointers in \fI*startPtr\fR |
---|
158 | and \fI*endPtr\fR that identify a range of characters in |
---|
159 | the source string for the most recent call to \fBTcl_RegExpExec\fR. |
---|
160 | \fIIndex\fR indicates which of several ranges is desired: |
---|
161 | if \fIindex\fR is 0, information is returned about the overall range |
---|
162 | of characters that matched the entire pattern; otherwise, |
---|
163 | information is returned about the range of characters that matched the |
---|
164 | \fIindex\fR'th parenthesized subexpression within the pattern. |
---|
165 | If there is no range corresponding to \fIindex\fR then NULL |
---|
166 | is stored in \fI*startPtr\fR and \fI*endPtr\fR. |
---|
167 | .PP |
---|
168 | \fBTcl_GetRegExpFromObj\fR, \fBTcl_RegExpExecObj\fR, and |
---|
169 | \fBTcl_RegExpGetInfo\fR are object interfaces that provide the most |
---|
170 | direct control of Henry Spencer's regular expression library. |
---|
171 | Users that need to modify compilation and execution options directly |
---|
172 | should use these interfaces instead of calling the |
---|
173 | internal regexp functions. These interfaces handle the details of UTF |
---|
174 | to Unicode translations as well as providing improved performance |
---|
175 | through caching in the pattern and string objects. |
---|
176 | .PP |
---|
177 | \fBTcl_GetRegExpFromObj\fR attempts to return a compiled regular |
---|
178 | expression from the \fIpatObj\fR. If the object does not already |
---|
179 | contain a compiled regular expression it will attempt to create one |
---|
180 | from the string in the object and assign it to the internal |
---|
181 | representation of the \fIpatObj\fR. The return value of this function |
---|
182 | is of type \fBTcl_RegExp\fR. The return value is a token for this |
---|
183 | compiled form, which can be used in subsequent calls to |
---|
184 | \fBTcl_RegExpExecObj\fR or \fBTcl_RegExpGetInfo\fR. If an error |
---|
185 | occurs while compiling the regular expression then |
---|
186 | \fBTcl_GetRegExpFromObj\fR returns NULL and leaves an error message in |
---|
187 | the interpreter result. The regular expression token can be used as |
---|
188 | long as the internal representation of \fIpatObj\fR refers to the |
---|
189 | compiled form. The \fIcflags\fR argument is a bit-wise OR of |
---|
190 | zero or more of the following flags that control the compilation of |
---|
191 | \fIpatObj\fR: |
---|
192 | .RS 2 |
---|
193 | .TP |
---|
194 | \fBTCL_REG_ADVANCED\fR |
---|
195 | Compile advanced regular expressions |
---|
196 | .PQ ARE s . |
---|
197 | This mode corresponds to |
---|
198 | the normal regular expression syntax accepted by the Tcl \fBregexp\fR and |
---|
199 | \fBregsub\fR commands. |
---|
200 | .TP |
---|
201 | \fBTCL_REG_EXTENDED\fR |
---|
202 | Compile extended regular expressions |
---|
203 | .PQ ERE s . |
---|
204 | This mode corresponds |
---|
205 | to the regular expression syntax recognized by Tcl 8.0 and earlier |
---|
206 | versions. |
---|
207 | .TP |
---|
208 | \fBTCL_REG_BASIC\fR |
---|
209 | Compile basic regular expressions |
---|
210 | .PQ BRE s . |
---|
211 | This mode corresponds |
---|
212 | to the regular expression syntax recognized by common Unix utilities |
---|
213 | like \fBsed\fR and \fBgrep\fR. This is the default if no flags are |
---|
214 | specified. |
---|
215 | .TP |
---|
216 | \fBTCL_REG_EXPANDED\fR |
---|
217 | Compile the regular expression (basic, extended, or advanced) using an |
---|
218 | expanded syntax that allows comments and whitespace. This mode causes |
---|
219 | non-backslashed non-bracket-expression white |
---|
220 | space and #-to-end-of-line comments to be ignored. |
---|
221 | .TP |
---|
222 | \fBTCL_REG_QUOTE\fR |
---|
223 | Compile a literal string, with all characters treated as ordinary characters. |
---|
224 | .TP |
---|
225 | \fBTCL_REG_NOCASE\fR |
---|
226 | Compile for matching that ignores upper/lower case distinctions. |
---|
227 | .TP |
---|
228 | \fBTCL_REG_NEWLINE\fR |
---|
229 | Compile for newline-sensitive matching. By default, newline is a |
---|
230 | completely ordinary character with no special meaning in either |
---|
231 | regular expressions or strings. With this flag, |
---|
232 | .QW [^ |
---|
233 | bracket expressions and |
---|
234 | .QW . |
---|
235 | never match newline, |
---|
236 | .QW ^ |
---|
237 | matches an empty string |
---|
238 | after any newline in addition to its normal function, and |
---|
239 | .QW $ |
---|
240 | matches |
---|
241 | an empty string before any newline in addition to its normal function. |
---|
242 | \fBTCL_REG_NEWLINE\fR is the bit-wise OR of \fBTCL_REG_NLSTOP\fR and |
---|
243 | \fBTCL_REG_NLANCH\fR. |
---|
244 | .TP |
---|
245 | \fBTCL_REG_NLSTOP\fR |
---|
246 | Compile for partial newline-sensitive matching, |
---|
247 | with the behavior of |
---|
248 | .QW [^ |
---|
249 | bracket expressions and |
---|
250 | .QW . |
---|
251 | affected, but not the behavior of |
---|
252 | .QW ^ |
---|
253 | and |
---|
254 | .QW $ . |
---|
255 | In this mode, |
---|
256 | .QW [^ |
---|
257 | bracket expressions and |
---|
258 | .QW . |
---|
259 | never match newline. |
---|
260 | .TP |
---|
261 | \fBTCL_REG_NLANCH\fR |
---|
262 | Compile for inverse partial newline-sensitive matching, |
---|
263 | with the behavior of |
---|
264 | .QW ^ |
---|
265 | and |
---|
266 | .QW $ |
---|
267 | (the |
---|
268 | .QW anchors ) |
---|
269 | affected, but not the behavior of |
---|
270 | .QW [^ |
---|
271 | bracket expressions and |
---|
272 | .QW . . |
---|
273 | In this mode |
---|
274 | .QW ^ |
---|
275 | matches an empty string |
---|
276 | after any newline in addition to its normal function, and |
---|
277 | .QW $ |
---|
278 | matches |
---|
279 | an empty string before any newline in addition to its normal function. |
---|
280 | .TP |
---|
281 | \fBTCL_REG_NOSUB\fR |
---|
282 | Compile for matching that reports only success or failure, |
---|
283 | not what was matched. This reduces compile overhead and may improve |
---|
284 | performance. Subsequent calls to \fBTcl_RegExpGetInfo\fR or |
---|
285 | \fBTcl_RegExpRange\fR will not report any match information. |
---|
286 | .TP |
---|
287 | \fBTCL_REG_CANMATCH\fR |
---|
288 | Compile for matching that reports the potential to complete a partial |
---|
289 | match given more text (see below). |
---|
290 | .RE |
---|
291 | .PP |
---|
292 | Only one of |
---|
293 | \fBTCL_REG_EXTENDED\fR, |
---|
294 | \fBTCL_REG_ADVANCED\fR, |
---|
295 | \fBTCL_REG_BASIC\fR, and |
---|
296 | \fBTCL_REG_QUOTE\fR may be specified. |
---|
297 | .PP |
---|
298 | \fBTcl_RegExpExecObj\fR executes the regular expression pattern |
---|
299 | matcher. It returns 1 if \fItextObj\fR contains a range of characters |
---|
300 | that match \fIregexp\fR, 0 if no match is found, and \-1 if an error |
---|
301 | occurs. In the case of an error, \fBTcl_RegExpExecObj\fR leaves an |
---|
302 | error message in the interpreter result. The \fInmatches\fR value |
---|
303 | indicates to the matcher how many subexpressions are of interest. If |
---|
304 | \fInmatches\fR is 0, then no subexpression match information is |
---|
305 | recorded, which may allow the matcher to make various optimizations. |
---|
306 | If the value is \-1, then all of the subexpressions in the pattern are |
---|
307 | remembered. If the value is a positive integer, then only that number |
---|
308 | of subexpressions will be remembered. Matching begins at the |
---|
309 | specified Unicode character index given by \fIoffset\fR. Unlike |
---|
310 | \fBTcl_RegExpExec\fR, the behavior of anchors is not affected by the |
---|
311 | offset value. Instead the behavior of the anchors is explicitly |
---|
312 | controlled by the \fIeflags\fR argument, which is a bit-wise OR of |
---|
313 | zero or more of the following flags: |
---|
314 | .RS 2 |
---|
315 | .TP |
---|
316 | \fBTCL_REG_NOTBOL\fR |
---|
317 | The starting character will not be treated as the beginning of a |
---|
318 | line or the beginning of the string, so |
---|
319 | .QW ^ |
---|
320 | will not match there. |
---|
321 | Note that this flag has no effect on how |
---|
322 | .QW \fB\eA\fR |
---|
323 | matches. |
---|
324 | .TP |
---|
325 | \fBTCL_REG_NOTEOL\fR |
---|
326 | The last character in the string will not be treated as the end of a |
---|
327 | line or the end of the string, so |
---|
328 | .QW $ |
---|
329 | will not match there. |
---|
330 | Note that this flag has no effect on how |
---|
331 | .QW \fB\eZ\fR |
---|
332 | matches. |
---|
333 | .RE |
---|
334 | .PP |
---|
335 | \fBTcl_RegExpGetInfo\fR retrieves information about the last match |
---|
336 | performed with a given regular expression \fIregexp\fR. The |
---|
337 | \fIinfoPtr\fR argument contains a pointer to a structure that is |
---|
338 | defined as follows: |
---|
339 | .PP |
---|
340 | .CS |
---|
341 | typedef struct Tcl_RegExpInfo { |
---|
342 | int \fInsubs\fR; |
---|
343 | Tcl_RegExpIndices *\fImatches\fR; |
---|
344 | long \fIextendStart\fR; |
---|
345 | } Tcl_RegExpInfo; |
---|
346 | .CE |
---|
347 | .PP |
---|
348 | The \fInsubs\fR field contains a count of the number of parenthesized |
---|
349 | subexpressions within the regular expression. If the \fBTCL_REG_NOSUB\fR |
---|
350 | flag was used, then this value will be zero. The \fImatches\fR field |
---|
351 | points to an array of \fInsubs\fR+1 values that indicate the bounds of each |
---|
352 | subexpression matched. The first element in the array refers to the |
---|
353 | range matched by the entire regular expression, and subsequent elements |
---|
354 | refer to the parenthesized subexpressions in the order that they |
---|
355 | appear in the pattern. Each element is a structure that is defined as |
---|
356 | follows: |
---|
357 | .PP |
---|
358 | .CS |
---|
359 | typedef struct Tcl_RegExpIndices { |
---|
360 | long \fIstart\fR; |
---|
361 | long \fIend\fR; |
---|
362 | } Tcl_RegExpIndices; |
---|
363 | .CE |
---|
364 | .PP |
---|
365 | The \fIstart\fR and \fIend\fR values are Unicode character indices |
---|
366 | relative to the offset location within \fItextObj\fR where matching began. |
---|
367 | The \fIstart\fR index identifies the first character of the matched |
---|
368 | subexpression. The \fIend\fR index identifies the first character |
---|
369 | after the matched subexpression. If the subexpression matched the |
---|
370 | empty string, then \fIstart\fR and \fIend\fR will be equal. If the |
---|
371 | subexpression did not participate in the match, then \fIstart\fR and |
---|
372 | \fIend\fR will be set to \-1. |
---|
373 | .PP |
---|
374 | The \fIextendStart\fR field in \fBTcl_RegExpInfo\fR is only set if the |
---|
375 | \fBTCL_REG_CANMATCH\fR flag was used. It indicates the first |
---|
376 | character in the string where a match could occur. If a match was |
---|
377 | found, this will be the same as the beginning of the current match. |
---|
378 | If no match was found, then it indicates the earliest point at which a |
---|
379 | match might occur if additional text is appended to the string. If |
---|
380 | no match is possible even with further text, this field will be set |
---|
381 | to \-1. |
---|
382 | .SH "SEE ALSO" |
---|
383 | re_syntax(n) |
---|
384 | .SH KEYWORDS |
---|
385 | match, pattern, regular expression, string, subexpression, Tcl_RegExpIndices, Tcl_RegExpInfo |
---|