Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: downloads/tcl8.5.2/generic/regcomp.c @ 33

Last change on this file since 33 was 25, checked in by landauf, 16 years ago
added tcl to libs
File size: 52.9 KB

Line
1	/*
2	* re_*comp and friends - compile REs
3	* This file #includes several others (see the bottom).
4	*
5	* Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6	*
7	* Development of this software was funded, in part, by Cray Research Inc.,
8	* UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9	* Corporation, none of whom are responsible for the results. The author
10	* thanks all of them.
11	*
12	* Redistribution and use in source and binary forms -- with or without
13	* modification -- are permitted for any purpose, provided that
14	* redistributions in source form retain this entire copyright notice and
15	* indicate the origin and nature of any modifications.
16	*
17	* I'd appreciate being given credit for this package in the documentation of
18	* software which uses it, but that is not a requirement.
19	*
20	* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22	* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23	* HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26	* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27	* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28	* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29	* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30	*
31	*/
32
33	#include "regguts.h"
34
35	/*
36	* forward declarations, up here so forward datatypes etc. are defined early
37	*/
38	/* =====^!^===== begin forwards =====^!^===== */
39	/* automatically gathered by fwd; do not hand-edit */
40	/* === regcomp.c === */
41	int compile(regex_t , const chr , size_t, int);
42	static void moresubs(struct vars *, int);
43	static int freev(struct vars *, int);
44	static void makesearch(struct vars , struct nfa );
45	static struct subre parse(struct vars , int, int, struct state , struct state );
46	static struct subre parsebranch(struct vars , int, int, struct state , struct state , int);
47	static void parseqatom(struct vars , int, int, struct state , struct state , struct subre );
48	static void nonword(struct vars , int, struct state , struct state *);
49	static void word(struct vars , int, struct state , struct state *);
50	static int scannum(struct vars *);
51	static void repeat(struct vars , struct state , struct state *, int, int);
52	static void bracket(struct vars , struct state , struct state *);
53	static void cbracket(struct vars , struct state , struct state *);
54	static void brackpart(struct vars , struct state , struct state *);
55	static const chr scanplain(struct vars );
56	static void onechr(struct vars , pchr, struct state , struct state *);
57	static void dovec(struct vars , struct cvec , struct state , struct state );
58	static void wordchrs(struct vars *);
59	static struct subre subre(struct vars , int, int, struct state , struct state );
60	static void freesubre(struct vars , struct subre );
61	static void freesrnode(struct vars , struct subre );
62	static void optst(struct vars , struct subre );
63	static int numst(struct subre *, int);
64	static void markst(struct subre *);
65	static void cleanst(struct vars *);
66	static long nfatree(struct vars , struct subre , FILE *);
67	static long nfanode(struct vars , struct subre , FILE *);
68	static int newlacon(struct vars , struct state , struct state *, int);
69	static void freelacons(struct subre *, int);
70	static void rfree(regex_t *);
71	static void dump(regex_t , FILE );
72	static void dumpst(struct subre , FILE , int);
73	static void stdump(struct subre , FILE , int);
74	static const char stid(struct subre , char *, size_t);
75	/* === regc_lex.c === */
76	static void lexstart(struct vars *);
77	static void prefixes(struct vars *);
78	static void lexnest(struct vars , const chr , const chr *);
79	static void lexword(struct vars *);
80	static int next(struct vars *);
81	static int lexescape(struct vars *);
82	static chr lexdigits(struct vars *, int, int, int);
83	static int brenext(struct vars *, pchr);
84	static void skip(struct vars *);
85	static chr newline(NOPARMS);
86	#ifdef REG_DEBUG
87	static const chr *ch(NOPARMS);
88	#endif
89	static chr chrnamed(struct vars , const chr , const chr *, pchr);
90	/* === regc_color.c === */
91	static void initcm(struct vars , struct colormap );
92	static void freecm(struct colormap *);
93	static void cmtreefree(struct colormap , union tree , int);
94	static color setcolor(struct colormap *, pchr, pcolor);
95	static color maxcolor(struct colormap *);
96	static color newcolor(struct colormap *);
97	static void freecolor(struct colormap *, pcolor);
98	static color pseudocolor(struct colormap *);
99	static color subcolor(struct colormap *, pchr c);
100	static color newsub(struct colormap *, pcolor);
101	static void subrange(struct vars , pchr, pchr, struct state , struct state *);
102	static void subblock(struct vars , pchr, struct state , struct state *);
103	static void okcolors(struct nfa , struct colormap );
104	static void colorchain(struct colormap , struct arc );
105	static void uncolorchain(struct colormap , struct arc );
106	static void rainbow(struct nfa , struct colormap , int, pcolor, struct state , struct state );
107	static void colorcomplement(struct nfa , struct colormap , int, struct state , struct state , struct state *);
108	#ifdef REG_DEBUG
109	static void dumpcolors(struct colormap , FILE );
110	static void fillcheck(struct colormap , union tree , int, FILE *);
111	static void dumpchr(pchr, FILE *);
112	#endif
113	/* === regc_nfa.c === */
114	static struct nfa newnfa(struct vars , struct colormap , struct nfa );
115	static void freenfa(struct nfa *);
116	static struct state newstate(struct nfa );
117	static struct state newfstate(struct nfa , int flag);
118	static void dropstate(struct nfa , struct state );
119	static void freestate(struct nfa , struct state );
120	static void destroystate(struct nfa , struct state );
121	static void newarc(struct nfa , int, pcolor, struct state , struct state *);
122	static struct arc allocarc(struct nfa , struct state *);
123	static void freearc(struct nfa , struct arc );
124	static struct arc findarc(struct state , int, pcolor);
125	static void cparc(struct nfa , struct arc , struct state , struct state );
126	static void moveins(struct nfa , struct state , struct state *);
127	static void copyins(struct nfa , struct state , struct state *);
128	static void moveouts(struct nfa , struct state , struct state *);
129	static void copyouts(struct nfa , struct state , struct state *);
130	static void cloneouts(struct nfa , struct state , struct state , struct state , int);
131	static void delsub(struct nfa , struct state , struct state *);
132	static void deltraverse(struct nfa , struct state , struct state *);
133	static void dupnfa(struct nfa , struct state , struct state , struct state , struct state *);
134	static void duptraverse(struct nfa , struct state , struct state *);
135	static void cleartraverse(struct nfa , struct state );
136	static void specialcolors(struct nfa *);
137	static long optimize(struct nfa , FILE );
138	static void pullback(struct nfa , FILE );
139	static int pull(struct nfa , struct arc );
140	static void pushfwd(struct nfa , FILE );
141	static int push(struct nfa , struct arc );
142	#define INCOMPATIBLE 1 /* destroys arc */
143	#define SATISFIED 2 /* constraint satisfied */
144	#define COMPATIBLE 3 /* compatible but not satisfied yet */
145	static int combine(struct arc , struct arc );
146	static void fixempties(struct nfa , FILE );
147	static int unempty(struct nfa , struct arc );
148	static void cleanup(struct nfa *);
149	static void markreachable(struct nfa , struct state , struct state , struct state );
150	static void markcanreach(struct nfa , struct state , struct state , struct state );
151	static long analyze(struct nfa *);
152	static void compact(struct nfa , struct cnfa );
153	static void carcsort(struct carc , struct carc );
154	static void freecnfa(struct cnfa *);
155	static void dumpnfa(struct nfa , FILE );
156	#ifdef REG_DEBUG
157	static void dumpstate(struct state , FILE );
158	static void dumparcs(struct state , FILE );
159	static int dumprarcs(struct arc , struct state , FILE *, int);
160	static void dumparc(struct arc , struct state , FILE *);
161	#endif
162	static void dumpcnfa(struct cnfa , FILE );
163	#ifdef REG_DEBUG
164	static void dumpcstate(int, struct carc , struct cnfa , FILE *);
165	#endif
166	/* === regc_cvec.c === */
167	static struct cvec clearcvec(struct cvec );
168	static void addchr(struct cvec *, pchr);
169	static void addrange(struct cvec *, pchr, pchr);
170	static struct cvec *newcvec(int, int);
171	static struct cvec getcvec(struct vars , int, int);
172	static void freecvec(struct cvec *);
173	/* === regc_locale.c === */
174	static celt element(struct vars , const chr , const chr *);
175	static struct cvec range(struct vars , celt, celt, int);
176	static int before(celt, celt);
177	static struct cvec eclass(struct vars , celt, int);
178	static struct cvec cclass(struct vars , const chr , const chr , int);
179	static struct cvec allcases(struct vars , pchr);
180	static int cmp(const chr , const chr , size_t);
181	static int casecmp(const chr , const chr , size_t);
182	/* automatically gathered by fwd; do not hand-edit */
183	/* =====^!^===== end forwards =====^!^===== */
184
185	/* internal variables, bundled for easy passing around */
186	struct vars {
187	regex_t *re;
188	const chr now; / scan pointer into string */
189	const chr stop; / end of string */
190	const chr savenow; / saved now and stop for "subroutine call" */
191	const chr *savestop;
192	int err; /* error code (0 if none) */
193	int cflags; /* copy of compile flags */
194	int lasttype; /* type of previous token */
195	int nexttype; /* type of next token */
196	chr nextvalue; /* value (if any) of next token */
197	int lexcon; /* lexical context type (see lex.c) */
198	int nsubexp; /* subexpression count */
199	struct subre *subs; / subRE pointer vector */
200	size_t nsubs; /* length of vector */
201	struct subre sub10[10]; / initial vector, enough for most */
202	struct nfa nfa; / the NFA */
203	struct colormap cm; / character color map */
204	color nlcolor; /* color of newline */
205	struct state wordchrs; / state in nfa holding word-char outarcs */
206	struct subre tree; / subexpression tree */
207	struct subre treechain; / all tree nodes allocated */
208	struct subre treefree; / any free tree nodes */
209	int ntree; /* number of tree nodes */
210	struct cvec cv; / interface cvec */
211	struct cvec cv2; / utility cvec */
212	struct subre lacons; / lookahead-constraint vector */
213	int nlacons; /* size of lacons */
214	};
215
/*
 * parsing macros; most know that `v' is the struct vars pointer
 *
 * The NOERR*() family expands to a braced `if ... return', so each may only
 * be used inside a function whose return type matches the variant.
 */
#define NEXT()          (next(v))               /* advance by one token */
#define SEE(t)          (v->nexttype == (t))    /* is next token this? */
#define EAT(t)          (SEE(t) && next(v))     /* if next is this, swallow it */
#define VISERR(vv)      ((vv)->err != 0)        /* have we seen an error yet? */
#define ISERR()         VISERR(v)
/* first error wins: force the token stream to EOS, keep any earlier code */
#define VERR(vv,e) \
        ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err : ((vv)->err = (e)))
#define ERR(e)          VERR(v, e)              /* record an error */
#define NOERR()         {if (ISERR()) return;}  /* if error seen, return */
#define NOERRN()        {if (ISERR()) return NULL;}     /* NOERR with retval */
#define NOERRZ()        {if (ISERR()) return 0;}        /* NOERR with retval */
#define INSIST(c, e)    ((c) ? 0 : ERR(e))      /* if condition false, error */
#define NOTE(b)         (v->re->re_info |= (b)) /* note visible condition */
#define EMPTYARC(x, y)  newarc(v->nfa, EMPTY, 0, x, y)
231
/*
 * token type codes, some also used as NFA arc types
 *
 * Each code is a distinct character constant, so they can share a switch
 * with literal punctuation tokens such as '(' or '*'.
 */
#define EMPTY   'n'     /* no token present */
#define EOS     'e'     /* end of string */
#define PLAIN   'p'     /* ordinary character */
#define DIGIT   'd'     /* digit (in bound) */
#define BACKREF 'b'     /* back reference */
#define COLLEL  'I'     /* start of [. */
#define ECLASS  'E'     /* start of [= */
#define CCLASS  'C'     /* start of [: */
#define END     'X'     /* end of [. [= [: */
#define RANGE   'R'     /* - within [] which might be range delim. */
#define LACON   'L'     /* lookahead constraint subRE */
#define AHEAD   'a'     /* color-lookahead arc */
#define BEHIND  'r'     /* color-lookbehind arc */
#define WBDRY   'w'     /* word boundary constraint */
#define NWBDRY  'W'     /* non-word-boundary constraint */
#define SBEGIN  'A'     /* beginning of string (even if not BOL) */
#define SEND    'Z'     /* end of string (even if not EOL) */
#define PREFER  'P'     /* length preference */
251
/* is an arc colored, and hence on a color chain? (a is a struct arc pointer) */
#define COLORED(a) \
        ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)
255
256	/* static function list */
257	static struct fns functions = {
258	rfree, /* regfree insides */
259	};
260
261	/*
262	- compile - compile regular expression
263	^ int compile(regex_t , const chr , size_t, int);
264	*/
265	int
266	compile(
267	regex_t *re,
268	const chr *string,
269	size_t len,
270	int flags)
271	{
272	AllocVars(v);
273	struct guts *g;
274	int i;
275	size_t j;
276	FILE *debug = (flags&REG_PROGRESS) ? stdout : NULL;
277	#define CNOERR() { if (ISERR()) return freev(v, v->err); }
278
279	/*
280	* Sanity checks.
281	*/
282
283	if (re == NULL \|\| string == NULL) {
284	FreeVars(v);
285	return REG_INVARG;
286	}
287	if ((flags&REG_QUOTE) && (flags&(REG_ADVANCED\|REG_EXPANDED\|REG_NEWLINE))) {
288	FreeVars(v);
289	return REG_INVARG;
290	}
291	if (!(flags&REG_EXTENDED) && (flags&REG_ADVF)) {
292	FreeVars(v);
293	return REG_INVARG;
294	}
295
296	/*
297	* Initial setup (after which freev() is callable).
298	*/
299
300	v->re = re;
301	v->now = string;
302	v->stop = v->now + len;
303	v->savenow = v->savestop = NULL;
304	v->err = 0;
305	v->cflags = flags;
306	v->nsubexp = 0;
307	v->subs = v->sub10;
308	v->nsubs = 10;
309	for (j = 0; j < v->nsubs; j++) {
310	v->subs[j] = NULL;
311	}
312	v->nfa = NULL;
313	v->cm = NULL;
314	v->nlcolor = COLORLESS;
315	v->wordchrs = NULL;
316	v->tree = NULL;
317	v->treechain = NULL;
318	v->treefree = NULL;
319	v->cv = NULL;
320	v->cv2 = NULL;
321	v->lacons = NULL;
322	v->nlacons = 0;
323	re->re_magic = REMAGIC;
324	re->re_info = 0; /* bits get set during parse */
325	re->re_csize = sizeof(chr);
326	re->re_guts = NULL;
327	re->re_fns = VS(&functions);
328
329	/*
330	* More complex setup, malloced things.
331	*/
332
333	re->re_guts = VS(MALLOC(sizeof(struct guts)));
334	if (re->re_guts == NULL) {
335	return freev(v, REG_ESPACE);
336	}
337	g = (struct guts *) re->re_guts;
338	g->tree = NULL;
339	initcm(v, &g->cmap);
340	v->cm = &g->cmap;
341	g->lacons = NULL;
342	g->nlacons = 0;
343	ZAPCNFA(g->search);
344	v->nfa = newnfa(v, v->cm, NULL);
345	CNOERR();
346	v->cv = newcvec(100, 20);
347	if (v->cv == NULL) {
348	return freev(v, REG_ESPACE);
349	}
350
351	/*
352	* Parsing.
353	*/
354
355	lexstart(v); /* also handles prefixes */
356	if ((v->cflags&REG_NLSTOP) \|\| (v->cflags&REG_NLANCH)) {
357	/*
358	* Assign newline a unique color.
359	*/
360
361	v->nlcolor = subcolor(v->cm, newline());
362	okcolors(v->nfa, v->cm);
363	}
364	CNOERR();
365	v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
366	assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
367	CNOERR();
368	assert(v->tree != NULL);
369
370	/*
371	* Finish setup of nfa and its subre tree.
372	*/
373
374	specialcolors(v->nfa);
375	CNOERR();
376	if (debug != NULL) {
377	fprintf(debug, "\n\n\n========= RAW ==========\n");
378	dumpnfa(v->nfa, debug);
379	dumpst(v->tree, debug, 1);
380	}
381	optst(v, v->tree);
382	v->ntree = numst(v->tree, 1);
383	markst(v->tree);
384	cleanst(v);
385	if (debug != NULL) {
386	fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
387	dumpst(v->tree, debug, 1);
388	}
389
390	/*
391	* Build compacted NFAs for tree and lacons.
392	*/
393
394	re->re_info \|= nfatree(v, v->tree, debug);
395	CNOERR();
396	assert(v->nlacons == 0 \|\| v->lacons != NULL);
397	for (i = 1; i < v->nlacons; i++) {
398	if (debug != NULL) {
399	fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
400	}
401	nfanode(v, &v->lacons[i], debug);
402	}
403	CNOERR();
404	if (v->tree->flags&SHORTER) {
405	NOTE(REG_USHORTEST);
406	}
407
408	/*
409	* Build compacted NFAs for tree, lacons, fast search.
410	*/
411
412	if (debug != NULL) {
413	fprintf(debug, "\n\n\n========= SEARCH ==========\n");
414	}
415
416	/*
417	* Can sacrifice main NFA now, so use it as work area.
418	*/
419
420	(DISCARD) optimize(v->nfa, debug);
421	CNOERR();
422	makesearch(v, v->nfa);
423	CNOERR();
424	compact(v->nfa, &g->search);
425	CNOERR();
426
427	/*
428	* Looks okay, package it up.
429	*/
430
431	re->re_nsub = v->nsubexp;
432	v->re = NULL; /* freev no longer frees re */
433	g->magic = GUTSMAGIC;
434	g->cflags = v->cflags;
435	g->info = re->re_info;
436	g->nsub = re->re_nsub;
437	g->tree = v->tree;
438	v->tree = NULL;
439	g->ntree = v->ntree;
440	g->compare = (v->cflags&REG_ICASE) ? casecmp : cmp;
441	g->lacons = v->lacons;
442	v->lacons = NULL;
443	g->nlacons = v->nlacons;
444
445	if (flags&REG_DUMP) {
446	dump(re, stdout);
447	}
448
449	assert(v->err == 0);
450	return freev(v, 0);
451	}
452
453	/*
454	- moresubs - enlarge subRE vector
455	^ static void moresubs(struct vars *, int);
456	*/
457	static void
458	moresubs(
459	struct vars *v,
460	int wanted) /* want enough room for this one */
461	{
462	struct subre **p;
463	size_t n;
464
465	assert(wanted > 0 && (size_t)wanted >= v->nsubs);
466	n = (size_t)wanted * 3 / 2 + 1;
467	if (v->subs == v->sub10) {
468	p = (struct subre *) MALLOC(n sizeof(struct subre *));
469	if (p != NULL) {
470	memcpy(p, v->subs, v->nsubs * sizeof(struct subre *));
471	}
472	} else {
473	p = (struct subre *) REALLOC(v->subs, nsizeof(struct subre *));
474	}
475	if (p == NULL) {
476	ERR(REG_ESPACE);
477	return;
478	}
479
480	v->subs = p;
481	for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) {
482	*p = NULL;
483	}
484	assert(v->nsubs == n);
485	assert((size_t)wanted < v->nsubs);
486	}
487
488	/*
489	- freev - free vars struct's substructures where necessary
490	* Optionally does error-number setting, and always returns error code (if
491	* any), to make error-handling code terser.
492	^ static int freev(struct vars *, int);
493	*/
494	static int
495	freev(
496	struct vars *v,
497	int err)
498	{
499	register int ret;
500
501	if (v->re != NULL) {
502	rfree(v->re);
503	}
504	if (v->subs != v->sub10) {
505	FREE(v->subs);
506	}
507	if (v->nfa != NULL) {
508	freenfa(v->nfa);
509	}
510	if (v->tree != NULL) {
511	freesubre(v, v->tree);
512	}
513	if (v->treechain != NULL) {
514	cleanst(v);
515	}
516	if (v->cv != NULL) {
517	freecvec(v->cv);
518	}
519	if (v->cv2 != NULL) {
520	freecvec(v->cv2);
521	}
522	if (v->lacons != NULL) {
523	freelacons(v->lacons, v->nlacons);
524	}
525	ERR(err); /* nop if err==0 */
526
527	ret = v->err;
528	FreeVars(v);
529	return ret;
530	}
531
532	/*
533	- makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
534	* NFA must have been optimize()d already.
535	^ static void makesearch(struct vars , struct nfa );
536	*/
537	static void
538	makesearch(
539	struct vars *v,
540	struct nfa *nfa)
541	{
542	struct arc a, b;
543	struct state *pre = nfa->pre;
544	struct state s, s2, *slist;
545
546	/*
547	* No loops are needed if it's anchored.
548	*/
549
550	for (a = pre->outs; a != NULL; a = a->outchain) {
551	assert(a->type == PLAIN);
552	if (a->co != nfa->bos[0] && a->co != nfa->bos[1]) {
553	break;
554	}
555	}
556	if (a != NULL) {
557	/*
558	* Add implicit .* in front.
559	*/
560
561	rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
562
563	/*
564	* And ^* and \A* too -- not always necessary, but harmless.
565	*/
566
567	newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
568	newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
569	}
570
571	/*
572	* Now here's the subtle part. Because many REs have no lookback
573	* constraints, often knowing when you were in the pre state tells you
574	* little; it's the next state(s) that are informative. But some of them
575	* may have other inarcs, i.e. it may be possible to make actual progress
576	* and then return to one of them. We must de-optimize such cases,
577	* splitting each such state into progress and no-progress states.
578	*/
579
580	/*
581	* First, make a list of the states.
582	*/
583
584	slist = NULL;
585	for (a=pre->outs ; a!=NULL ; a=a->outchain) {
586	s = a->to;
587	for (b=s->ins ; b!=NULL ; b=b->inchain) {
588	if (b->from != pre) {
589	break;
590	}
591	}
592	if (b != NULL && s->tmp == NULL) {
593	/*
594	* Must be split if not already in the list (fixes bugs 505048,
595	* 230589, 840258, 504785).
596	*/
597
598	s->tmp = slist;
599	slist = s;
600	}
601	}
602
603	/*
604	* Do the splits.
605	*/
606
607	for (s=slist ; s!=NULL ; s=s2) {
608	s2 = newstate(nfa);
609
610	copyouts(nfa, s, s2);
611	for (a=s->ins ; a!=NULL ; a=b) {
612	b = a->inchain;
613
614	if (a->from != pre) {
615	cparc(nfa, a, a->from, s2);
616	freearc(nfa, a);
617	}
618	}
619	s2 = s->tmp;
620	s->tmp = NULL; /* clean up while we're at it */
621	}
622	}
623
624	/*
625	- parse - parse an RE
626	* This is actually just the top level, which parses a bunch of branches tied
627	* together with '\|'. They appear in the tree as the left children of a chain
628	* of '\|' subres.
629	^ static struct subre parse(struct vars , int, int, struct state *,
630	^ struct state *);
631	*/
632	static struct subre *
633	parse(
634	struct vars *v,
635	int stopper, /* EOS or ')' */
636	int type, /* LACON (lookahead subRE) or PLAIN */
637	struct state init, / initial state */
638	struct state final) / final state */
639	{
640	struct state left, right; /* scaffolding for branch */
641	struct subre branches; / top level */
642	struct subre branch; / current branch */
643	struct subre t; / temporary */
644	int firstbranch; /* is this the first branch? */
645
646	assert(stopper == ')' \|\| stopper == EOS);
647
648	branches = subre(v, '\|', LONGER, init, final);
649	NOERRN();
650	branch = branches;
651	firstbranch = 1;
652	do { /* a branch */
653	if (!firstbranch) {
654	/*
655	* Need a place to hang the branch.
656	*/
657
658	branch->right = subre(v, '\|', LONGER, init, final);
659	NOERRN();
660	branch = branch->right;
661	}
662	firstbranch = 0;
663	left = newstate(v->nfa);
664	right = newstate(v->nfa);
665	NOERRN();
666	EMPTYARC(init, left);
667	EMPTYARC(right, final);
668	NOERRN();
669	branch->left = parsebranch(v, stopper, type, left, right, 0);
670	NOERRN();
671	branch->flags \|= UP(branch->flags \| branch->left->flags);
672	if ((branch->flags &~ branches->flags) != 0) { /* new flags */
673	for (t = branches; t != branch; t = t->right) {
674	t->flags \|= branch->flags;
675	}
676	}
677	} while (EAT('\|'));
678	assert(SEE(stopper) \|\| SEE(EOS));
679
680	if (!SEE(stopper)) {
681	assert(stopper == ')' && SEE(EOS));
682	ERR(REG_EPAREN);
683	}
684
685	/*
686	* Optimize out simple cases.
687	*/
688
689	if (branch == branches) { /* only one branch */
690	assert(branch->right == NULL);
691	t = branch->left;
692	branch->left = NULL;
693	freesubre(v, branches);
694	branches = t;
695	} else if (!MESSY(branches->flags)) { /* no interesting innards */
696	freesubre(v, branches->left);
697	branches->left = NULL;
698	freesubre(v, branches->right);
699	branches->right = NULL;
700	branches->op = '=';
701	}
702
703	return branches;
704	}
705
706	/*
707	- parsebranch - parse one branch of an RE
708	* This mostly manages concatenation, working closely with parseqatom().
709	* Concatenated things are bundled up as much as possible, with separate
710	* ',' nodes introduced only when necessary due to substructure.
711	^ static struct subre parsebranch(struct vars , int, int, struct state *,
712	^ struct state *, int);
713	*/
714	static struct subre *
715	parsebranch(
716	struct vars *v,
717	int stopper, /* EOS or ')' */
718	int type, /* LACON (lookahead subRE) or PLAIN */
719	struct state left, / leftmost state */
720	struct state right, / rightmost state */
721	int partial) /* is this only part of a branch? */
722	{
723	struct state lp; / left end of current construct */
724	int seencontent; /* is there anything in this branch yet? */
725	struct subre *t;
726
727	lp = left;
728	seencontent = 0;
729	t = subre(v, '=', 0, left, right); /* op '=' is tentative */
730	NOERRN();
731	while (!SEE('\|') && !SEE(stopper) && !SEE(EOS)) {
732	if (seencontent) { /* implicit concat operator */
733	lp = newstate(v->nfa);
734	NOERRN();
735	moveins(v->nfa, right, lp);
736	}
737	seencontent = 1;
738
739	/* NB, recursion in parseqatom() may swallow rest of branch */
740	parseqatom(v, stopper, type, lp, right, t);
741	}
742
743	if (!seencontent) { /* empty branch */
744	if (!partial) {
745	NOTE(REG_UUNSPEC);
746	}
747	assert(lp == left);
748	EMPTYARC(left, right);
749	}
750
751	return t;
752	}
753
754	/*
755	- parseqatom - parse one quantified atom or constraint of an RE
756	* The bookkeeping near the end cooperates very closely with parsebranch(); in
757	* particular, it contains a recursion that can involve parsing the rest of
758	* the branch, making this function's name somewhat inaccurate.
759	^ static void parseqatom(struct vars *, int, int, struct state *,
760	^ struct state *, struct subre *);
761	*/
762	static void
763	parseqatom(
764	struct vars *v,
765	int stopper, /* EOS or ')' */
766	int type, /* LACON (lookahead subRE) or PLAIN */
767	struct state lp, / left state to hang it on */
768	struct state rp, / right state to hang it on */
769	struct subre top) / subtree top */
770	{
771	struct state s; / temporaries for new states */
772	struct state *s2;
773	#define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
774	int m, n;
775	struct subre atom; / atom's subtree */
776	struct subre *t;
777	int cap; /* capturing parens? */
778	int pos; /* positive lookahead? */
779	int subno; /* capturing-parens or backref number */
780	int atomtype;
781	int qprefer; /* quantifier short/long preference */
782	int f;
783	struct subre *atomp; / where the pointer to atom is */
784
785	/*
786	* Initial bookkeeping.
787	*/
788
789	atom = NULL;
790	assert(lp->nouts == 0); /* must string new code */
791	assert(rp->nins == 0); /* between lp and rp */
792	subno = 0; /* just to shut lint up */
793
794	/*
795	* An atom or constraint...
796	*/
797
798	atomtype = v->nexttype;
799	switch (atomtype) {
800	/* first, constraints, which end by returning */
801	case '^':
802	ARCV('^', 1);
803	if (v->cflags&REG_NLANCH) {
804	ARCV(BEHIND, v->nlcolor);
805	}
806	NEXT();
807	return;
808	case '$':
809	ARCV('$', 1);
810	if (v->cflags&REG_NLANCH) {
811	ARCV(AHEAD, v->nlcolor);
812	}
813	NEXT();
814	return;
815	case SBEGIN:
816	ARCV('^', 1); /* BOL */
817	ARCV('^', 0); /* or BOS */
818	NEXT();
819	return;
820	case SEND:
821	ARCV('$', 1); /* EOL */
822	ARCV('$', 0); /* or EOS */
823	NEXT();
824	return;
825	case '<':
826	wordchrs(v); /* does NEXT() */
827	s = newstate(v->nfa);
828	NOERR();
829	nonword(v, BEHIND, lp, s);
830	word(v, AHEAD, s, rp);
831	return;
832	case '>':
833	wordchrs(v); /* does NEXT() */
834	s = newstate(v->nfa);
835	NOERR();
836	word(v, BEHIND, lp, s);
837	nonword(v, AHEAD, s, rp);
838	return;
839	case WBDRY:
840	wordchrs(v); /* does NEXT() */
841	s = newstate(v->nfa);
842	NOERR();
843	nonword(v, BEHIND, lp, s);
844	word(v, AHEAD, s, rp);
845	s = newstate(v->nfa);
846	NOERR();
847	word(v, BEHIND, lp, s);
848	nonword(v, AHEAD, s, rp);
849	return;
850	case NWBDRY:
851	wordchrs(v); /* does NEXT() */
852	s = newstate(v->nfa);
853	NOERR();
854	word(v, BEHIND, lp, s);
855	word(v, AHEAD, s, rp);
856	s = newstate(v->nfa);
857	NOERR();
858	nonword(v, BEHIND, lp, s);
859	nonword(v, AHEAD, s, rp);
860	return;
861	case LACON: /* lookahead constraint */
862	pos = v->nextvalue;
863	NEXT();
864	s = newstate(v->nfa);
865	s2 = newstate(v->nfa);
866	NOERR();
867	t = parse(v, ')', LACON, s, s2);
868	freesubre(v, t); /* internal structure irrelevant */
869	assert(SEE(')') \|\| ISERR());
870	NEXT();
871	n = newlacon(v, s, s2, pos);
872	NOERR();
873	ARCV(LACON, n);
874	return;
875
876	/*
877	* Then errors, to get them out of the way.
878	*/
879
880	case '*':
881	case '+':
882	case '?':
883	case '{':
884	ERR(REG_BADRPT);
885	return;
886	default:
887	ERR(REG_ASSERT);
888	return;
889
890	/*
891	* Then plain characters, and minor variants on that theme.
892	*/
893
894	case ')': /* unbalanced paren */
895	if ((v->cflags&REG_ADVANCED) != REG_EXTENDED) {
896	ERR(REG_EPAREN);
897	return;
898	}
899
900	/*
901	* Legal in EREs due to specification botch.
902	*/
903
904	NOTE(REG_UPBOTCH);
905	/* fallthrough into case PLAIN */
906	case PLAIN:
907	onechr(v, v->nextvalue, lp, rp);
908	okcolors(v->nfa, v->cm);
909	NOERR();
910	NEXT();
911	break;
912	case '[':
913	if (v->nextvalue == 1) {
914	bracket(v, lp, rp);
915	} else {
916	cbracket(v, lp, rp);
917	}
918	assert(SEE(']') \|\| ISERR());
919	NEXT();
920	break;
921	case '.':
922	rainbow(v->nfa, v->cm, PLAIN,
923	(v->cflags&REG_NLSTOP) ? v->nlcolor : COLORLESS, lp, rp);
924	NEXT();
925	break;
926
927	/*
928	* And finally the ugly stuff.
929	*/
930
931	case '(': /* value flags as capturing or non */
932	cap = (type == LACON) ? 0 : v->nextvalue;
933	if (cap) {
934	v->nsubexp++;
935	subno = v->nsubexp;
936	if ((size_t)subno >= v->nsubs) {
937	moresubs(v, subno);
938	}
939	assert((size_t)subno < v->nsubs);
940	} else {
941	atomtype = PLAIN; /* something that's not '(' */
942	}
943	NEXT();
944
945	/*
946	* Need new endpoints because tree will contain pointers.
947	*/
948
949	s = newstate(v->nfa);
950	s2 = newstate(v->nfa);
951	NOERR();
952	EMPTYARC(lp, s);
953	EMPTYARC(s2, rp);
954	NOERR();
955	atom = parse(v, ')', PLAIN, s, s2);
956	assert(SEE(')') \|\| ISERR());
957	NEXT();
958	NOERR();
959	if (cap) {
960	v->subs[subno] = atom;
961	t = subre(v, '(', atom->flags\|CAP, lp, rp);
962	NOERR();
963	t->subno = subno;
964	t->left = atom;
965	atom = t;
966	}
967
968	/*
969	* Postpone everything else pending possible {0}.
970	*/
971
972	break;
973	case BACKREF: /* the Feature From The Black Lagoon */
974	INSIST(type != LACON, REG_ESUBREG);
975	INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
976	INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
977	NOERR();
978	assert(v->nextvalue > 0);
979	atom = subre(v, 'b', BACKR, lp, rp);
980	subno = v->nextvalue;
981	atom->subno = subno;
982	EMPTYARC(lp, rp); /* temporarily, so there's something */
983	NEXT();
984	break;
985	}
986
987	/*
988	* ...and an atom may be followed by a quantifier.
989	*/
990
991	switch (v->nexttype) {
992	case '*':
993	m = 0;
994	n = INFINITY;
995	qprefer = (v->nextvalue) ? LONGER : SHORTER;
996	NEXT();
997	break;
998	case '+':
999	m = 1;
1000	n = INFINITY;
1001	qprefer = (v->nextvalue) ? LONGER : SHORTER;
1002	NEXT();
1003	break;
1004	case '?':
1005	m = 0;
1006	n = 1;
1007	qprefer = (v->nextvalue) ? LONGER : SHORTER;
1008	NEXT();
1009	break;
1010	case '{':
1011	NEXT();
1012	m = scannum(v);
1013	if (EAT(',')) {
1014	if (SEE(DIGIT)) {
1015	n = scannum(v);
1016	} else {
1017	n = INFINITY;
1018	}
1019	if (m > n) {
1020	ERR(REG_BADBR);
1021	return;
1022	}
1023
1024	/*
1025	* {m,n} exercises preference, even if it's {m,m}
1026	*/
1027
1028	qprefer = (v->nextvalue) ? LONGER : SHORTER;
1029	} else {
1030	n = m;
1031	/*
1032	* {m} passes operand's preference through.
1033	*/
1034
1035	qprefer = 0;
1036	}
1037	if (!SEE('}')) { /* catches errors too */
1038	ERR(REG_BADBR);
1039	return;
1040	}
1041	NEXT();
1042	break;
1043	default: /* no quantifier */
1044	m = n = 1;
1045	qprefer = 0;
1046	break;
1047	}
1048
1049	/*
1050	* Annoying special case: {0} or {0,0} cancels everything.
1051	*/
1052
1053	if (m == 0 && n == 0) {
1054	if (atom != NULL) {
1055	freesubre(v, atom);
1056	}
1057	if (atomtype == '(') {
1058	v->subs[subno] = NULL;
1059	}
1060	delsub(v->nfa, lp, rp);
1061	EMPTYARC(lp, rp);
1062	return;
1063	}
1064
1065	/*
1066	* If not a messy case, avoid hard part.
1067	*/
1068
1069	assert(!MESSY(top->flags));
1070	f = top->flags \| qprefer \| ((atom != NULL) ? atom->flags : 0);
1071	if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {
1072	if (!(m == 1 && n == 1)) {
1073	repeat(v, lp, rp, m, n);
1074	}
1075	if (atom != NULL) {
1076	freesubre(v, atom);
1077	}
1078	top->flags = f;
1079	return;
1080	}
1081
1082	/*
1083	* hard part: something messy
1084	* That is, capturing parens, back reference, short/long clash, or an atom
1085	* with substructure containing one of those.
1086	*/
1087
1088	/*
1089	* Now we'll need a subre for the contents even if they're boring.
1090	*/
1091
1092	if (atom == NULL) {
1093	atom = subre(v, '=', 0, lp, rp);
1094	NOERR();
1095	}
1096
1097	/*
1098	* Prepare a general-purpose state skeleton.
1099	*
1100	* ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1101	* / /
1102	* [lp] ----> [s2] ----bypass---------------------
1103	*
1104	* where bypass is an empty, and prefix is some repetitions of atom
1105	*/
1106
1107	s = newstate(v->nfa); /* first, new endpoints for the atom */
1108	s2 = newstate(v->nfa);
1109	NOERR();
1110	moveouts(v->nfa, lp, s);
1111	moveins(v->nfa, rp, s2);
1112	NOERR();
1113	atom->begin = s;
1114	atom->end = s2;
1115	s = newstate(v->nfa); /* and spots for prefix and bypass */
1116	s2 = newstate(v->nfa);
1117	NOERR();
1118	EMPTYARC(lp, s);
1119	EMPTYARC(lp, s2);
1120	NOERR();
1121
1122	/*
1123	* Break remaining subRE into x{...} and what follows.
1124	*/
1125
1126	t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1127	t->left = atom;
1128	atomp = &t->left;
1129
1130	/*
1131	* Here we should recurse... but we must postpone that to the end.
1132	*/
1133
1134	/*
1135	* Split top into prefix and remaining.
1136	*/
1137
1138	assert(top->op == '=' && top->left == NULL && top->right == NULL);
1139	top->left = subre(v, '=', top->flags, top->begin, lp);
1140	top->op = '.';
1141	top->right = t;
1142
1143	/*
1144	* If it's a backref, now is the time to replicate the subNFA.
1145	*/
1146
1147	if (atomtype == BACKREF) {
1148	assert(atom->begin->nouts == 1); /* just the EMPTY */
1149	delsub(v->nfa, atom->begin, atom->end);
1150	assert(v->subs[subno] != NULL);
1151
1152	/*
1153	* And here's why the recursion got postponed: it must wait until the
1154	* skeleton is filled in, because it may hit a backref that wants to
1155	* copy the filled-in skeleton.
1156	*/
1157
1158	dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1159	atom->begin, atom->end);
1160	NOERR();
1161	}
1162
1163	/*
1164	* It's quantifier time; first, turn x{0,...} into x{1,...}\|empty
1165	*/
1166
1167	if (m == 0) {
1168	EMPTYARC(s2, atom->end);/* the bypass */
1169	assert(PREF(qprefer) != 0);
1170	f = COMBINE(qprefer, atom->flags);
1171	t = subre(v, '\|', f, lp, atom->end);
1172	NOERR();
1173	t->left = atom;
1174	t->right = subre(v, '\|', PREF(f), s2, atom->end);
1175	NOERR();
1176	t->right->left = subre(v, '=', 0, s2, atom->end);
1177	NOERR();
1178	*atomp = t;
1179	atomp = &t->left;
1180	m = 1;
1181	}
1182
1183	/*
1184	* Deal with the rest of the quantifier.
1185	*/
1186
1187	if (atomtype == BACKREF) {
1188	/*
1189	* Special case: backrefs have internal quantifiers.
1190	*/
1191
1192	EMPTYARC(s, atom->begin); /* empty prefix */
1193
1194	/*
1195	* Just stuff everything into atom.
1196	*/
1197
1198	repeat(v, atom->begin, atom->end, m, n);
1199	atom->min = (short) m;
1200	atom->max = (short) n;
1201	atom->flags \|= COMBINE(qprefer, atom->flags);
1202	} else if (m == 1 && n == 1) {
1203	/*
1204	* No/vacuous quantifier: done.
1205	*/
1206
1207	EMPTYARC(s, atom->begin); /* empty prefix */
1208	} else {
1209	/*
1210	* Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only second
1211	* x
1212	*/
1213
1214	dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1215	assert(m >= 1 && m != INFINITY && n >= 1);
1216	repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
1217	f = COMBINE(qprefer, atom->flags);
1218	t = subre(v, '.', f, s, atom->end); /* prefix and atom */
1219	NOERR();
1220	t->left = subre(v, '=', PREF(f), s, atom->begin);
1221	NOERR();
1222	t->right = atom;
1223	*atomp = t;
1224	}
1225
1226	/*
1227	* And finally, look after that postponed recursion.
1228	*/
1229
1230	t = top->right;
1231	if (!(SEE('\|') \|\| SEE(stopper) \|\| SEE(EOS))) {
1232	t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
1233	} else {
1234	EMPTYARC(atom->end, rp);
1235	t->right = subre(v, '=', 0, atom->end, rp);
1236	}
1237	assert(SEE('\|') \|\| SEE(stopper) \|\| SEE(EOS));
1238	t->flags \|= COMBINE(t->flags, t->right->flags);
1239	top->flags \|= COMBINE(top->flags, t->flags);
1240	}
1241
1242	/*
1243	- nonword - generate arcs for non-word-character ahead or behind
1244	^ static void nonword(struct vars , int, struct state , struct state *);
1245	*/
1246	static void
1247	nonword(
1248	struct vars *v,
1249	int dir, /* AHEAD or BEHIND */
1250	struct state *lp,
1251	struct state *rp)
1252	{
1253	int anchor = (dir == AHEAD) ? '$' : '^';
1254
1255	assert(dir == AHEAD \|\| dir == BEHIND);
1256	newarc(v->nfa, anchor, 1, lp, rp);
1257	newarc(v->nfa, anchor, 0, lp, rp);
1258	colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1259	/* (no need for special attention to \n) */
1260	}
1261
1262	/*
1263	- word - generate arcs for word character ahead or behind
1264	^ static void word(struct vars , int, struct state , struct state *);
1265	*/
1266	static void
1267	word(
1268	struct vars *v,
1269	int dir, /* AHEAD or BEHIND */
1270	struct state *lp,
1271	struct state *rp)
1272	{
1273	assert(dir == AHEAD \|\| dir == BEHIND);
1274	cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1275	/* (no need for special attention to \n) */
1276	}
1277
1278	/*
1279	- scannum - scan a number
1280	^ static int scannum(struct vars *);
1281	*/
1282	static int /* value, <= DUPMAX */
1283	scannum(
1284	struct vars *v)
1285	{
1286	int n = 0;
1287
1288	while (SEE(DIGIT) && n < DUPMAX) {
1289	n = n*10 + v->nextvalue;
1290	NEXT();
1291	}
1292	if (SEE(DIGIT) \|\| n > DUPMAX) {
1293	ERR(REG_BADBR);
1294	return 0;
1295	}
1296	return n;
1297	}
1298
1299	/*
1300	- repeat - replicate subNFA for quantifiers
1301	* The duplication sequences used here are chosen carefully so that any
1302	* pointers starting out pointing into the subexpression end up pointing into
1303	* the last occurrence. (Note that it may not be strung between the same left
1304	* and right end states, however!) This used to be important for the subRE
1305	* tree, although the important bits are now handled by the in-line code in
1306	* parse(), and when this is called, it doesn't matter any more.
1307	^ static void repeat(struct vars , struct state , struct state *, int, int);
1308	*/
1309	static void
1310	repeat(
1311	struct vars *v,
1312	struct state *lp,
1313	struct state *rp,
1314	int m,
1315	int n)
1316	{
1317	#define SOME 2
1318	#define INF 3
1319	#define PAIR(x, y) ((x)*4 + (y))
1320	#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
1321	const int rm = REDUCE(m);
1322	const int rn = REDUCE(n);
1323	struct state s, s2;
1324
1325	switch (PAIR(rm, rn)) {
1326	case PAIR(0, 0): /* empty string */
1327	delsub(v->nfa, lp, rp);
1328	EMPTYARC(lp, rp);
1329	break;
1330	case PAIR(0, 1): /* do as x\| */
1331	EMPTYARC(lp, rp);
1332	break;
1333	case PAIR(0, SOME): /* do as x{1,n}\| */
1334	repeat(v, lp, rp, 1, n);
1335	NOERR();
1336	EMPTYARC(lp, rp);
1337	break;
1338	case PAIR(0, INF): /* loop x around */
1339	s = newstate(v->nfa);
1340	NOERR();
1341	moveouts(v->nfa, lp, s);
1342	moveins(v->nfa, rp, s);
1343	EMPTYARC(lp, s);
1344	EMPTYARC(s, rp);
1345	break;
1346	case PAIR(1, 1): /* no action required */
1347	break;
1348	case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}\|)x */
1349	s = newstate(v->nfa);
1350	NOERR();
1351	moveouts(v->nfa, lp, s);
1352	dupnfa(v->nfa, s, rp, lp, s);
1353	NOERR();
1354	repeat(v, lp, s, 1, n-1);
1355	NOERR();
1356	EMPTYARC(lp, s);
1357	break;
1358	case PAIR(1, INF): /* add loopback arc */
1359	s = newstate(v->nfa);
1360	s2 = newstate(v->nfa);
1361	NOERR();
1362	moveouts(v->nfa, lp, s);
1363	moveins(v->nfa, rp, s2);
1364	EMPTYARC(lp, s);
1365	EMPTYARC(s2, rp);
1366	EMPTYARC(s2, s);
1367	break;
1368	case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */
1369	s = newstate(v->nfa);
1370	NOERR();
1371	moveouts(v->nfa, lp, s);
1372	dupnfa(v->nfa, s, rp, lp, s);
1373	NOERR();
1374	repeat(v, lp, s, m-1, n-1);
1375	break;
1376	case PAIR(SOME, INF): /* do as x{m-1,}x */
1377	s = newstate(v->nfa);
1378	NOERR();
1379	moveouts(v->nfa, lp, s);
1380	dupnfa(v->nfa, s, rp, lp, s);
1381	NOERR();
1382	repeat(v, lp, s, m-1, n);
1383	break;
1384	default:
1385	ERR(REG_ASSERT);
1386	break;
1387	}
1388	}
1389
1390	/*
1391	- bracket - handle non-complemented bracket expression
1392	* Also called from cbracket for complemented bracket expressions.
1393	^ static void bracket(struct vars , struct state , struct state *);
1394	*/
1395	static void
1396	bracket(
1397	struct vars *v,
1398	struct state *lp,
1399	struct state *rp)
1400	{
1401	assert(SEE('['));
1402	NEXT();
1403	while (!SEE(']') && !SEE(EOS)) {
1404	brackpart(v, lp, rp);
1405	}
1406	assert(SEE(']') \|\| ISERR());
1407	okcolors(v->nfa, v->cm);
1408	}
1409
1410	/*
1411	- cbracket - handle complemented bracket expression
1412	* We do it by calling bracket() with dummy endpoints, and then complementing
1413	* the result. The alternative would be to invoke rainbow(), and then delete
1414	* arcs as the b.e. is seen... but that gets messy.
1415	^ static void cbracket(struct vars , struct state , struct state *);
1416	*/
1417	static void
1418	cbracket(
1419	struct vars *v,
1420	struct state *lp,
1421	struct state *rp)
1422	{
1423	struct state *left = newstate(v->nfa);
1424	struct state *right = newstate(v->nfa);
1425
1426	NOERR();
1427	bracket(v, left, right);
1428	if (v->cflags&REG_NLSTOP) {
1429	newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1430	}
1431	NOERR();
1432
1433	assert(lp->nouts == 0); /* all outarcs will be ours */
1434
1435	/*
1436	* Easy part of complementing, and all there is to do since the MCCE code
1437	* was removed.
1438	*/
1439
1440	colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1441	NOERR();
1442	dropstate(v->nfa, left);
1443	assert(right->nins == 0);
1444	freestate(v->nfa, right);
1445	return;
1446	}
1447
1448	/*
1449	- brackpart - handle one item (or range) within a bracket expression
1450	^ static void brackpart(struct vars , struct state , struct state *);
1451	*/
1452	static void
1453	brackpart(
1454	struct vars *v,
1455	struct state *lp,
1456	struct state *rp)
1457	{
1458	celt startc, endc;
1459	struct cvec *cv;
1460	const chr startp, endp;
1461	chr c[1];
1462
1463	/*
1464	* Parse something, get rid of special cases, take shortcuts.
1465	*/
1466
1467	switch (v->nexttype) {
1468	case RANGE: /* a-b-c or other botch */
1469	ERR(REG_ERANGE);
1470	return;
1471	break;
1472	case PLAIN:
1473	c[0] = v->nextvalue;
1474	NEXT();
1475
1476	/*
1477	* Shortcut for ordinary chr (not range).
1478	*/
1479
1480	if (!SEE(RANGE)) {
1481	onechr(v, c[0], lp, rp);
1482	return;
1483	}
1484	startc = element(v, c, c+1);
1485	NOERR();
1486	break;
1487	case COLLEL:
1488	startp = v->now;
1489	endp = scanplain(v);
1490	INSIST(startp < endp, REG_ECOLLATE);
1491	NOERR();
1492	startc = element(v, startp, endp);
1493	NOERR();
1494	break;
1495	case ECLASS:
1496	startp = v->now;
1497	endp = scanplain(v);
1498	INSIST(startp < endp, REG_ECOLLATE);
1499	NOERR();
1500	startc = element(v, startp, endp);
1501	NOERR();
1502	cv = eclass(v, startc, (v->cflags&REG_ICASE));
1503	NOERR();
1504	dovec(v, cv, lp, rp);
1505	return;
1506	break;
1507	case CCLASS:
1508	startp = v->now;
1509	endp = scanplain(v);
1510	INSIST(startp < endp, REG_ECTYPE);
1511	NOERR();
1512	cv = cclass(v, startp, endp, (v->cflags&REG_ICASE));
1513	NOERR();
1514	dovec(v, cv, lp, rp);
1515	return;
1516	break;
1517	default:
1518	ERR(REG_ASSERT);
1519	return;
1520	break;
1521	}
1522
1523	if (SEE(RANGE)) {
1524	NEXT();
1525	switch (v->nexttype) {
1526	case PLAIN:
1527	case RANGE:
1528	c[0] = v->nextvalue;
1529	NEXT();
1530	endc = element(v, c, c+1);
1531	NOERR();
1532	break;
1533	case COLLEL:
1534	startp = v->now;
1535	endp = scanplain(v);
1536	INSIST(startp < endp, REG_ECOLLATE);
1537	NOERR();
1538	endc = element(v, startp, endp);
1539	NOERR();
1540	break;
1541	default:
1542	ERR(REG_ERANGE);
1543	return;
1544	break;
1545	}
1546	} else {
1547	endc = startc;
1548	}
1549
1550	/*
1551	* Ranges are unportable. Actually, standard C does guarantee that digits
1552	* are contiguous, but making that an exception is just too complicated.
1553	*/
1554
1555	if (startc != endc) {
1556	NOTE(REG_UUNPORT);
1557	}
1558	cv = range(v, startc, endc, (v->cflags&REG_ICASE));
1559	NOERR();
1560	dovec(v, cv, lp, rp);
1561	}
1562
1563	/*
1564	- scanplain - scan PLAIN contents of [. etc.
1565	* Certain bits of trickery in lex.c know that this code does not try to look
1566	* past the final bracket of the [. etc.
1567	^ static const chr scanplain(struct vars );
1568	*/
1569	static const chr * /* just after end of sequence */
1570	scanplain(
1571	struct vars *v)
1572	{
1573	const chr *endp;
1574
1575	assert(SEE(COLLEL) \|\| SEE(ECLASS) \|\| SEE(CCLASS));
1576	NEXT();
1577
1578	endp = v->now;
1579	while (SEE(PLAIN)) {
1580	endp = v->now;
1581	NEXT();
1582	}
1583
1584	assert(SEE(END) \|\| ISERR());
1585	NEXT();
1586
1587	return endp;
1588	}
1589
1590	/*
1591	- onechr - fill in arcs for a plain character, and possible case complements
1592	* This is mostly a shortcut for efficient handling of the common case.
1593	^ static void onechr(struct vars , pchr, struct state , struct state *);
1594	*/
1595	static void
1596	onechr(
1597	struct vars *v,
1598	pchr c,
1599	struct state *lp,
1600	struct state *rp)
1601	{
1602	if (!(v->cflags&REG_ICASE)) {
1603	newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
1604	return;
1605	}
1606
1607	/*
1608	* Rats, need general case anyway...
1609	*/
1610
1611	dovec(v, allcases(v, c), lp, rp);
1612	}
1613
1614	/*
1615	- dovec - fill in arcs for each element of a cvec
1616	^ static void dovec(struct vars , struct cvec , struct state *,
1617	^ struct state *);
1618	*/
1619	static void
1620	dovec(
1621	struct vars *v,
1622	struct cvec *cv,
1623	struct state *lp,
1624	struct state *rp)
1625	{
1626	chr ch, from, to;
1627	const chr *p;
1628	int i;
1629
1630	for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
1631	ch = *p;
1632	newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
1633	}
1634
1635	for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
1636	from = *p;
1637	to = *(p+1);
1638	if (from <= to) {
1639	subrange(v, from, to, lp, rp);
1640	}
1641	}
1642
1643	}
1644
1645	/*
1646	- wordchrs - set up word-chr list for word-boundary stuff, if needed
1647	* The list is kept as a bunch of arcs between two dummy states; it's disposed
1648	* of by the unreachable-states sweep in NFA optimization. Does NEXT(). Must
1649	* not be called from any unusual lexical context. This should be reconciled
1650	* with the \w etc. handling in lex.c, and should be cleaned up to reduce
1651	* dependencies on input scanning.
1652	^ static void wordchrs(struct vars *);
1653	*/
1654	static void
1655	wordchrs(
1656	struct vars *v)
1657	{
1658	struct state left, right;
1659
1660	if (v->wordchrs != NULL) {
1661	NEXT(); /* for consistency */
1662	return;
1663	}
1664
1665	left = newstate(v->nfa);
1666	right = newstate(v->nfa);
1667	NOERR();
1668
1669	/*
1670	* Fine point: implemented with [::], and lexer will set REG_ULOCALE.
1671	*/
1672
1673	lexword(v);
1674	NEXT();
1675	assert(v->savenow != NULL && SEE('['));
1676	bracket(v, left, right);
1677	assert((v->savenow != NULL && SEE(']')) \|\| ISERR());
1678	NEXT();
1679	NOERR();
1680	v->wordchrs = left;
1681	}
1682
1683	/*
1684	- subre - allocate a subre
1685	^ static struct subre subre(struct vars , int, int, struct state *,
1686	^ struct state *);
1687	*/
1688	static struct subre *
1689	subre(
1690	struct vars *v,
1691	int op,
1692	int flags,
1693	struct state *begin,
1694	struct state *end)
1695	{
1696	struct subre *ret = v->treefree;
1697
1698	if (ret != NULL) {
1699	v->treefree = ret->left;
1700	} else {
1701	ret = (struct subre *) MALLOC(sizeof(struct subre));
1702	if (ret == NULL) {
1703	ERR(REG_ESPACE);
1704	return NULL;
1705	}
1706	ret->chain = v->treechain;
1707	v->treechain = ret;
1708	}
1709
1710	assert(strchr("\|.b(=", op) != NULL);
1711
1712	ret->op = op;
1713	ret->flags = flags;
1714	ret->retry = 0;
1715	ret->subno = 0;
1716	ret->min = ret->max = 1;
1717	ret->left = NULL;
1718	ret->right = NULL;
1719	ret->begin = begin;
1720	ret->end = end;
1721	ZAPCNFA(ret->cnfa);
1722
1723	return ret;
1724	}
1725
1726	/*
1727	- freesubre - free a subRE subtree
1728	^ static void freesubre(struct vars , struct subre );
1729	*/
1730	static void
1731	freesubre(
1732	struct vars v, / might be NULL */
1733	struct subre *sr)
1734	{
1735	if (sr == NULL) {
1736	return;
1737	}
1738
1739	if (sr->left != NULL) {
1740	freesubre(v, sr->left);
1741	}
1742	if (sr->right != NULL) {
1743	freesubre(v, sr->right);
1744	}
1745
1746	freesrnode(v, sr);
1747	}
1748
1749	/*
1750	- freesrnode - free one node in a subRE subtree
1751	^ static void freesrnode(struct vars , struct subre );
1752	*/
1753	static void
1754	freesrnode(
1755	struct vars v, / might be NULL */
1756	struct subre *sr)
1757	{
1758	if (sr == NULL) {
1759	return;
1760	}
1761
1762	if (!NULLCNFA(sr->cnfa)) {
1763	freecnfa(&sr->cnfa);
1764	}
1765	sr->flags = 0;
1766
1767	if (v != NULL) {
1768	sr->left = v->treefree;
1769	v->treefree = sr;
1770	} else {
1771	FREE(sr);
1772	}
1773	}
1774
/*
 - optst - optimize a subRE subtree
 * Deliberately a no-op; neither argument is examined.
 ^ static void optst(struct vars *, struct subre *);
 */
static void
optst(
    struct vars *v,
    struct subre *t)
{
    /*
     * DGP (2007-11-13): the routine that used to live here walked the whole
     * tree without changing anything, presumably as a placeholder for
     * optimizations that were never written. Doing nothing is cheaper when
     * no traversal is made at all.
     */
}
1793
1794	/*
1795	- numst - number tree nodes (assigning retry indexes)
1796	^ static int numst(struct subre *, int);
1797	*/
1798	static int /* next number */
1799	numst(
1800	struct subre *t,
1801	int start) /* starting point for subtree numbers */
1802	{
1803	int i;
1804
1805	assert(t != NULL);
1806
1807	i = start;
1808	t->retry = (short) i++;
1809	if (t->left != NULL) {
1810	i = numst(t->left, i);
1811	}
1812	if (t->right != NULL) {
1813	i = numst(t->right, i);
1814	}
1815	return i;
1816	}
1817
1818	/*
1819	- markst - mark tree nodes as INUSE
1820	^ static void markst(struct subre *);
1821	*/
1822	static void
1823	markst(
1824	struct subre *t)
1825	{
1826	assert(t != NULL);
1827
1828	t->flags \|= INUSE;
1829	if (t->left != NULL) {
1830	markst(t->left);
1831	}
1832	if (t->right != NULL) {
1833	markst(t->right);
1834	}
1835	}
1836
1837	/*
1838	- cleanst - free any tree nodes not marked INUSE
1839	^ static void cleanst(struct vars *);
1840	*/
1841	static void
1842	cleanst(
1843	struct vars *v)
1844	{
1845	struct subre *t;
1846	struct subre *next;
1847
1848	for (t = v->treechain; t != NULL; t = next) {
1849	next = t->chain;
1850	if (!(t->flags&INUSE)) {
1851	FREE(t);
1852	}
1853	}
1854	v->treechain = NULL;
1855	v->treefree = NULL; /* just on general principles */
1856	}
1857
1858	/*
1859	- nfatree - turn a subRE subtree into a tree of compacted NFAs
1860	^ static long nfatree(struct vars , struct subre , FILE *);
1861	*/
1862	static long /* optimize results from top node */
1863	nfatree(
1864	struct vars *v,
1865	struct subre *t,
1866	FILE f) / for debug output */
1867	{
1868	assert(t != NULL && t->begin != NULL);
1869
1870	if (t->left != NULL) {
1871	(DISCARD) nfatree(v, t->left, f);
1872	}
1873	if (t->right != NULL) {
1874	(DISCARD) nfatree(v, t->right, f);
1875	}
1876
1877	return nfanode(v, t, f);
1878	}
1879
1880	/*
1881	- nfanode - do one NFA for nfatree
1882	^ static long nfanode(struct vars , struct subre , FILE *);
1883	*/
1884	static long /* optimize results */
1885	nfanode(
1886	struct vars *v,
1887	struct subre *t,
1888	FILE f) / for debug output */
1889	{
1890	struct nfa *nfa;
1891	long ret = 0;
1892	char idbuf[50];
1893
1894	assert(t->begin != NULL);
1895
1896	if (f != NULL) {
1897	fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
1898	stid(t, idbuf, sizeof(idbuf)));
1899	}
1900	nfa = newnfa(v, v->cm, v->nfa);
1901	NOERRZ();
1902	dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
1903	if (!ISERR()) {
1904	specialcolors(nfa);
1905	ret = optimize(nfa, f);
1906	}
1907	if (!ISERR()) {
1908	compact(nfa, &t->cnfa);
1909	}
1910
1911	freenfa(nfa);
1912	return ret;
1913	}
1914
1915	/*
1916	- newlacon - allocate a lookahead-constraint subRE
1917	^ static int newlacon(struct vars , struct state , struct state *, int);
1918	*/
1919	static int /* lacon number */
1920	newlacon(
1921	struct vars *v,
1922	struct state *begin,
1923	struct state *end,
1924	int pos)
1925	{
1926	struct subre *sub;
1927	int n;
1928
1929	if (v->nlacons == 0) {
1930	v->lacons = (struct subre ) MALLOC(2 sizeof(struct subre));
1931	n = 1; /* skip 0th */
1932	v->nlacons = 2;
1933	} else {
1934	v->lacons = (struct subre *) REALLOC(v->lacons,
1935	(v->nlacons+1)*sizeof(struct subre));
1936	n = v->nlacons++;
1937	}
1938
1939	if (v->lacons == NULL) {
1940	ERR(REG_ESPACE);
1941	return 0;
1942	}
1943
1944	sub = &v->lacons[n];
1945	sub->begin = begin;
1946	sub->end = end;
1947	sub->subno = pos;
1948	ZAPCNFA(sub->cnfa);
1949	return n;
1950	}
1951
1952	/*
1953	- freelacons - free lookahead-constraint subRE vector
1954	^ static void freelacons(struct subre *, int);
1955	*/
1956	static void
1957	freelacons(
1958	struct subre *subs,
1959	int n)
1960	{
1961	struct subre *sub;
1962	int i;
1963
1964	assert(n > 0);
1965	for (sub=subs+1, i=n-1; i>0; sub++, i--) { /* no 0th */
1966	if (!NULLCNFA(sub->cnfa)) {
1967	freecnfa(&sub->cnfa);
1968	}
1969	}
1970	FREE(subs);
1971	}
1972
1973	/*
1974	- rfree - free a whole RE (insides of regfree)
1975	^ static void rfree(regex_t *);
1976	*/
1977	static void
1978	rfree(
1979	regex_t *re)
1980	{
1981	struct guts *g;
1982
1983	if (re == NULL \|\| re->re_magic != REMAGIC) {
1984	return;
1985	}
1986
1987	re->re_magic = 0; /* invalidate RE */
1988	g = (struct guts *) re->re_guts;
1989	re->re_guts = NULL;
1990	re->re_fns = NULL;
1991	g->magic = 0;
1992	freecm(&g->cmap);
1993	if (g->tree != NULL) {
1994	freesubre(NULL, g->tree);
1995	}
1996	if (g->lacons != NULL) {
1997	freelacons(g->lacons, g->nlacons);
1998	}
1999	if (!NULLCNFA(g->search)) {
2000	freecnfa(&g->search);
2001	}
2002	FREE(g);
2003	}
2004
2005	/*
2006	- dump - dump an RE in human-readable form
2007	^ static void dump(regex_t , FILE );
2008	*/
2009	static void
2010	dump(
2011	regex_t *re,
2012	FILE *f)
2013	{
2014	#ifdef REG_DEBUG
2015	struct guts *g;
2016	int i;
2017
2018	if (re->re_magic != REMAGIC) {
2019	fprintf(f, "bad magic number (0x%x not 0x%x)\n",
2020	re->re_magic, REMAGIC);
2021	}
2022	if (re->re_guts == NULL) {
2023	fprintf(f, "NULL guts!!!\n");
2024	return;
2025	}
2026	g = (struct guts *) re->re_guts;
2027	if (g->magic != GUTSMAGIC) {
2028	fprintf(f, "bad guts magic number (0x%x not 0x%x)\n",
2029	g->magic, GUTSMAGIC);
2030	}
2031
2032	fprintf(f, "\n\n\n========= DUMP ==========\n");
2033	fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2034	re->re_nsub, re->re_info, re->re_csize, g->ntree);
2035
2036	dumpcolors(&g->cmap, f);
2037	if (!NULLCNFA(g->search)) {
2038	printf("\nsearch:\n");
2039	dumpcnfa(&g->search, f);
2040	}
2041	for (i = 1; i < g->nlacons; i++) {
2042	fprintf(f, "\nla%d (%s):\n", i,
2043	(g->lacons[i].subno) ? "positive" : "negative");
2044	dumpcnfa(&g->lacons[i].cnfa, f);
2045	}
2046	fprintf(f, "\n");
2047	dumpst(g->tree, f, 0);
2048	#endif
2049	}
2050
/*
 - dumpst - dump a subRE tree
 * Front end for stdump(): copes with an absent tree and flushes f when the
 * dump is done.
 ^ static void dumpst(struct subre *, FILE *, int);
 */
static void
dumpst(
    struct subre *t,
    FILE *f,
    int nfapresent)             /* is the original NFA still around? */
{
    if (t != NULL) {
        stdump(t, f, nfapresent);
    } else {
        fprintf(f, "null tree\n");
    }
    fflush(f);
}
2068
2069	/*
2070	- stdump - recursive guts of dumpst
2071	^ static void stdump(struct subre , FILE , int);
2072	*/
2073	static void
2074	stdump(
2075	struct subre *t,
2076	FILE *f,
2077	int nfapresent) /* is the original NFA still around? */
2078	{
2079	char idbuf[50];
2080
2081	fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2082	if (t->flags&LONGER) {
2083	fprintf(f, " longest");
2084	}
2085	if (t->flags&SHORTER) {
2086	fprintf(f, " shortest");
2087	}
2088	if (t->flags&MIXED) {
2089	fprintf(f, " hasmixed");
2090	}
2091	if (t->flags&CAP) {
2092	fprintf(f, " hascapture");
2093	}
2094	if (t->flags&BACKR) {
2095	fprintf(f, " hasbackref");
2096	}
2097	if (!(t->flags&INUSE)) {
2098	fprintf(f, " UNUSED");
2099	}
2100	if (t->subno != 0) {
2101	fprintf(f, " (#%d)", t->subno);
2102	}
2103	if (t->min != 1 \|\| t->max != 1) {
2104	fprintf(f, " {%d,", t->min);
2105	if (t->max != INFINITY) {
2106	fprintf(f, "%d", t->max);
2107	}
2108	fprintf(f, "}");
2109	}
2110	if (nfapresent) {
2111	fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
2112	}
2113	if (t->left != NULL) {
2114	fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2115	}
2116	if (t->right != NULL) {
2117	fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2118	}
2119	if (!NULLCNFA(t->cnfa)) {
2120	fprintf(f, "\n");
2121	dumpcnfa(&t->cnfa, f);
2122	}
2123	fprintf(f, "\n");
2124	if (t->left != NULL) {
2125	stdump(t->left, f, nfapresent);
2126	}
2127	if (t->right != NULL) {
2128	stdump(t->right, f, nfapresent);
2129	}
2130	}
2131
2132	/*
2133	- stid - identify a subtree node for dumping
2134	^ static char stid(struct subre , char *, size_t);
2135	*/
2136	static const char * /* points to buf or constant string */
2137	stid(
2138	struct subre *t,
2139	char *buf,
2140	size_t bufsize)
2141	{
2142	/*
2143	* Big enough for hex int or decimal t->retry?
2144	*/
2145
2146	if (bufsize < sizeof(void)2 + 3 \|\| bufsize < sizeof(t->retry)*3 + 1) {
2147	return "unable";
2148	}
2149	if (t->retry != 0) {
2150	sprintf(buf, "%d", t->retry);
2151	} else {
2152	sprintf(buf, "%p", t);
2153	}
2154	return buf;
2155	}
2156
2157	#include "regc_lex.c"
2158	#include "regc_color.c"
2159	#include "regc_nfa.c"
2160	#include "regc_cvec.c"
2161	#include "regc_locale.c"
2162
2163	/*
2164	* Local Variables:
2165	* mode: c
2166	* c-basic-offset: 4
2167	* fill-column: 78
2168	* End:
2169	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: