@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
10361036 /*----------
10371037 * Prepare a general-purpose state skeleton.
10381038 *
1039- * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
1040- * / /
1041- * [lp] ----> [s2] ----bypass---------------------
1039+ * In the no-backrefs case, we want this:
10421040 *
1043- * where bypass is an empty, and prefix is some repetitions of atom
1041+ * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1042+ *
1043+ * where prefix is some repetitions of atom. In the general case we need
1044+ *
1045+ * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
1046+ *
1047+ * where the iterator wraps around [begin] ---atom---> [end]
1048+ *
1049+ * We make the s state here for both cases; s2 is made below if needed
10441050 *----------
10451051 */
10461052 s = newstate (v -> nfa ); /* first, new endpoints for the atom */
@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
10511057 NOERR ();
10521058 atom -> begin = s ;
10531059 atom -> end = s2 ;
1054- s = newstate (v -> nfa ); /* and spots for prefix and bypass */
1055- s2 = newstate (v -> nfa );
1060+ s = newstate (v -> nfa ); /* set up starting state */
10561061 NOERR ();
10571062 EMPTYARC (lp , s );
1058- EMPTYARC (lp , s2 );
10591063 NOERR ();
10601064
10611065 /* break remaining subRE into x{...} and what follows */
@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
10891093 }
10901094
10911095 /*
1092- * It's quantifier time. If the atom is just a BACKREF, we'll let it deal
1093- * with quantifiers internally. Otherwise, the first step is to turn
1094- * x{0,...} into x{1,...}|empty
1096+ * It's quantifier time. If the atom is just a backref, we'll let it deal
1097+ * with quantifiers internally.
10951098 */
1096- if (m == 0 && atomtype != BACKREF )
1097- {
1098- EMPTYARC (s2 , atom -> end ); /* the bypass */
1099- assert (PREF (qprefer ) != 0 );
1100- f = COMBINE (qprefer , atom -> flags );
1101- t = subre (v , '|' , f , lp , atom -> end );
1102- NOERR ();
1103- t -> left = atom ;
1104- t -> right = subre (v , '|' , PREF (f ), s2 , atom -> end );
1105- NOERR ();
1106- t -> right -> left = subre (v , '=' , 0 , s2 , atom -> end );
1107- NOERR ();
1108- * atomp = t ;
1109- atomp = & t -> left ;
1110- m = 1 ;
1111- }
1112-
1113- /* deal with the rest of the quantifier */
11141099 if (atomtype == BACKREF )
11151100 {
11161101 /* special case: backrefs have internal quantifiers */
@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
11201105 atom -> min = (short ) m ;
11211106 atom -> max = (short ) n ;
11221107 atom -> flags |= COMBINE (qprefer , atom -> flags );
1108+ /* rest of branch can be strung starting from atom->end */
1109+ s2 = atom -> end ;
11231110 }
11241111 else if (m == 1 && n == 1 )
11251112 {
11261113 /* no/vacuous quantifier: done */
11271114 EMPTYARC (s , atom -> begin ); /* empty prefix */
1115+ /* rest of branch can be strung starting from atom->end */
1116+ s2 = atom -> end ;
11281117 }
1129- else
1118+ else if ( m > 0 && !( atom -> flags & BACKR ))
11301119 {
11311120 /*
1132- * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
1133- * second x
1121+ * If there's no backrefs involved, we can turn x{m,n} into
1122+ * x{m-1,n-1}x, with capturing parens in only the second x. This
1123+ * is valid because we only care about capturing matches from the
1124+ * final iteration of the quantifier. It's a win because we can
1125+ * implement the backref-free left side as a plain DFA node, since
1126+ * we don't really care where its submatches are.
11341127 */
11351128 dupnfa (v -> nfa , atom -> begin , atom -> end , s , atom -> begin );
11361129 assert (m >= 1 && m != INFINITY && n >= 1 );
@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
11421135 NOERR ();
11431136 t -> right = atom ;
11441137 * atomp = t ;
1138+ /* rest of branch can be strung starting from atom->end */
1139+ s2 = atom -> end ;
1140+ }
1141+ else
1142+ {
1143+ /* general case: need an iteration node */
1144+ s2 = newstate (v -> nfa );
1145+ NOERR ();
1146+ moveouts (v -> nfa , atom -> end , s2 );
1147+ NOERR ();
1148+ dupnfa (v -> nfa , atom -> begin , atom -> end , s , s2 );
1149+ repeat (v , s , s2 , m , n );
1150+ f = COMBINE (qprefer , atom -> flags );
1151+ t = subre (v , '*' , f , s , s2 );
1152+ NOERR ();
1153+ t -> min = (short ) m ;
1154+ t -> max = (short ) n ;
1155+ t -> left = atom ;
1156+ * atomp = t ;
1157+ /* rest of branch is to be strung from iteration's end state */
11451158 }
11461159
11471160 /* and finally, look after that postponed recursion */
11481161 t = top -> right ;
11491162 if (!(SEE ('|' ) || SEE (stopper ) || SEE (EOS )))
1150- t -> right = parsebranch (v , stopper , type , atom -> end , rp , 1 );
1163+ t -> right = parsebranch (v , stopper , type , s2 , rp , 1 );
11511164 else
11521165 {
1153- EMPTYARC (atom -> end , rp );
1154- t -> right = subre (v , '=' , 0 , atom -> end , rp );
1166+ EMPTYARC (s2 , rp );
1167+ t -> right = subre (v , '=' , 0 , s2 , rp );
11551168 }
11561169 assert (SEE ('|' ) || SEE (stopper ) || SEE (EOS ));
11571170 t -> flags |= COMBINE (t -> flags , t -> right -> flags );
@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
12141227/*
12151228 * repeat - replicate subNFA for quantifiers
12161229 *
1230+ * The sub-NFA strung from lp to rp is modified to represent m to n
1231+ * repetitions of its initial contents.
1232+ *
12171233 * The duplication sequences used here are chosen carefully so that any
12181234 * pointers starting out pointing into the subexpression end up pointing into
12191235 * the last occurrence. (Note that it may not be strung between the same
@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
12291245 int n )
12301246{
12311247#define SOME 2
1232- #define INF 3
1248+ #define INF 3
12331249#define PAIR (x , y ) ((x)*4 + (y))
12341250#define REDUCE (x ) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
12351251 const int rm = REDUCE (m );
@@ -1603,7 +1619,7 @@ subre(struct vars * v,
16031619 v -> treechain = ret ;
16041620 }
16051621
1606- assert (strchr ("|.b(= " , op ) != NULL );
1622+ assert (strchr ("=b|.*( " , op ) != NULL );
16071623
16081624 ret -> op = op ;
16091625 ret -> flags = flags ;
0 commit comments