Standard Portable Library: src/pcre/pcre_dfa

00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language (but see
00007 below for why this module is different).
00008 
00009                        Written by Philip Hazel
00010            Copyright (c) 1997-2009 University of Cambridge
00011 
00012 -----------------------------------------------------------------------------
00013 Redistribution and use in source and binary forms, with or without
00014 modification, are permitted provided that the following conditions are met:
00015 
00016     * Redistributions of source code must retain the above copyright notice,
00017       this list of conditions and the following disclaimer.
00018 
00019     * Redistributions in binary form must reproduce the above copyright
00020       notice, this list of conditions and the following disclaimer in the
00021       documentation and/or other materials provided with the distribution.
00022 
00023     * Neither the name of the University of Cambridge nor the names of its
00024       contributors may be used to endorse or promote products derived from
00025       this software without specific prior written permission.
00026 
00027 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00028 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00029 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00030 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00031 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00032 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00033 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00034 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00035 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00036 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00037 POSSIBILITY OF SUCH DAMAGE.
00038 -----------------------------------------------------------------------------
00039 */
00040 
00041 
00042 /* This module contains the external function pcre_dfa_exec(), which is an
00043 alternative matching function that uses a sort of DFA algorithm (not a true
00044 FSM). This is NOT Perl-compatible, but it has advantages in certain
00045 applications. */
00046 
00047 
00048 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
00049 the performance of his patterns greatly. I could not use it as it stood, as it
00050 was not thread safe, and made assumptions about pattern sizes. Also, it caused
00051 test 7 to loop, and test 9 to crash with a segfault.
00052 
00053 The issue is the check for duplicate states, which is done by a simple linear
00054 search up the state list. (Grep for "duplicate" below to find the code.) For
00055 many patterns, there will never be many states active at one time, so a simple
00056 linear search is fine. In patterns that have many active states, it might be a
00057 bottleneck. The suggested code used an indexing scheme to remember which states
00058 had previously been used for each character, and avoided the linear search when
00059 it knew there was no chance of a duplicate. This was implemented when adding
00060 states to the state lists.
00061 
00062 I wrote some thread-safe, not-limited code to try something similar at the time
00063 of checking for duplicates (instead of when adding states), using index vectors
00064 on the stack. It did give a 13% improvement with one specially constructed
00065 pattern for certain subject strings, but on other strings and on many of the
00066 simpler patterns in the test suite it did worse. The major problem, I think,
00067 was the extra time to initialize the index. This had to be done for each call
00068 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
00069 only once - I suspect this was the cause of the problems with the tests.)
00070 
00071 Overall, I concluded that the gains in some cases did not outweigh the losses
00072 in others, so I abandoned this code. */
00073 
00074 
00075 
00076 #ifdef HAVE_CONFIG_H
00077 #include "config.h"
00078 #elif defined(_WINDOWS)   /* must be #elif: "#else if ..." acts as a plain #else */
00079 #include <spl/configwin32.h>
00080 #endif
00081 
00082 
00083 #define NLBLOCK md             /* Block containing newline information */
00084 #define PSSTART start_subject  /* Field containing processed string start */
00085 #define PSEND   end_subject    /* Field containing processed string end */
00086 
00087 #include "pcre_internal.h"
00088 
00089 
00090 /* For use to indent debugging output */
00091 
00092 #define SP "                   "
00093 
00094 
00095 /*************************************************
00096 *      Code parameters and static tables         *
00097 *************************************************/
00098 
00099 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
00100 into others, under special conditions. A gap of 20 between the blocks should be
00101 enough. The resulting opcodes don't have to be less than 256 because they are
00102 never stored, so we push them well clear of the normal opcodes. */
00103 
00104 #define OP_PROP_EXTRA       300   /* type is OP_PROP or OP_NOTPROP */
00105 #define OP_EXTUNI_EXTRA     320   /* type is OP_EXTUNI */
00106 #define OP_ANYNL_EXTRA      340   /* type is OP_ANYNL */
00107 #define OP_HSPACE_EXTRA     360   /* type is OP_HSPACE or OP_NOT_HSPACE */
00108 #define OP_VSPACE_EXTRA     380   /* type is OP_VSPACE or OP_NOT_VSPACE */
00109 
00110 
00111 /* This table identifies those opcodes that are followed immediately by a
00112 character that is to be tested in some way. This makes it possible to
00113 centralize the loading of these characters. In the case of Type * etc, the
00114 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
00115 small value. Non-zero values in the table are the offsets from the opcode where
00116 the character is to be found. ***NOTE*** If the start of this table is
00117 modified, the three tables that follow must also be modified. */
00118 
00119 static const uschar coptable[] = {
00120   0,                             /* End                                    */
00121   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
00122   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
00123   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
00124   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
00125   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
00126   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
00127   1,                             /* Char                                   */
00128   1,                             /* Charnc                                 */
00129   1,                             /* not                                    */
00130   /* Positive single-char repeats                                          */
00131   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
00132   3, 3, 3,                       /* upto, minupto, exact                   */
00133   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
00134   /* Negative single-char repeats - only for chars < 256                   */
00135   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
00136   3, 3, 3,                       /* NOT upto, minupto, exact               */
00137   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
00138   /* Positive type repeats                                                 */
00139   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
00140   3, 3, 3,                       /* Type upto, minupto, exact              */
00141   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
00142   /* Character class & ref repeats                                         */
00143   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
00144   0, 0,                          /* CRRANGE, CRMINRANGE                    */
00145   0,                             /* CLASS                                  */
00146   0,                             /* NCLASS                                 */
00147   0,                             /* XCLASS - variable length               */
00148   0,                             /* REF                                    */
00149   0,                             /* RECURSE                                */
00150   0,                             /* CALLOUT                                */
00151   0,                             /* Alt                                    */
00152   0,                             /* Ket                                    */
00153   0,                             /* KetRmax                                */
00154   0,                             /* KetRmin                                */
00155   0,                             /* Assert                                 */
00156   0,                             /* Assert not                             */
00157   0,                             /* Assert behind                          */
00158   0,                             /* Assert behind not                      */
00159   0,                             /* Reverse                                */
00160   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
00161   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
00162   0,                             /* CREF                                   */
00163   0,                             /* RREF                                   */
00164   0,                             /* DEF                                    */
00165   0, 0,                          /* BRAZERO, BRAMINZERO                    */
00166   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
00167   0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
00168 };
00169 
00170 /* This table identifies those opcodes that inspect a character. If such an
00171 opcode is processed at the end of the subject, could_continue is set, for use
00172 when testing for a partial match. ***NOTE*** If the start of this table is
00173 modified, the two tables that follow must also be modified. */
00174 
00175 static const uschar poptable[] = {
00176   0,                             /* End                                    */
00177   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
00178   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
00179   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
00180   1, 1, 1,                       /* NOTPROP, PROP, EXTUNI                  */
00181   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
00182   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
00183   1,                             /* Char                                   */
00184   1,                             /* Charnc                                 */
00185   1,                             /* not                                    */
00186   /* Positive single-char repeats                                          */
00187   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
00188   1, 1, 1,                       /* upto, minupto, exact                   */
00189   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
00190   /* Negative single-char repeats - only for chars < 256                   */
00191   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
00192   1, 1, 1,                       /* NOT upto, minupto, exact               */
00193   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
00194   /* Positive type repeats                                                 */
00195   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
00196   1, 1, 1,                       /* Type upto, minupto, exact              */
00197   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
00198   /* Character class & ref repeats                                         */
00199   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
00200   1, 1,                          /* CRRANGE, CRMINRANGE                    */
00201   1,                             /* CLASS                                  */
00202   1,                             /* NCLASS                                 */
00203   1,                             /* XCLASS - variable length               */
00204   0,                             /* REF                                    */
00205   0,                             /* RECURSE                                */
00206   0,                             /* CALLOUT                                */
00207   0,                             /* Alt                                    */
00208   0,                             /* Ket                                    */
00209   0,                             /* KetRmax                                */
00210   0,                             /* KetRmin                                */
00211   0,                             /* Assert                                 */
00212   0,                             /* Assert not                             */
00213   0,                             /* Assert behind                          */
00214   0,                             /* Assert behind not                      */
00215   0,                             /* Reverse                                */
00216   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
00217   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
00218   0,                             /* CREF                                   */
00219   0,                             /* RREF                                   */
00220   0,                             /* DEF                                    */
00221   0, 0,                          /* BRAZERO, BRAMINZERO                    */
00222   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
00223   0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
00224 };
00225 
00226 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
00227 and \w. Their entries follow the opcode order at the start of coptable. */
00228 
00229 static const uschar toptable1[] = {
00230   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
00231   ctype_digit, ctype_digit,       /* \D, \d */
00232   ctype_space, ctype_space,       /* \S, \s */
00233   ctype_word,  ctype_word,        /* \W, \w */
00234   0, 0                            /* OP_ANY, OP_ALLANY */
00235 };
00236 
00237 static const uschar toptable2[] = {
00238   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
00239   ctype_digit, 0,                 /* \D, \d */
00240   ctype_space, 0,                 /* \S, \s */
00241   ctype_word,  0,                 /* \W, \w */
00242   1, 1                            /* OP_ANY, OP_ALLANY */
00243 };
00244 
00245 
00246 /* Structure for holding data about a particular state, which is in effect the
00247 current data for an active path through the match tree. It must consist
00248 entirely of ints because the working vector we are passed, and which we put
00249 these structures in, is a vector of ints. */
00250 
00251 typedef struct stateblock {
00252   int offset;                     /* Offset to opcode; negative => held off */
00253   int count;                      /* Count for repeats */
00254   int ims;                        /* ims flag bits */
00255   int data;                       /* Extra data, e.g. characters left to skip */
00256 } stateblock;
00257 
00258 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
00259 
00260 
00261 #ifdef DEBUG
00262 /*************************************************
00263 *             Print character string             *
00264 *************************************************/
00265 
00266 /* Debug-only helper that writes a run of bytes to a stream in readable form:
00267 a byte for which isprint() is true is written as itself, and any other byte
00268 is written as \x followed by two lower-case hex digits.
00269 
00270 Arguments:
00271   p            first byte to print
00272   length       how many bytes to print (nothing is printed if <= 0)
00273   f            stream that receives the output
00274 
00275 Returns:       nothing */
00276 
00277 static void
00278 pchars(unsigned char *p, int length, FILE *f)
00279 {
00280 int i;
00281 for (i = 0; i < length; i++)
00282   {
00283   int ch = p[i];
00284   if (!isprint(ch)) fprintf(f, "\\x%02x", ch);
00285     else fprintf(f, "%c", ch);
00286   }
00287 }
00288 #endif
00289 
00290 
00291 
00292 /*************************************************
00293 *    Execute a Regular Expression - DFA engine   *
00294 *************************************************/
00295 
00296 /* This internal function applies a compiled pattern to a subject string,
00297 starting at a given point, using a DFA engine. This function is called from the
00298 external one, possibly multiple times if the pattern is not anchored. The
00299 function calls itself recursively for some kinds of subpattern.
00300 
00301 Arguments:
00302   md                the match_data block with fixed information
00303   this_start_code   the opening bracket of this subexpression's code
00304   current_subject   where we currently are in the subject string
00305   start_offset      start offset in the subject string
00306   offsets           vector to contain the matching string offsets
00307   offsetcount       size of same
00308   workspace         vector of workspace
00309   wscount           size of same
00310   ims               the current ims flags
00311   rlevel            function call recursion level
00312   recursing         regex recursive call level
00313 
00314 Returns:            > 0 => number of match offset pairs placed in offsets
00315                     = 0 => offsets overflowed; longest matches are present
00316                      -1 => failed to match
00317                    < -1 => some kind of unexpected problem
00318 
00319 The following macros are used for adding states to the two state vectors (one
00320 for the current character, one for the following character). */
00321 
00322 #define ADD_ACTIVE(x,y) \
00323   if (active_count++ < wscount) \
00324     { \
00325     next_active_state->offset = (x); \
00326     next_active_state->count  = (y); \
00327     next_active_state->ims    = ims; \
00328     next_active_state++; \
00329     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
00330     } \
00331   else return PCRE_ERROR_DFA_WSSIZE
00332 
00333 #define ADD_ACTIVE_DATA(x,y,z) \
00334   if (active_count++ < wscount) \
00335     { \
00336     next_active_state->offset = (x); \
00337     next_active_state->count  = (y); \
00338     next_active_state->ims    = ims; \
00339     next_active_state->data   = (z); \
00340     next_active_state++; \
00341     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
00342     } \
00343   else return PCRE_ERROR_DFA_WSSIZE
00344 
00345 #define ADD_NEW(x,y) \
00346   if (new_count++ < wscount) \
00347     { \
00348     next_new_state->offset = (x); \
00349     next_new_state->count  = (y); \
00350     next_new_state->ims    = ims; \
00351     next_new_state++; \
00352     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
00353     } \
00354   else return PCRE_ERROR_DFA_WSSIZE
00355 
00356 #define ADD_NEW_DATA(x,y,z) \
00357   if (new_count++ < wscount) \
00358     { \
00359     next_new_state->offset = (x); \
00360     next_new_state->count  = (y); \
00361     next_new_state->ims    = ims; \
00362     next_new_state->data   = (z); \
00363     next_new_state++; \
00364     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
00365     } \
00366   else return PCRE_ERROR_DFA_WSSIZE
00367 
00368 /* And now, here is the code */
00369 
00370 static int
00371 internal_dfa_exec(
00372   dfa_match_data *md,
00373   const uschar *this_start_code,
00374   const uschar *current_subject,
00375   int start_offset,
00376   int *offsets,
00377   int offsetcount,
00378   int *workspace,
00379   int wscount,
00380   int ims,
00381   int  rlevel,
00382   int  recursing)
00383 {
00384 stateblock *active_states, *new_states, *temp_states;
00385 stateblock *next_active_state, *next_new_state;
00386 
00387 const uschar *ctypes, *lcc, *fcc;
00388 const uschar *ptr;
00389 const uschar *end_code, *first_op;
00390 
00391 int active_count, new_count, match_count;
00392 
00393 /* Some fields in the md block are frequently referenced, so we load them into
00394 independent variables in the hope that this will perform better. */
00395 
00396 const uschar *start_subject = md->start_subject;
00397 const uschar *end_subject = md->end_subject;
00398 const uschar *start_code = md->start_code;
00399 
00400 #ifdef SUPPORT_UTF8
00401 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
00402 #else
00403 BOOL utf8 = FALSE;
00404 #endif
00405 
00406 rlevel++;
00407 offsetcount &= (-2);
00408 
00409 wscount -= 2;
00410 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
00411           (2 * INTS_PER_STATEBLOCK);
00412 
00413 DPRINTF(("\n%.*s---------------------\n"
00414   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
00415   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
00416 
00417 ctypes = md->tables + ctypes_offset;
00418 lcc = md->tables + lcc_offset;
00419 fcc = md->tables + fcc_offset;
00420 
00421 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
00422 
00423 active_states = (stateblock *)(workspace + 2);
00424 next_new_state = new_states = active_states + wscount;
00425 new_count = 0;
00426 
00427 first_op = this_start_code + 1 + LINK_SIZE +
00428   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
00429 
00430 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
00431 the alternative states onto the list, and find out where the end is. This
00432 makes it possible to use this function recursively, when we want to stop at a
00433 matching internal ket rather than at the end.
00434 
00435 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
00436 a backward assertion. In that case, we have to find out the maximum amount to
00437 move back, and set up each alternative appropriately. */
00438 
00439 if (*first_op == OP_REVERSE)
00440   {
00441   int max_back = 0;
00442   int gone_back;
00443 
00444   end_code = this_start_code;
00445   do
00446     {
00447     int back = GET(end_code, 2+LINK_SIZE);
00448     if (back > max_back) max_back = back;
00449     end_code += GET(end_code, 1);
00450     }
00451   while (*end_code == OP_ALT);
00452 
00453   /* If we can't go back the amount required for the longest lookbehind
00454   pattern, go back as far as we can; some alternatives may still be viable. */
00455 
00456 #ifdef SUPPORT_UTF8
00457   /* In character mode we have to step back character by character */
00458 
00459   if (utf8)
00460     {
00461     for (gone_back = 0; gone_back < max_back; gone_back++)
00462       {
00463       if (current_subject <= start_subject) break;
00464       current_subject--;
00465       while (current_subject > start_subject &&
00466              (*current_subject & 0xc0) == 0x80)
00467         current_subject--;
00468       }
00469     }
00470   else
00471 #endif
00472 
00473   /* In byte-mode we can do this quickly. */
00474 
00475     {
00476     gone_back = (current_subject - max_back < start_subject)?
00477       current_subject - start_subject : max_back;
00478     current_subject -= gone_back;
00479     }
00480 
00481   /* Save the earliest consulted character */
00482 
00483   if (current_subject < md->start_used_ptr)
00484     md->start_used_ptr = current_subject;
00485 
00486   /* Now we can process the individual branches. */
00487 
00488   end_code = this_start_code;
00489   do
00490     {
00491     int back = GET(end_code, 2+LINK_SIZE);
00492     if (back <= gone_back)
00493       {
00494       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
00495       ADD_NEW_DATA(-bstate, 0, gone_back - back);
00496       }
00497     end_code += GET(end_code, 1);
00498     }
00499   while (*end_code == OP_ALT);
00500  }
00501 
00502 /* This is the code for a "normal" subpattern (not a backward assertion). The
00503 start of a whole pattern is always one of these. If we are at the top level,
00504 we may be asked to restart matching from the same point that we reached for a
00505 previous partial match. We still have to scan through the top-level branches to
00506 find the end state. */
00507 
00508 else
00509   {
00510   end_code = this_start_code;
00511 
00512   /* Restarting */
00513 
00514   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
00515     {
00516     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
00517     new_count = workspace[1];
00518     if (!workspace[0])
00519       memcpy(new_states, active_states, new_count * sizeof(stateblock));
00520     }
00521 
00522   /* Not restarting */
00523 
00524   else
00525     {
00526     int length = 1 + LINK_SIZE +
00527       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
00528     do
00529       {
00530       ADD_NEW(end_code - start_code + length, 0);
00531       end_code += GET(end_code, 1);
00532       length = 1 + LINK_SIZE;
00533       }
00534     while (*end_code == OP_ALT);
00535     }
00536   }
00537 
00538 workspace[0] = 0;    /* Bit indicating which vector is current */
00539 
00540 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
00541 
00542 /* Loop for scanning the subject */
00543 
00544 ptr = current_subject;
00545 for (;;)
00546   {
00547   int i, j;
00548   int clen, dlen;
00549   unsigned int c, d;
00550   int forced_fail = 0;
00551   BOOL could_continue = FALSE;
00552 
00553   /* Make the new state list into the active state list and empty the
00554   new state list. */
00555 
00556   temp_states = active_states;
00557   active_states = new_states;
00558   new_states = temp_states;
00559   active_count = new_count;
00560   new_count = 0;
00561 
00562   workspace[0] ^= 1;              /* Remember for the restarting feature */
00563   workspace[1] = active_count;
00564 
00565 #ifdef DEBUG
00566   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
00567   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
00568   printf("\"\n");
00569 
00570   printf("%.*sActive states: ", rlevel*2-2, SP);
00571   for (i = 0; i < active_count; i++)
00572     printf("%d/%d ", active_states[i].offset, active_states[i].count);
00573   printf("\n");
00574 #endif
00575 
00576   /* Set the pointers for adding new states */
00577 
00578   next_active_state = active_states + active_count;
00579   next_new_state = new_states;
00580 
00581   /* Load the current character from the subject outside the loop, as many
00582   different states may want to look at it, and we assume that at least one
00583   will. */
00584 
00585   if (ptr < end_subject)
00586     {
00587     clen = 1;        /* Number of bytes in the character */
00588 #ifdef SUPPORT_UTF8
00589     if (utf8) { GETCHARLEN(c, ptr, clen); } else
00590 #endif  /* SUPPORT_UTF8 */
00591     c = *ptr;
00592     }
00593   else
00594     {
00595     clen = 0;        /* This indicates the end of the subject */
00596     c = NOTACHAR;    /* This value should never actually be used */
00597     }
00598 
00599   /* Scan up the active states and act on each one. The result of an action
00600   may be to add more states to the currently active list (e.g. on hitting a
00601   parenthesis) or it may be to put states on the new list, for considering
00602   when we move the character pointer on. */
00603 
00604   for (i = 0; i < active_count; i++)
00605     {
00606     stateblock *current_state = active_states + i;
00607     const uschar *code;
00608     int state_offset = current_state->offset;
00609     int count, codevalue, rrc;
00610 
00611 #ifdef DEBUG
00612     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
00613     if (clen == 0) printf("EOL\n");
00614       else if (c > 32 && c < 127) printf("'%c'\n", c);
00615         else printf("0x%02x\n", c);
00616 #endif
00617 
00618     /* This variable is referred to implicitly in the ADD_xxx macros. */
00619 
00620     ims = current_state->ims;
00621 
00622     /* A negative offset is a special case meaning "hold off going to this
00623     (negated) state until the number of characters in the data field have
00624     been skipped". */
00625 
00626     if (state_offset < 0)
00627       {
00628       if (current_state->data > 0)
00629         {
00630         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
00631         ADD_NEW_DATA(state_offset, current_state->count,
00632           current_state->data - 1);
00633         continue;
00634         }
00635       else
00636         {
00637         current_state->offset = state_offset = -state_offset;
00638         }
00639       }
00640 
00641     /* Check for a duplicate state with the same count, and skip if found.
00642     See the note at the head of this module about the possibility of improving
00643     performance here. */
00644 
00645     for (j = 0; j < i; j++)
00646       {
00647       if (active_states[j].offset == state_offset &&
00648           active_states[j].count == current_state->count)
00649         {
00650         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
00651         goto NEXT_ACTIVE_STATE;
00652         }
00653       }
00654 
00655     /* The state offset is the offset to the opcode */
00656 
00657     code = start_code + state_offset;
00658     codevalue = *code;
00659 
00660     /* If this opcode inspects a character, but we are at the end of the
00661     subject, remember the fact for use when testing for a partial match. */
00662 
00663     if (clen == 0 && poptable[codevalue] != 0)
00664       could_continue = TRUE;
00665 
00666     /* If this opcode is followed by an inline character, load it. It is
00667     tempting to test for the presence of a subject character here, but that
00668     is wrong, because sometimes zero repetitions of the subject are
00669     permitted.
00670 
00671     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
00672     argument that is not a data character - but is always one byte long. We
00673     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
00674     this case. To keep the other cases fast, convert these ones to new opcodes.
00675     */
00676 
00677     if (coptable[codevalue] > 0)
00678       {
00679       dlen = 1;
00680 #ifdef SUPPORT_UTF8
00681       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
00682 #endif  /* SUPPORT_UTF8 */
00683       d = code[coptable[codevalue]];
00684       if (codevalue >= OP_TYPESTAR)
00685         {
00686         switch(d)
00687           {
00688           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
00689           case OP_NOTPROP:
00690           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
00691           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
00692           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
00693           case OP_NOT_HSPACE:
00694           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
00695           case OP_NOT_VSPACE:
00696           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
00697           default: break;
00698           }
00699         }
00700       }
00701     else
00702       {
00703       dlen = 0;         /* Not strictly necessary, but compilers moan */
00704       d = NOTACHAR;     /* if these variables are not set. */
00705       }
00706 
00707 
00708     /* Now process the individual opcodes */
00709 
00710     switch (codevalue)
00711       {
00712 
00713 /* ========================================================================== */
00714       /* Reached a closing bracket. If not at the end of the pattern, carry
00715       on with the next opcode. Otherwise, unless we have an empty string and
00716       PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
00717       start of the subject, save the match data, shifting up all previous
00718       matches so we always have the longest first. */
00719 
00720       case OP_KET:
00721       case OP_KETRMIN:
00722       case OP_KETRMAX:
00723       if (code != end_code)
00724         {
00725         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
00726         if (codevalue != OP_KET)
00727           {
00728           ADD_ACTIVE(state_offset - GET(code, 1), 0);
00729           }
00730         }
00731       else
00732         {
00733         if (ptr > current_subject ||
00734             ((md->moptions & PCRE_NOTEMPTY) == 0 &&
00735               ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
00736                 current_subject > start_subject + md->start_offset)))
00737           {
00738           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
00739             else if (match_count > 0 && ++match_count * 2 >= offsetcount)
00740               match_count = 0;
00741           count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
00742           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
00743           if (offsetcount >= 2)
00744             {
00745             offsets[0] = current_subject - start_subject;
00746             offsets[1] = ptr - start_subject;
00747             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
00748               offsets[1] - offsets[0], current_subject));
00749             }
00750           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
00751             {
00752             DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
00753               "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
00754               match_count, rlevel*2-2, SP));
00755             return match_count;
00756             }
00757           }
00758         }
00759       break;
00760 
00761 /* ========================================================================== */
00762       /* These opcodes add to the current list of states without looking
00763       at the current character. */
00764 
00765       /*-----------------------------------------------------------------*/
00766       case OP_ALT:
00767       do { code += GET(code, 1); } while (*code == OP_ALT);
00768       ADD_ACTIVE(code - start_code, 0);
00769       break;
00770 
00771       /*-----------------------------------------------------------------*/
00772       case OP_BRA:
00773       case OP_SBRA:
00774       do
00775         {
00776         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
00777         code += GET(code, 1);
00778         }
00779       while (*code == OP_ALT);
00780       break;
00781 
00782       /*-----------------------------------------------------------------*/
00783       case OP_CBRA:
00784       case OP_SCBRA:
00785       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
00786       code += GET(code, 1);
00787       while (*code == OP_ALT)
00788         {
00789         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
00790         code += GET(code, 1);
00791         }
00792       break;
00793 
00794       /*-----------------------------------------------------------------*/
00795       case OP_BRAZERO:
00796       case OP_BRAMINZERO:
00797       ADD_ACTIVE(state_offset + 1, 0);
00798       code += 1 + GET(code, 2);
00799       while (*code == OP_ALT) code += GET(code, 1);
00800       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
00801       break;
00802 
00803       /*-----------------------------------------------------------------*/
00804       case OP_SKIPZERO:
00805       code += 1 + GET(code, 2);
00806       while (*code == OP_ALT) code += GET(code, 1);
00807       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
00808       break;
00809 
00810       /*-----------------------------------------------------------------*/
00811       case OP_CIRC:
00812       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
00813           ((ims & PCRE_MULTILINE) != 0 &&
00814             ptr != end_subject &&
00815             WAS_NEWLINE(ptr)))
00816         { ADD_ACTIVE(state_offset + 1, 0); }
00817       break;
00818 
00819       /*-----------------------------------------------------------------*/
00820       case OP_EOD:
00821       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
00822       break;
00823 
00824       /*-----------------------------------------------------------------*/
00825       case OP_OPT:
00826       ims = code[1];
00827       ADD_ACTIVE(state_offset + 2, 0);
00828       break;
00829 
00830       /*-----------------------------------------------------------------*/
00831       case OP_SOD:
00832       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
00833       break;
00834 
00835       /*-----------------------------------------------------------------*/
00836       case OP_SOM:
00837       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
00838       break;
00839 
00840 
00841 /* ========================================================================== */
00842       /* These opcodes inspect the next subject character, and sometimes
00843       the previous one as well, but do not have an argument. The variable
00844       clen contains the length of the current character and is zero if we are
00845       at the end of the subject. */
00846 
00847       /*-----------------------------------------------------------------*/
00848       case OP_ANY:
00849       if (clen > 0 && !IS_NEWLINE(ptr))
00850         { ADD_NEW(state_offset + 1, 0); }
00851       break;
00852 
00853       /*-----------------------------------------------------------------*/
00854       case OP_ALLANY:
00855       if (clen > 0)
00856         { ADD_NEW(state_offset + 1, 0); }
00857       break;
00858 
00859       /*-----------------------------------------------------------------*/
00860       case OP_EODN:
00861       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
00862         { ADD_ACTIVE(state_offset + 1, 0); }
00863       break;
00864 
00865       /*-----------------------------------------------------------------*/
00866       case OP_DOLL:
00867       if ((md->moptions & PCRE_NOTEOL) == 0)
00868         {
00869         if (clen == 0 ||
00870             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
00871                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
00872             ))
00873           { ADD_ACTIVE(state_offset + 1, 0); }
00874         }
00875       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
00876         { ADD_ACTIVE(state_offset + 1, 0); }
00877       break;
00878 
00879       /*-----------------------------------------------------------------*/
00880 
00881       case OP_DIGIT:
00882       case OP_WHITESPACE:
00883       case OP_WORDCHAR:
00884       if (clen > 0 && c < 256 &&
00885             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
00886         { ADD_NEW(state_offset + 1, 0); }
00887       break;
00888 
00889       /*-----------------------------------------------------------------*/
00890       case OP_NOT_DIGIT:
00891       case OP_NOT_WHITESPACE:
00892       case OP_NOT_WORDCHAR:
00893       if (clen > 0 && (c >= 256 ||
00894             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
00895         { ADD_NEW(state_offset + 1, 0); }
00896       break;
00897 
00898       /*-----------------------------------------------------------------*/
00899       case OP_WORD_BOUNDARY:
00900       case OP_NOT_WORD_BOUNDARY:
00901         {
00902         int left_word, right_word;
00903 
00904         if (ptr > start_subject)
00905           {
00906           const uschar *temp = ptr - 1;
00907           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
00908 #ifdef SUPPORT_UTF8
00909           if (utf8) BACKCHAR(temp);
00910 #endif
00911           GETCHARTEST(d, temp);
00912           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
00913           }
00914         else left_word = 0;
00915 
00916         if (clen > 0)
00917           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
00918         else right_word = 0;
00919 
00920         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
00921           { ADD_ACTIVE(state_offset + 1, 0); }
00922         }
00923       break;
00924 
00925 
00926       /*-----------------------------------------------------------------*/
00927       /* Check the next character by Unicode property. We will get here only
00928       if the support is in the binary; otherwise a compile-time error occurs.
00929       */
00930 
00931 #ifdef SUPPORT_UCP
00932       case OP_PROP:
00933       case OP_NOTPROP:
00934       if (clen > 0)
00935         {
00936         BOOL OK;
00937         const ucd_record * prop = GET_UCD(c);
00938         switch(code[1])
00939           {
00940           case PT_ANY:
00941           OK = TRUE;
00942           break;
00943 
00944           case PT_LAMP:
00945           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
00946           break;
00947 
00948           case PT_GC:
00949           OK = _pcre_ucp_gentype[prop->chartype] == code[2];
00950           break;
00951 
00952           case PT_PC:
00953           OK = prop->chartype == code[2];
00954           break;
00955 
00956           case PT_SC:
00957           OK = prop->script == code[2];
00958           break;
00959 
00960           /* Should never occur, but keep compilers from grumbling. */
00961 
00962           default:
00963           OK = codevalue != OP_PROP;
00964           break;
00965           }
00966 
00967         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
00968         }
00969       break;
00970 #endif
00971 
00972 
00973 
00974 /* ========================================================================== */
00975       /* These opcodes likewise inspect the subject character, but have an
00976       argument that is not a data character. It is one of these opcodes:
00977       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
00978       OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
00979 
00980       case OP_TYPEPLUS:
00981       case OP_TYPEMINPLUS:
00982       case OP_TYPEPOSPLUS:
00983       count = current_state->count;  /* Already matched */
00984       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
00985       if (clen > 0)
00986         {
00987         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00988             (c < 256 &&
00989               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00990               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00991           {
00992           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
00993             {
00994             active_count--;            /* Remove non-match possibility */
00995             next_active_state--;
00996             }
00997           count++;
00998           ADD_NEW(state_offset, count);
00999           }
01000         }
01001       break;
01002 
01003       /*-----------------------------------------------------------------*/
01004       case OP_TYPEQUERY:
01005       case OP_TYPEMINQUERY:
01006       case OP_TYPEPOSQUERY:
01007       ADD_ACTIVE(state_offset + 2, 0);
01008       if (clen > 0)
01009         {
01010         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
01011             (c < 256 &&
01012               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
01013               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
01014           {
01015           if (codevalue == OP_TYPEPOSQUERY)
01016             {
01017             active_count--;            /* Remove non-match possibility */
01018             next_active_state--;
01019             }
01020           ADD_NEW(state_offset + 2, 0);
01021           }
01022         }
01023       break;
01024 
01025       /*-----------------------------------------------------------------*/
01026       case OP_TYPESTAR:
01027       case OP_TYPEMINSTAR:
01028       case OP_TYPEPOSSTAR:
01029       ADD_ACTIVE(state_offset + 2, 0);
01030       if (clen > 0)
01031         {
01032         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
01033             (c < 256 &&
01034               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
01035               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
01036           {
01037           if (codevalue == OP_TYPEPOSSTAR)
01038             {
01039             active_count--;            /* Remove non-match possibility */
01040             next_active_state--;
01041             }
01042           ADD_NEW(state_offset, 0);
01043           }
01044         }
01045       break;
01046 
01047       /*-----------------------------------------------------------------*/
01048       case OP_TYPEEXACT:
01049       count = current_state->count;  /* Number already matched */
01050       if (clen > 0)
01051         {
01052         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
01053             (c < 256 &&
01054               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
01055               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
01056           {
01057           if (++count >= GET2(code, 1))
01058             { ADD_NEW(state_offset + 4, 0); }
01059           else
01060             { ADD_NEW(state_offset, count); }
01061           }
01062         }
01063       break;
01064 
01065       /*-----------------------------------------------------------------*/
01066       case OP_TYPEUPTO:
01067       case OP_TYPEMINUPTO:
01068       case OP_TYPEPOSUPTO:
01069       ADD_ACTIVE(state_offset + 4, 0);
01070       count = current_state->count;  /* Number already matched */
01071       if (clen > 0)
01072         {
01073         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
01074             (c < 256 &&
01075               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
01076               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
01077           {
01078           if (codevalue == OP_TYPEPOSUPTO)
01079             {
01080             active_count--;           /* Remove non-match possibility */
01081             next_active_state--;
01082             }
01083           if (++count >= GET2(code, 1))
01084             { ADD_NEW(state_offset + 4, 0); }
01085           else
01086             { ADD_NEW(state_offset, count); }
01087           }
01088         }
01089       break;
01090 
01091 /* ========================================================================== */
01092       /* These are virtual opcodes that are used when something like
01093       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
01094       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
01095       code above fast for the other cases. The argument is in the d variable. */
01096 
01097 #ifdef SUPPORT_UCP
01098       case OP_PROP_EXTRA + OP_TYPEPLUS:
01099       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
01100       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
01101       count = current_state->count;           /* Already matched */
01102       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
01103       if (clen > 0)
01104         {
01105         BOOL OK;
01106         const ucd_record * prop = GET_UCD(c);
01107         switch(code[2])
01108           {
01109           case PT_ANY:
01110           OK = TRUE;
01111           break;
01112 
01113           case PT_LAMP:
01114           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
01115           break;
01116 
01117           case PT_GC:
01118           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
01119           break;
01120 
01121           case PT_PC:
01122           OK = prop->chartype == code[3];
01123           break;
01124 
01125           case PT_SC:
01126           OK = prop->script == code[3];
01127           break;
01128 
01129           /* Should never occur, but keep compilers from grumbling. */
01130 
01131           default:
01132           OK = codevalue != OP_PROP;
01133           break;
01134           }
01135 
01136         if (OK == (d == OP_PROP))
01137           {
01138           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
01139             {
01140             active_count--;           /* Remove non-match possibility */
01141             next_active_state--;
01142             }
01143           count++;
01144           ADD_NEW(state_offset, count);
01145           }
01146         }
01147       break;
01148 
01149       /*-----------------------------------------------------------------*/
01150       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
01151       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
01152       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
01153       count = current_state->count;  /* Already matched */
01154       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01155       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01156         {
01157         const uschar *nptr = ptr + clen;
01158         int ncount = 0;
01159         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
01160           {
01161           active_count--;           /* Remove non-match possibility */
01162           next_active_state--;
01163           }
01164         while (nptr < end_subject)
01165           {
01166           int nd;
01167           int ndlen = 1;
01168           GETCHARLEN(nd, nptr, ndlen);
01169           if (UCD_CATEGORY(nd) != ucp_M) break;
01170           ncount++;
01171           nptr += ndlen;
01172           }
01173         count++;
01174         ADD_NEW_DATA(-state_offset, count, ncount);
01175         }
01176       break;
01177 #endif
01178 
01179       /*-----------------------------------------------------------------*/
01180       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
01181       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
01182       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
01183       count = current_state->count;  /* Already matched */
01184       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01185       if (clen > 0)
01186         {
01187         int ncount = 0;
01188         switch (c)
01189           {
01190           case 0x000b:
01191           case 0x000c:
01192           case 0x0085:
01193           case 0x2028:
01194           case 0x2029:
01195           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01196           goto ANYNL01;
01197 
01198           case 0x000d:
01199           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
01200           /* Fall through */
01201 
01202           ANYNL01:
01203           case 0x000a:
01204           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
01205             {
01206             active_count--;           /* Remove non-match possibility */
01207             next_active_state--;
01208             }
01209           count++;
01210           ADD_NEW_DATA(-state_offset, count, ncount);
01211           break;
01212 
01213           default:
01214           break;
01215           }
01216         }
01217       break;
01218 
01219       /*-----------------------------------------------------------------*/
01220       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
01221       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
01222       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
01223       count = current_state->count;  /* Already matched */
01224       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01225       if (clen > 0)
01226         {
01227         BOOL OK;
01228         switch (c)
01229           {
01230           case 0x000a:
01231           case 0x000b:
01232           case 0x000c:
01233           case 0x000d:
01234           case 0x0085:
01235           case 0x2028:
01236           case 0x2029:
01237           OK = TRUE;
01238           break;
01239 
01240           default:
01241           OK = FALSE;
01242           break;
01243           }
01244 
01245         if (OK == (d == OP_VSPACE))
01246           {
01247           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
01248             {
01249             active_count--;           /* Remove non-match possibility */
01250             next_active_state--;
01251             }
01252           count++;
01253           ADD_NEW_DATA(-state_offset, count, 0);
01254           }
01255         }
01256       break;
01257 
01258       /*-----------------------------------------------------------------*/
01259       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
01260       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
01261       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
01262       count = current_state->count;  /* Already matched */
01263       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01264       if (clen > 0)
01265         {
01266         BOOL OK;
01267         switch (c)
01268           {
01269           case 0x09:      /* HT */
01270           case 0x20:      /* SPACE */
01271           case 0xa0:      /* NBSP */
01272           case 0x1680:    /* OGHAM SPACE MARK */
01273           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01274           case 0x2000:    /* EN QUAD */
01275           case 0x2001:    /* EM QUAD */
01276           case 0x2002:    /* EN SPACE */
01277           case 0x2003:    /* EM SPACE */
01278           case 0x2004:    /* THREE-PER-EM SPACE */
01279           case 0x2005:    /* FOUR-PER-EM SPACE */
01280           case 0x2006:    /* SIX-PER-EM SPACE */
01281           case 0x2007:    /* FIGURE SPACE */
01282           case 0x2008:    /* PUNCTUATION SPACE */
01283           case 0x2009:    /* THIN SPACE */
01284           case 0x200A:    /* HAIR SPACE */
01285           case 0x202f:    /* NARROW NO-BREAK SPACE */
01286           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01287           case 0x3000:    /* IDEOGRAPHIC SPACE */
01288           OK = TRUE;
01289           break;
01290 
01291           default:
01292           OK = FALSE;
01293           break;
01294           }
01295 
01296         if (OK == (d == OP_HSPACE))
01297           {
01298           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
01299             {
01300             active_count--;           /* Remove non-match possibility */
01301             next_active_state--;
01302             }
01303           count++;
01304           ADD_NEW_DATA(-state_offset, count, 0);
01305           }
01306         }
01307       break;
01308 
01309       /*-----------------------------------------------------------------*/
01310 #ifdef SUPPORT_UCP
01311       case OP_PROP_EXTRA + OP_TYPEQUERY:
01312       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
01313       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
01314       count = 4;
01315       goto QS1;
01316 
01317       case OP_PROP_EXTRA + OP_TYPESTAR:
01318       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
01319       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
01320       count = 0;
01321 
01322       QS1:
01323 
01324       ADD_ACTIVE(state_offset + 4, 0);
01325       if (clen > 0)
01326         {
01327         BOOL OK;
01328         const ucd_record * prop = GET_UCD(c);
01329         switch(code[2])
01330           {
01331           case PT_ANY:
01332           OK = TRUE;
01333           break;
01334 
01335           case PT_LAMP:
01336           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
01337           break;
01338 
01339           case PT_GC:
01340           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
01341           break;
01342 
01343           case PT_PC:
01344           OK = prop->chartype == code[3];
01345           break;
01346 
01347           case PT_SC:
01348           OK = prop->script == code[3];
01349           break;
01350 
01351           /* Should never occur, but keep compilers from grumbling. */
01352 
01353           default:
01354           OK = codevalue != OP_PROP;
01355           break;
01356           }
01357 
01358         if (OK == (d == OP_PROP))
01359           {
01360           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
01361               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
01362             {
01363             active_count--;           /* Remove non-match possibility */
01364             next_active_state--;
01365             }
01366           ADD_NEW(state_offset + count, 0);
01367           }
01368         }
01369       break;
01370 
01371       /*-----------------------------------------------------------------*/
01372       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
01373       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
01374       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
01375       count = 2;
01376       goto QS2;
01377 
01378       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
01379       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
01380       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
01381       count = 0;
01382 
01383       QS2:
01384 
01385       ADD_ACTIVE(state_offset + 2, 0);
01386       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01387         {
01388         const uschar *nptr = ptr + clen;
01389         int ncount = 0;
01390         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
01391             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
01392           {
01393           active_count--;           /* Remove non-match possibility */
01394           next_active_state--;
01395           }
01396         while (nptr < end_subject)
01397           {
01398           int nd;
01399           int ndlen = 1;
01400           GETCHARLEN(nd, nptr, ndlen);
01401           if (UCD_CATEGORY(nd) != ucp_M) break;
01402           ncount++;
01403           nptr += ndlen;
01404           }
01405         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
01406         }
01407       break;
01408 #endif
01409 
01410       /*-----------------------------------------------------------------*/
01411       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
01412       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
01413       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
01414       count = 2;
01415       goto QS3;
01416 
01417       case OP_ANYNL_EXTRA + OP_TYPESTAR:
01418       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
01419       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
01420       count = 0;
01421 
01422       QS3:
01423       ADD_ACTIVE(state_offset + 2, 0);
01424       if (clen > 0)
01425         {
01426         int ncount = 0;
01427         switch (c)
01428           {
01429           case 0x000b:
01430           case 0x000c:
01431           case 0x0085:
01432           case 0x2028:
01433           case 0x2029:
01434           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01435           goto ANYNL02;
01436 
01437           case 0x000d:
01438           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
01439           /* Fall through */
01440 
01441           ANYNL02:
01442           case 0x000a:
01443           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
01444               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
01445             {
01446             active_count--;           /* Remove non-match possibility */
01447             next_active_state--;
01448             }
01449           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
01450           break;
01451 
01452           default:
01453           break;
01454           }
01455         }
01456       break;
01457 
01458       /*-----------------------------------------------------------------*/
01459       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
01460       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
01461       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
01462       count = 2;
01463       goto QS4;
01464 
01465       case OP_VSPACE_EXTRA + OP_TYPESTAR:
01466       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
01467       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
01468       count = 0;
01469 
01470       QS4:
01471       ADD_ACTIVE(state_offset + 2, 0);
01472       if (clen > 0)
01473         {
01474         BOOL OK;
01475         switch (c)
01476           {
01477           case 0x000a:
01478           case 0x000b:
01479           case 0x000c:
01480           case 0x000d:
01481           case 0x0085:
01482           case 0x2028:
01483           case 0x2029:
01484           OK = TRUE;
01485           break;
01486 
01487           default:
01488           OK = FALSE;
01489           break;
01490           }
01491         if (OK == (d == OP_VSPACE))
01492           {
01493           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
01494               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
01495             {
01496             active_count--;           /* Remove non-match possibility */
01497             next_active_state--;
01498             }
01499           ADD_NEW_DATA(-(state_offset + count), 0, 0);
01500           }
01501         }
01502       break;
01503 
01504       /*-----------------------------------------------------------------*/
01505       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
01506       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
01507       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
01508       count = 2;
01509       goto QS5;
01510 
01511       case OP_HSPACE_EXTRA + OP_TYPESTAR:
01512       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
01513       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
01514       count = 0;
01515 
01516       QS5:
01517       ADD_ACTIVE(state_offset + 2, 0);
01518       if (clen > 0)
01519         {
01520         BOOL OK;
01521         switch (c)
01522           {
01523           case 0x09:      /* HT */
01524           case 0x20:      /* SPACE */
01525           case 0xa0:      /* NBSP */
01526           case 0x1680:    /* OGHAM SPACE MARK */
01527           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01528           case 0x2000:    /* EN QUAD */
01529           case 0x2001:    /* EM QUAD */
01530           case 0x2002:    /* EN SPACE */
01531           case 0x2003:    /* EM SPACE */
01532           case 0x2004:    /* THREE-PER-EM SPACE */
01533           case 0x2005:    /* FOUR-PER-EM SPACE */
01534           case 0x2006:    /* SIX-PER-EM SPACE */
01535           case 0x2007:    /* FIGURE SPACE */
01536           case 0x2008:    /* PUNCTUATION SPACE */
01537           case 0x2009:    /* THIN SPACE */
01538           case 0x200A:    /* HAIR SPACE */
01539           case 0x202f:    /* NARROW NO-BREAK SPACE */
01540           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01541           case 0x3000:    /* IDEOGRAPHIC SPACE */
01542           OK = TRUE;
01543           break;
01544 
01545           default:
01546           OK = FALSE;
01547           break;
01548           }
01549 
01550         if (OK == (d == OP_HSPACE))
01551           {
01552           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
01553               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
01554             {
01555             active_count--;           /* Remove non-match possibility */
01556             next_active_state--;
01557             }
01558           ADD_NEW_DATA(-(state_offset + count), 0, 0);
01559           }
01560         }
01561       break;
01562 
01563       /*-----------------------------------------------------------------*/
01564 #ifdef SUPPORT_UCP
01565       case OP_PROP_EXTRA + OP_TYPEEXACT:
01566       case OP_PROP_EXTRA + OP_TYPEUPTO:
01567       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
01568       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
01569       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
01570         { ADD_ACTIVE(state_offset + 6, 0); }
01571       count = current_state->count;  /* Number already matched */
01572       if (clen > 0)
01573         {
01574         BOOL OK;
01575         const ucd_record * prop = GET_UCD(c);
01576         switch(code[4])
01577           {
01578           case PT_ANY:
01579           OK = TRUE;
01580           break;
01581 
01582           case PT_LAMP:
01583           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
01584           break;
01585 
01586           case PT_GC:
01587           OK = _pcre_ucp_gentype[prop->chartype] == code[5];
01588           break;
01589 
01590           case PT_PC:
01591           OK = prop->chartype == code[5];
01592           break;
01593 
01594           case PT_SC:
01595           OK = prop->script == code[5];
01596           break;
01597 
01598           /* Should never occur, but keep compilers from grumbling. */
01599 
01600           default:
01601           OK = codevalue != OP_PROP;
01602           break;
01603           }
01604 
01605         if (OK == (d == OP_PROP))
01606           {
01607           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
01608             {
01609             active_count--;           /* Remove non-match possibility */
01610             next_active_state--;
01611             }
01612           if (++count >= GET2(code, 1))
01613             { ADD_NEW(state_offset + 6, 0); }
01614           else
01615             { ADD_NEW(state_offset, count); }
01616           }
01617         }
01618       break;
01619 
01620       /*-----------------------------------------------------------------*/
01621       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
01622       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
01623       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
01624       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
01625       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
01626         { ADD_ACTIVE(state_offset + 4, 0); }
01627       count = current_state->count;  /* Number already matched */
01628       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01629         {
01630         const uschar *nptr = ptr + clen;
01631         int ncount = 0;
01632         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
01633           {
01634           active_count--;           /* Remove non-match possibility */
01635           next_active_state--;
01636           }
01637         while (nptr < end_subject)
01638           {
01639           int nd;
01640           int ndlen = 1;
01641           GETCHARLEN(nd, nptr, ndlen);
01642           if (UCD_CATEGORY(nd) != ucp_M) break;
01643           ncount++;
01644           nptr += ndlen;
01645           }
01646         if (++count >= GET2(code, 1))
01647           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
01648         else
01649           { ADD_NEW_DATA(-state_offset, count, ncount); }
01650         }
01651       break;
01652 #endif
01653 
01654       /*-----------------------------------------------------------------*/
01655       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
01656       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
01657       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
01658       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
01659       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
01660         { ADD_ACTIVE(state_offset + 4, 0); }
01661       count = current_state->count;  /* Number already matched */
01662       if (clen > 0)
01663         {
01664         int ncount = 0;
01665         switch (c)
01666           {
01667           case 0x000b:
01668           case 0x000c:
01669           case 0x0085:
01670           case 0x2028:
01671           case 0x2029:
01672           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01673           goto ANYNL03;
01674 
01675           case 0x000d:
01676           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
01677           /* Fall through */
01678 
01679           ANYNL03:
01680           case 0x000a:
01681           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
01682             {
01683             active_count--;           /* Remove non-match possibility */
01684             next_active_state--;
01685             }
01686           if (++count >= GET2(code, 1))
01687             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
01688           else
01689             { ADD_NEW_DATA(-state_offset, count, ncount); }
01690           break;
01691 
01692           default:
01693           break;
01694           }
01695         }
01696       break;
01697 
01698       /*-----------------------------------------------------------------*/
01699       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
01700       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
01701       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
01702       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
01703       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
01704         { ADD_ACTIVE(state_offset + 4, 0); }
01705       count = current_state->count;  /* Number already matched */
01706       if (clen > 0)
01707         {
01708         BOOL OK;
01709         switch (c)
01710           {
01711           case 0x000a:
01712           case 0x000b:
01713           case 0x000c:
01714           case 0x000d:
01715           case 0x0085:
01716           case 0x2028:
01717           case 0x2029:
01718           OK = TRUE;
01719           break;
01720 
01721           default:
01722           OK = FALSE;
01723           }
01724 
01725         if (OK == (d == OP_VSPACE))
01726           {
01727           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
01728             {
01729             active_count--;           /* Remove non-match possibility */
01730             next_active_state--;
01731             }
01732           if (++count >= GET2(code, 1))
01733             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
01734           else
01735             { ADD_NEW_DATA(-state_offset, count, 0); }
01736           }
01737         }
01738       break;
01739 
01740       /*-----------------------------------------------------------------*/
01741       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
01742       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
01743       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
01744       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
01745       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
01746         { ADD_ACTIVE(state_offset + 4, 0); }
01747       count = current_state->count;  /* Number already matched */
01748       if (clen > 0)
01749         {
01750         BOOL OK;
01751         switch (c)
01752           {
01753           case 0x09:      /* HT */
01754           case 0x20:      /* SPACE */
01755           case 0xa0:      /* NBSP */
01756           case 0x1680:    /* OGHAM SPACE MARK */
01757           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01758           case 0x2000:    /* EN QUAD */
01759           case 0x2001:    /* EM QUAD */
01760           case 0x2002:    /* EN SPACE */
01761           case 0x2003:    /* EM SPACE */
01762           case 0x2004:    /* THREE-PER-EM SPACE */
01763           case 0x2005:    /* FOUR-PER-EM SPACE */
01764           case 0x2006:    /* SIX-PER-EM SPACE */
01765           case 0x2007:    /* FIGURE SPACE */
01766           case 0x2008:    /* PUNCTUATION SPACE */
01767           case 0x2009:    /* THIN SPACE */
01768           case 0x200A:    /* HAIR SPACE */
01769           case 0x202f:    /* NARROW NO-BREAK SPACE */
01770           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01771           case 0x3000:    /* IDEOGRAPHIC SPACE */
01772           OK = TRUE;
01773           break;
01774 
01775           default:
01776           OK = FALSE;
01777           break;
01778           }
01779 
01780         if (OK == (d == OP_HSPACE))
01781           {
01782           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
01783             {
01784             active_count--;           /* Remove non-match possibility */
01785             next_active_state--;
01786             }
01787           if (++count >= GET2(code, 1))
01788             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
01789           else
01790             { ADD_NEW_DATA(-state_offset, count, 0); }
01791           }
01792         }
01793       break;
01794 
01795 /* ========================================================================== */
01796       /* These opcodes are followed by a character that is usually compared
01797       to the current subject character; it is loaded into d. We still get
01798       here even if there is no subject character, because in some cases zero
01799       repetitions are permitted. */
01800 
01801       /*-----------------------------------------------------------------*/
01802       case OP_CHAR:
01803       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
01804       break;
01805 
01806       /*-----------------------------------------------------------------*/
01807       case OP_CHARNC:
01808       if (clen == 0) break;
01809 
01810 #ifdef SUPPORT_UTF8
01811       if (utf8)
01812         {
01813         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
01814           {
01815           unsigned int othercase;
01816           if (c < 128) othercase = fcc[c]; else
01817 
01818           /* If we have Unicode property support, we can use it to test the
01819           other case of the character. */
01820 
01821 #ifdef SUPPORT_UCP
01822           othercase = UCD_OTHERCASE(c);
01823 #else
01824           othercase = NOTACHAR;
01825 #endif
01826 
01827           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
01828           }
01829         }
01830       else
01831 #endif  /* SUPPORT_UTF8 */
01832 
01833       /* Non-UTF-8 mode */
01834         {
01835         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
01836         }
01837       break;
01838 
01839 
01840 #ifdef SUPPORT_UCP
01841       /*-----------------------------------------------------------------*/
01842       /* This is a tricky one because it can match more than one character.
01843       Find out how many characters to skip, and then set up a negative state
01844       to wait for them to pass before continuing. */
01845 
01846       case OP_EXTUNI:
01847       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01848         {
01849         const uschar *nptr = ptr + clen;
01850         int ncount = 0;
01851         while (nptr < end_subject)
01852           {
01853           int nclen = 1;
01854           GETCHARLEN(c, nptr, nclen);
01855           if (UCD_CATEGORY(c) != ucp_M) break;
01856           ncount++;
01857           nptr += nclen;
01858           }
01859         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
01860         }
01861       break;
01862 #endif
01863 
01864       /*-----------------------------------------------------------------*/
01865       /* This is a tricky one, like EXTUNI, because it too can match more than
01866       one character (when CR is followed by LF). In this case, set up a
01867       negative state to wait for one character to pass before continuing. */
01868 
01869       case OP_ANYNL:
01870       if (clen > 0) switch(c)
01871         {
01872         case 0x000b:
01873         case 0x000c:
01874         case 0x0085:
01875         case 0x2028:
01876         case 0x2029:
01877         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01878 
01879         case 0x000a:
01880         ADD_NEW(state_offset + 1, 0);
01881         break;
01882 
01883         case 0x000d:
01884         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
01885           {
01886           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
01887           }
01888         else
01889           {
01890           ADD_NEW(state_offset + 1, 0);
01891           }
01892         break;
01893         }
01894       break;
01895 
01896       /*-----------------------------------------------------------------*/
01897       case OP_NOT_VSPACE:
01898       if (clen > 0) switch(c)
01899         {
01900         case 0x000a:
01901         case 0x000b:
01902         case 0x000c:
01903         case 0x000d:
01904         case 0x0085:
01905         case 0x2028:
01906         case 0x2029:
01907         break;
01908 
01909         default:
01910         ADD_NEW(state_offset + 1, 0);
01911         break;
01912         }
01913       break;
01914 
01915       /*-----------------------------------------------------------------*/
01916       case OP_VSPACE:
01917       if (clen > 0) switch(c)
01918         {
01919         case 0x000a:
01920         case 0x000b:
01921         case 0x000c:
01922         case 0x000d:
01923         case 0x0085:
01924         case 0x2028:
01925         case 0x2029:
01926         ADD_NEW(state_offset + 1, 0);
01927         break;
01928 
01929         default: break;
01930         }
01931       break;
01932 
01933       /*-----------------------------------------------------------------*/
01934       case OP_NOT_HSPACE:
01935       if (clen > 0) switch(c)
01936         {
01937         case 0x09:      /* HT */
01938         case 0x20:      /* SPACE */
01939         case 0xa0:      /* NBSP */
01940         case 0x1680:    /* OGHAM SPACE MARK */
01941         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01942         case 0x2000:    /* EN QUAD */
01943         case 0x2001:    /* EM QUAD */
01944         case 0x2002:    /* EN SPACE */
01945         case 0x2003:    /* EM SPACE */
01946         case 0x2004:    /* THREE-PER-EM SPACE */
01947         case 0x2005:    /* FOUR-PER-EM SPACE */
01948         case 0x2006:    /* SIX-PER-EM SPACE */
01949         case 0x2007:    /* FIGURE SPACE */
01950         case 0x2008:    /* PUNCTUATION SPACE */
01951         case 0x2009:    /* THIN SPACE */
01952         case 0x200A:    /* HAIR SPACE */
01953         case 0x202f:    /* NARROW NO-BREAK SPACE */
01954         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01955         case 0x3000:    /* IDEOGRAPHIC SPACE */
01956         break;
01957 
01958         default:
01959         ADD_NEW(state_offset + 1, 0);
01960         break;
01961         }
01962       break;
01963 
01964       /*-----------------------------------------------------------------*/
01965       case OP_HSPACE:
01966       if (clen > 0) switch(c)
01967         {
01968         case 0x09:      /* HT */
01969         case 0x20:      /* SPACE */
01970         case 0xa0:      /* NBSP */
01971         case 0x1680:    /* OGHAM SPACE MARK */
01972         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01973         case 0x2000:    /* EN QUAD */
01974         case 0x2001:    /* EM QUAD */
01975         case 0x2002:    /* EN SPACE */
01976         case 0x2003:    /* EM SPACE */
01977         case 0x2004:    /* THREE-PER-EM SPACE */
01978         case 0x2005:    /* FOUR-PER-EM SPACE */
01979         case 0x2006:    /* SIX-PER-EM SPACE */
01980         case 0x2007:    /* FIGURE SPACE */
01981         case 0x2008:    /* PUNCTUATION SPACE */
01982         case 0x2009:    /* THIN SPACE */
01983         case 0x200A:    /* HAIR SPACE */
01984         case 0x202f:    /* NARROW NO-BREAK SPACE */
01985         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01986         case 0x3000:    /* IDEOGRAPHIC SPACE */
01987         ADD_NEW(state_offset + 1, 0);
01988         break;
01989         }
01990       break;
01991 
01992       /*-----------------------------------------------------------------*/
01993       /* Match a negated single character. This is only used for one-byte
01994       characters, that is, we know that d < 256. The character we are
01995       checking (c) can be multibyte. */
01996 
01997       case OP_NOT:
01998       if (clen > 0)
01999         {
02000         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
02001         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
02002         }
02003       break;
02004 
02005       /*-----------------------------------------------------------------*/
02006       case OP_PLUS:
02007       case OP_MINPLUS:
02008       case OP_POSPLUS:
02009       case OP_NOTPLUS:
02010       case OP_NOTMINPLUS:
02011       case OP_NOTPOSPLUS:
02012       count = current_state->count;  /* Already matched */
02013       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
02014       if (clen > 0)
02015         {
02016         unsigned int otherd = NOTACHAR;
02017         if ((ims & PCRE_CASELESS) != 0)
02018           {
02019 #ifdef SUPPORT_UTF8
02020           if (utf8 && d >= 128)
02021             {
02022 #ifdef SUPPORT_UCP
02023             otherd = UCD_OTHERCASE(d);
02024 #endif  /* SUPPORT_UCP */
02025             }
02026           else
02027 #endif  /* SUPPORT_UTF8 */
02028           otherd = fcc[d];
02029           }
02030         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
02031           {
02032           if (count > 0 &&
02033               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
02034             {
02035             active_count--;             /* Remove non-match possibility */
02036             next_active_state--;
02037             }
02038           count++;
02039           ADD_NEW(state_offset, count);
02040           }
02041         }
02042       break;
02043 
02044       /*-----------------------------------------------------------------*/
02045       case OP_QUERY:
02046       case OP_MINQUERY:
02047       case OP_POSQUERY:
02048       case OP_NOTQUERY:
02049       case OP_NOTMINQUERY:
02050       case OP_NOTPOSQUERY:
02051       ADD_ACTIVE(state_offset + dlen + 1, 0);
02052       if (clen > 0)
02053         {
02054         unsigned int otherd = NOTACHAR;
02055         if ((ims & PCRE_CASELESS) != 0)
02056           {
02057 #ifdef SUPPORT_UTF8
02058           if (utf8 && d >= 128)
02059             {
02060 #ifdef SUPPORT_UCP
02061             otherd = UCD_OTHERCASE(d);
02062 #endif  /* SUPPORT_UCP */
02063             }
02064           else
02065 #endif  /* SUPPORT_UTF8 */
02066           otherd = fcc[d];
02067           }
02068         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
02069           {
02070           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
02071             {
02072             active_count--;            /* Remove non-match possibility */
02073             next_active_state--;
02074             }
02075           ADD_NEW(state_offset + dlen + 1, 0);
02076           }
02077         }
02078       break;
02079 
02080       /*-----------------------------------------------------------------*/
02081       case OP_STAR:
02082       case OP_MINSTAR:
02083       case OP_POSSTAR:
02084       case OP_NOTSTAR:
02085       case OP_NOTMINSTAR:
02086       case OP_NOTPOSSTAR:
02087       ADD_ACTIVE(state_offset + dlen + 1, 0);
02088       if (clen > 0)
02089         {
02090         unsigned int otherd = NOTACHAR;
02091         if ((ims & PCRE_CASELESS) != 0)
02092           {
02093 #ifdef SUPPORT_UTF8
02094           if (utf8 && d >= 128)
02095             {
02096 #ifdef SUPPORT_UCP
02097             otherd = UCD_OTHERCASE(d);
02098 #endif  /* SUPPORT_UCP */
02099             }
02100           else
02101 #endif  /* SUPPORT_UTF8 */
02102           otherd = fcc[d];
02103           }
02104         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
02105           {
02106           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
02107             {
02108             active_count--;            /* Remove non-match possibility */
02109             next_active_state--;
02110             }
02111           ADD_NEW(state_offset, 0);
02112           }
02113         }
02114       break;
02115 
02116       /*-----------------------------------------------------------------*/
02117       case OP_EXACT:
02118       case OP_NOTEXACT:
02119       count = current_state->count;  /* Number already matched */
02120       if (clen > 0)
02121         {
02122         unsigned int otherd = NOTACHAR;
02123         if ((ims & PCRE_CASELESS) != 0)
02124           {
02125 #ifdef SUPPORT_UTF8
02126           if (utf8 && d >= 128)
02127             {
02128 #ifdef SUPPORT_UCP
02129             otherd = UCD_OTHERCASE(d);
02130 #endif  /* SUPPORT_UCP */
02131             }
02132           else
02133 #endif  /* SUPPORT_UTF8 */
02134           otherd = fcc[d];
02135           }
02136         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
02137           {
02138           if (++count >= GET2(code, 1))
02139             { ADD_NEW(state_offset + dlen + 3, 0); }
02140           else
02141             { ADD_NEW(state_offset, count); }
02142           }
02143         }
02144       break;
02145 
02146       /*-----------------------------------------------------------------*/
02147       case OP_UPTO:
02148       case OP_MINUPTO:
02149       case OP_POSUPTO:
02150       case OP_NOTUPTO:
02151       case OP_NOTMINUPTO:
02152       case OP_NOTPOSUPTO:
02153       ADD_ACTIVE(state_offset + dlen + 3, 0);
02154       count = current_state->count;  /* Number already matched */
02155       if (clen > 0)
02156         {
02157         unsigned int otherd = NOTACHAR;
02158         if ((ims & PCRE_CASELESS) != 0)
02159           {
02160 #ifdef SUPPORT_UTF8
02161           if (utf8 && d >= 128)
02162             {
02163 #ifdef SUPPORT_UCP
02164             otherd = UCD_OTHERCASE(d);
02165 #endif  /* SUPPORT_UCP */
02166             }
02167           else
02168 #endif  /* SUPPORT_UTF8 */
02169           otherd = fcc[d];
02170           }
02171         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
02172           {
02173           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
02174             {
02175             active_count--;             /* Remove non-match possibility */
02176             next_active_state--;
02177             }
02178           if (++count >= GET2(code, 1))
02179             { ADD_NEW(state_offset + dlen + 3, 0); }
02180           else
02181             { ADD_NEW(state_offset, count); }
02182           }
02183         }
02184       break;
02185 
02186 
02187 /* ========================================================================== */
02188       /* These are the class-handling opcodes */
02189 
02190       case OP_CLASS:
02191       case OP_NCLASS:
02192       case OP_XCLASS:
02193         {
02194         BOOL isinclass = FALSE;
02195         int next_state_offset;
02196         const uschar *ecode;
02197 
02198         /* For a simple class, there is always just a 32-byte table, and we
02199         can set isinclass from it. */
02200 
02201         if (codevalue != OP_XCLASS)
02202           {
02203           ecode = code + 33;
02204           if (clen > 0)
02205             {
02206             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
02207               ((code[1 + c/8] & (1 << (c&7))) != 0);
02208             }
02209           }
02210 
02211         /* An extended class may have a table or a list of single characters,
02212         ranges, or both, and it may be positive or negative. There's a
02213         function that sorts all this out. */
02214 
02215         else
02216          {
02217          ecode = code + GET(code, 1);
02218          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
02219          }
02220 
02221         /* At this point, isinclass is set for all kinds of class, and ecode
02222         points to the byte after the end of the class. If there is a
02223         quantifier, this is where it will be. */
02224 
02225         next_state_offset = ecode - start_code;
02226 
02227         switch (*ecode)
02228           {
02229           case OP_CRSTAR:
02230           case OP_CRMINSTAR:
02231           ADD_ACTIVE(next_state_offset + 1, 0);
02232           if (isinclass) { ADD_NEW(state_offset, 0); }
02233           break;
02234 
02235           case OP_CRPLUS:
02236           case OP_CRMINPLUS:
02237           count = current_state->count;  /* Already matched */
02238           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
02239           if (isinclass) { count++; ADD_NEW(state_offset, count); }
02240           break;
02241 
02242           case OP_CRQUERY:
02243           case OP_CRMINQUERY:
02244           ADD_ACTIVE(next_state_offset + 1, 0);
02245           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
02246           break;
02247 
02248           case OP_CRRANGE:
02249           case OP_CRMINRANGE:
02250           count = current_state->count;  /* Already matched */
02251           if (count >= GET2(ecode, 1))
02252             { ADD_ACTIVE(next_state_offset + 5, 0); }
02253           if (isinclass)
02254             {
02255             int max = GET2(ecode, 3);
02256             if (++count >= max && max != 0)   /* Max 0 => no limit */
02257               { ADD_NEW(next_state_offset + 5, 0); }
02258             else
02259               { ADD_NEW(state_offset, count); }
02260             }
02261           break;
02262 
02263           default:
02264           if (isinclass) { ADD_NEW(next_state_offset, 0); }
02265           break;
02266           }
02267         }
02268       break;
02269 
02270 /* ========================================================================== */
02271       /* These are the opcodes for fancy brackets of various kinds. We have
02272       to use recursion in order to handle them. The "always failing" assertion
02273       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
02274       though the other "backtracking verbs" are not supported. */
02275 
02276       case OP_FAIL:
02277       forced_fail++;    /* Count FAILs for multiple states */
02278       break;
02279 
02280       case OP_ASSERT:
02281       case OP_ASSERT_NOT:
02282       case OP_ASSERTBACK:
02283       case OP_ASSERTBACK_NOT:
02284         {
02285         int rc;
02286         int local_offsets[2];
02287         int local_workspace[1000];
02288         const uschar *endasscode = code + GET(code, 1);
02289 
02290         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
02291 
02292         rc = internal_dfa_exec(
02293           md,                                   /* static match data */
02294           code,                                 /* this subexpression's code */
02295           ptr,                                  /* where we currently are */
02296           ptr - start_subject,                  /* start offset */
02297           local_offsets,                        /* offset vector */
02298           sizeof(local_offsets)/sizeof(int),    /* size of same */
02299           local_workspace,                      /* workspace vector */
02300           sizeof(local_workspace)/sizeof(int),  /* size of same */
02301           ims,                                  /* the current ims flags */
02302           rlevel,                               /* function recursion level */
02303           recursing);                           /* pass on regex recursion */
02304 
02305         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
02306             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
02307         }
02308       break;
02309 
02310       /*-----------------------------------------------------------------*/
02311       case OP_COND:
02312       case OP_SCOND:
02313         {
02314         int local_offsets[1000];
02315         int local_workspace[1000];
02316         int codelink = GET(code, 1);
02317         int condcode;
02318 
02319         /* Because of the way auto-callout works during compile, a callout item
02320         is inserted between OP_COND and an assertion condition. This does not
02321         happen for the other conditions. */
02322 
02323         if (code[LINK_SIZE+1] == OP_CALLOUT)
02324           {
02325           rrc = 0;
02326           if (pcre_callout != NULL)
02327             {
02328             pcre_callout_block cb;
02329             cb.version          = 1;   /* Version 1 of the callout block */
02330             cb.callout_number   = code[LINK_SIZE+2];
02331             cb.offset_vector    = offsets;
02332             cb.subject          = (PCRE_SPTR)start_subject;
02333             cb.subject_length   = end_subject - start_subject;
02334             cb.start_match      = current_subject - start_subject;
02335             cb.current_position = ptr - start_subject;
02336             cb.pattern_position = GET(code, LINK_SIZE + 3);
02337             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
02338             cb.capture_top      = 1;
02339             cb.capture_last     = -1;
02340             cb.callout_data     = md->callout_data;
02341             if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
02342             }
02343           if (rrc > 0) break;                      /* Fail this thread */
02344           code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
02345           }
02346 
02347         condcode = code[LINK_SIZE+1];
02348 
02349         /* Back reference conditions are not supported */
02350 
02351         if (condcode == OP_CREF || condcode == OP_NCREF)
02352           return PCRE_ERROR_DFA_UCOND;
02353 
02354         /* The DEFINE condition is always false */
02355 
02356         if (condcode == OP_DEF)
02357           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
02358 
02359         /* The only supported version of OP_RREF is for the value RREF_ANY,
02360         which means "test if in any recursion". We can't test for specifically
02361         recursed groups. */
02362 
02363         else if (condcode == OP_RREF || condcode == OP_NRREF)
02364           {
02365           int value = GET2(code, LINK_SIZE+2);
02366           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
02367           if (recursing > 0)
02368             { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
02369           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
02370           }
02371 
02372         /* Otherwise, the condition is an assertion */
02373 
02374         else
02375           {
02376           int rc;
02377           const uschar *asscode = code + LINK_SIZE + 1;
02378           const uschar *endasscode = asscode + GET(asscode, 1);
02379 
02380           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
02381 
02382           rc = internal_dfa_exec(
02383             md,                                   /* fixed match data */
02384             asscode,                              /* this subexpression's code */
02385             ptr,                                  /* where we currently are */
02386             ptr - start_subject,                  /* start offset */
02387             local_offsets,                        /* offset vector */
02388             sizeof(local_offsets)/sizeof(int),    /* size of same */
02389             local_workspace,                      /* workspace vector */
02390             sizeof(local_workspace)/sizeof(int),  /* size of same */
02391             ims,                                  /* the current ims flags */
02392             rlevel,                               /* function recursion level */
02393             recursing);                           /* pass on regex recursion */
02394 
02395           if ((rc >= 0) ==
02396                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
02397             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
02398           else
02399             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
02400           }
02401         }
02402       break;
02403 
02404       /*-----------------------------------------------------------------*/
02405       case OP_RECURSE:
02406         {
02407         int local_offsets[1000];
02408         int local_workspace[1000];
02409         int rc;
02410 
02411         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
02412           recursing + 1));
02413 
02414         rc = internal_dfa_exec(
02415           md,                                   /* fixed match data */
02416           start_code + GET(code, 1),            /* this subexpression's code */
02417           ptr,                                  /* where we currently are */
02418           ptr - start_subject,                  /* start offset */
02419           local_offsets,                        /* offset vector */
02420           sizeof(local_offsets)/sizeof(int),    /* size of same */
02421           local_workspace,                      /* workspace vector */
02422           sizeof(local_workspace)/sizeof(int),  /* size of same */
02423           ims,                                  /* the current ims flags */
02424           rlevel,                               /* function recursion level */
02425           recursing + 1);                       /* regex recurse level */
02426 
02427         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
02428           recursing + 1, rc));
02429 
02430         /* Ran out of internal offsets */
02431 
02432         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
02433 
02434         /* For each successful matched substring, set up the next state with a
02435         count of characters to skip before trying it. Note that the count is in
02436         characters, not bytes. */
02437 
02438         if (rc > 0)
02439           {
02440           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
02441             {
02442             const uschar *p = start_subject + local_offsets[rc];
02443             const uschar *pp = start_subject + local_offsets[rc+1];
02444             int charcount = local_offsets[rc+1] - local_offsets[rc];
02445             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
02446             if (charcount > 0)
02447               {
02448               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
02449               }
02450             else
02451               {
02452               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
02453               }
02454             }
02455           }
02456         else if (rc != PCRE_ERROR_NOMATCH) return rc;
02457         }
02458       break;
02459 
02460       /*-----------------------------------------------------------------*/
02461       case OP_ONCE:
02462         {
02463         int local_offsets[2];
02464         int local_workspace[1000];
02465 
02466         int rc = internal_dfa_exec(
02467           md,                                   /* fixed match data */
02468           code,                                 /* this subexpression's code */
02469           ptr,                                  /* where we currently are */
02470           ptr - start_subject,                  /* start offset */
02471           local_offsets,                        /* offset vector */
02472           sizeof(local_offsets)/sizeof(int),    /* size of same */
02473           local_workspace,                      /* workspace vector */
02474           sizeof(local_workspace)/sizeof(int),  /* size of same */
02475           ims,                                  /* the current ims flags */
02476           rlevel,                               /* function recursion level */
02477           recursing);                           /* pass on regex recursion */
02478 
02479         if (rc >= 0)
02480           {
02481           const uschar *end_subpattern = code;
02482           int charcount = local_offsets[1] - local_offsets[0];
02483           int next_state_offset, repeat_state_offset;
02484 
02485           do { end_subpattern += GET(end_subpattern, 1); }
02486             while (*end_subpattern == OP_ALT);
02487           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
02488 
02489           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
02490           arrange for the repeat state also to be added to the relevant list.
02491           Calculate the offset, or set -1 for no repeat. */
02492 
02493           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
02494                                  *end_subpattern == OP_KETRMIN)?
02495             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
02496 
02497           /* If we have matched an empty string, add the next state at the
02498           current character pointer. This is important so that the duplicate
02499           checking kicks in, which is what breaks infinite loops that match an
02500           empty string. */
02501 
02502           if (charcount == 0)
02503             {
02504             ADD_ACTIVE(next_state_offset, 0);
02505             }
02506 
02507           /* Optimization: if there are no more active states, and there
02508           are no new states yet set up, then skip over the subject string
02509           right here, to save looping. Otherwise, set up the new state to swing
02510           into action when the end of the substring is reached. */
02511 
02512           else if (i + 1 >= active_count && new_count == 0)
02513             {
02514             ptr += charcount;
02515             clen = 0;
02516             ADD_NEW(next_state_offset, 0);
02517 
02518             /* If we are adding a repeat state at the new character position,
02519             we must fudge things so that it is the only current state.
02520             Otherwise, it might be a duplicate of one we processed before, and
02521             that would cause it to be skipped. */
02522 
02523             if (repeat_state_offset >= 0)
02524               {
02525               next_active_state = active_states;
02526               active_count = 0;
02527               i = -1;
02528               ADD_ACTIVE(repeat_state_offset, 0);
02529               }
02530             }
02531           else
02532             {
02533             const uschar *p = start_subject + local_offsets[0];
02534             const uschar *pp = start_subject + local_offsets[1];
02535             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
02536             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
02537             if (repeat_state_offset >= 0)
02538               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
02539             }
02540 
02541           }
02542         else if (rc != PCRE_ERROR_NOMATCH) return rc;
02543         }
02544       break;
02545 
02546 
02547 /* ========================================================================== */
02548       /* Handle callouts */
02549 
02550       case OP_CALLOUT:
02551       rrc = 0;
02552       if (pcre_callout != NULL)
02553         {
02554         pcre_callout_block cb;
02555         cb.version          = 1;   /* Version 1 of the callout block */
02556         cb.callout_number   = code[1];
02557         cb.offset_vector    = offsets;
02558         cb.subject          = (PCRE_SPTR)start_subject;
02559         cb.subject_length   = end_subject - start_subject;
02560         cb.start_match      = current_subject - start_subject;
02561         cb.current_position = ptr - start_subject;
02562         cb.pattern_position = GET(code, 2);
02563         cb.next_item_length = GET(code, 2 + LINK_SIZE);
02564         cb.capture_top      = 1;
02565         cb.capture_last     = -1;
02566         cb.callout_data     = md->callout_data;
02567         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
02568         }
02569       if (rrc == 0)
02570         { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
02571       break;
02572 
02573 
02574 /* ========================================================================== */
02575       default:        /* Unsupported opcode */
02576       return PCRE_ERROR_DFA_UITEM;
02577       }
02578 
02579     NEXT_ACTIVE_STATE: continue;
02580 
02581     }      /* End of loop scanning active states */
02582 
02583   /* We have finished the processing at the current subject character. If no
02584   new states have been set for the next character, we have found all the
02585   matches that we are going to find. If we are at the top level and partial
02586   matching has been requested, check for appropriate conditions.
02587 
02588   The "forced_fail" variable counts the number of (*F) encountered for the
02589   character. If it is equal to the original active_count (saved in
02590   workspace[1]) it means that (*F) was found on every active state. In this
02591   case we don't want to give a partial match.
02592 
02593   The "could_continue" variable is true if a state could have continued but
02594   for the fact that the end of the subject was reached. */
02595 
02596   if (new_count <= 0)
02597     {
02598     if (rlevel == 1 &&                               /* Top level, and */
02599         could_continue &&                            /* Some could go on */
02600         forced_fail != workspace[1] &&               /* Not all forced fail & */
02601         (                                            /* either... */
02602         (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
02603         ||                                           /* or... */
02604         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
02605          match_count < 0)                            /* no matches */
02606         ) &&                                         /* And... */
02607         ptr >= end_subject &&                     /* Reached end of subject */
02608         ptr > current_subject)                    /* Matched non-empty string */
02609       {
02610       if (offsetcount >= 2)
02611         {
02612         offsets[0] = md->start_used_ptr - start_subject;
02613         offsets[1] = end_subject - start_subject;
02614         }
02615       match_count = PCRE_ERROR_PARTIAL;
02616       }
02617 
02618     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
02619       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
02620       rlevel*2-2, SP));
02621     break;        /* In effect, "return", but see the comment below */
02622     }
02623 
02624   /* One or more states are active for the next character. */
02625 
02626   ptr += clen;    /* Advance to next subject character */
02627   }               /* Loop to move along the subject string */
02628 
02629 /* Control gets here from "break" a few lines above. We do it this way because
02630 if we use "return" above, we have compiler trouble. Some compilers warn if
02631 there's nothing here because they think the function doesn't return a value. On
02632 the other hand, if we put a dummy statement here, some more clever compilers
02633 complain that it can't be reached. Sigh. */
02634 
02635 return match_count;
02636 }
02637 
02638 
02639 
02640 
02641 /*************************************************
02642 *    Execute a Regular Expression - DFA engine   *
02643 *************************************************/
02644 
02645 /* This external function applies a compiled re to a subject string using a DFA
02646 engine. After checking its arguments, it calls the internal function once if the
02647 match is anchored or a restart, or else once per start position ("bumpalong").
02648 
02649 Arguments:
02650   argument_re     points to the compiled expression (must not be NULL)
02651   extra_data      points to extra data or is NULL
02652   subject         points to the subject string (must not be NULL)
02653   length          length of subject string (may contain binary zeros)
02654   start_offset    where to start in the subject string
02655   options         option bits (only PUBLIC_DFA_EXEC_OPTIONS bits are allowed)
02656   offsets         vector of match offsets (may be NULL if offsetcount is 0)
02657   offsetcount     size of same (must not be negative)
02658   workspace       workspace vector (must not be NULL)
02659   wscount         size of same (must be at least 20)
02660 
02661 Returns:          > 0 => number of match offset pairs placed in offsets
02662                   = 0 => offsets overflowed; longest matches are present
02663                    -1 => failed to match
02664                  < -1 => any other PCRE_ERROR_xxx code (bad argument, etc.)
02665 */
02666 
02667 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
02668 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
02669   const char *subject, int length, int start_offset, int options, int *offsets,
02670   int offsetcount, int *workspace, int wscount)
02671 {
02672 real_pcre *re = (real_pcre *)argument_re;
02673 dfa_match_data match_block;
02674 dfa_match_data *md = &match_block;
02675 BOOL utf8, anchored, startline, firstline;
02676 const uschar *current_subject, *end_subject, *lcc;
02677 
02678 pcre_study_data internal_study;
02679 const pcre_study_data *study = NULL;
02680 real_pcre internal_re;
02681 
02682 const uschar *req_byte_ptr;          /* where the required byte was last found */
02683 const uschar *start_bits = NULL;     /* bitmap of possible first bytes (study) */
02684 BOOL first_byte_caseless = FALSE;
02685 BOOL req_byte_caseless = FALSE;
02686 int first_byte = -1;                 /* -1 => no known first byte */
02687 int req_byte = -1;                   /* -1 => no known required byte */
02688 int req_byte2 = -1;                  /* case-flipped version of req_byte */
02689 int newline;
02690 
02691 /* Plausibility checks */
02692 
02693 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
02694 if (re == NULL || subject == NULL || workspace == NULL ||
02695    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
02696 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
02697 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
02698 
02699 /* We need to find the pointer to any study data before we test for byte
02700 flipping, so we scan the extra_data block first. This may set two fields in the
02701 match block, so we must initialize them beforehand. However, the other fields
02702 in the match block must not be set until after the byte flipping. */
02703 
02704 md->tables = re->tables;
02705 md->callout_data = NULL;
02706 
02707 if (extra_data != NULL)
02708   {
02709   unsigned int flags = extra_data->flags;
02710   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
02711     study = (const pcre_study_data *)extra_data->study_data;
02712   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
02713   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
02714     return PCRE_ERROR_DFA_UMLIMIT;     /* either match limit is rejected */
02715   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
02716     md->callout_data = extra_data->callout_data;
02717   if ((flags & PCRE_EXTRA_TABLES) != 0)
02718     md->tables = extra_data->tables;   /* overrides the tables taken from re */
02719   }
02720 
02721 /* Check that the first field in the block is the magic number. If it is not,
02722 test for a regex that was compiled on a host of opposite endianness. If this is
02723 the case, flipped values are put in internal_re and internal_study if there was
02724 study data too. */
02725 
02726 if (re->magic_number != MAGIC_NUMBER)
02727   {
02728   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
02729   if (re == NULL) return PCRE_ERROR_BADMAGIC;
02730   if (study != NULL) study = &internal_study;
02731   }
02732 
02733 /* Set some local values */
02734 
02735 current_subject = (const unsigned char *)subject + start_offset;
02736 end_subject = (const unsigned char *)subject + length;
02737 req_byte_ptr = current_subject - 1;   /* so the first req_byte search is done */
02738 
02739 #ifdef SUPPORT_UTF8
02740 utf8 = (re->options & PCRE_UTF8) != 0;
02741 #else
02742 utf8 = FALSE;
02743 #endif
02744 
02745 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
02746   (re->options & PCRE_ANCHORED) != 0;   /* a restart is treated as anchored */
02747 
02748 /* The remaining fixed data for passing around. Code follows the name table. */
02749 
02750 md->start_code = (const uschar *)argument_re +
02751     re->name_table_offset + re->name_count * re->name_entry_size;
02752 md->start_subject = (const unsigned char *)subject;
02753 md->end_subject = end_subject;
02754 md->start_offset = start_offset;
02755 md->moptions = options;
02756 md->poptions = re->options;
02757 
02758 /* If the BSR option is not set at match time, copy what was set
02759 at compile time. */
02760 
02761 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
02762   {
02763   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
02764     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
02765 #ifdef BSR_ANYCRLF
02766   else md->moptions |= PCRE_BSR_ANYCRLF;
02767 #endif
02768   }
02769 
02770 /* Handle different types of newline. The three bits give eight cases. If
02771 nothing is set at run time, whatever was used at compile time applies. */
02772 
02773 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
02774          PCRE_NEWLINE_BITS)
02775   {
02776   case 0: newline = NEWLINE; break;   /* Compile-time default */
02777   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
02778   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
02779   case PCRE_NEWLINE_CR+
02780        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; /* 2 bytes */
02781   case PCRE_NEWLINE_ANY: newline = -1; break;       /* becomes NLTYPE_ANY */
02782   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;   /* becomes NLTYPE_ANYCRLF */
02783   default: return PCRE_ERROR_BADNEWLINE;
02784   }
02785 
02786 if (newline == -2)
02787   {
02788   md->nltype = NLTYPE_ANYCRLF;
02789   }
02790 else if (newline < 0)
02791   {
02792   md->nltype = NLTYPE_ANY;
02793   }
02794 else
02795   {
02796   md->nltype = NLTYPE_FIXED;
02797   if (newline > 255)                  /* two bytes were packed into the value */
02798     {
02799     md->nllen = 2;
02800     md->nl[0] = (newline >> 8) & 255;
02801     md->nl[1] = newline & 255;
02802     }
02803   else
02804     {
02805     md->nllen = 1;
02806     md->nl[0] = newline;
02807     }
02808   }
02809 
02810 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
02811 back the character offset. */
02812 
02813 #ifdef SUPPORT_UTF8
02814 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
02815   {
02816   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
02817     return PCRE_ERROR_BADUTF8;
02818   if (start_offset > 0 && start_offset < length)
02819     {
02820     int tb = ((uschar *)subject)[start_offset];
02821     if (tb > 127)
02822       {
02823       tb &= 0xc0;                     /* 0x80 here => a continuation byte */
02824       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
02825       }
02826     }
02827   }
02828 #endif
02829 
02830 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
02831 is a feature that makes it possible to save compiled regex and re-use them
02832 in other programs later. */
02833 
02834 if (md->tables == NULL) md->tables = _pcre_default_tables;
02835 
02836 /* The lower casing table and the "must be at the start of a line" flag are
02837 used in a loop when finding where to start. */
02838 
02839 lcc = md->tables + lcc_offset;
02840 startline = (re->flags & PCRE_STARTLINE) != 0;
02841 firstline = (re->options & PCRE_FIRSTLINE) != 0;
02842 
02843 /* Set up the first character to match, if available. The first_byte value is
02844 never set for an anchored regular expression, but the anchoring may be forced
02845 at run time, so we have to test for anchoring. The first char may be unset for
02846 an unanchored pattern, of course. If there's no first char and the pattern was
02847 studied, there may be a bitmap of possible first characters. */
02848 
02849 if (!anchored)
02850   {
02851   if ((re->flags & PCRE_FIRSTSET) != 0)
02852     {
02853     first_byte = re->first_byte & 255;
02854     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
02855       first_byte = lcc[first_byte];   /* compare in lower case from now on */
02856     }
02857   else
02858     {
02859     if (!startline && study != NULL &&
02860          (study->flags & PCRE_STUDY_MAPPED) != 0)
02861       start_bits = study->start_bits;
02862     }
02863   }
02864 
02865 /* For anchored or unanchored matches, there may be a "last known required
02866 character" set. */
02867 
02868 if ((re->flags & PCRE_REQCHSET) != 0)
02869   {
02870   req_byte = re->req_byte & 255;
02871   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
02872   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
02873   }
02874 
02875 /* Call the main matching function, looping for a non-anchored regex after a
02876 failed match. If not restarting, perform certain optimizations at the start of
02877 a match. */
02878 
02879 for (;;)
02880   {
02881   int rc;
02882 
02883   if ((options & PCRE_DFA_RESTART) == 0)
02884     {
02885     const uschar *save_end_subject = end_subject;
02886 
02887     /* If firstline is TRUE, the start of the match is constrained to the first
02888     line of a multiline string. Implement this by temporarily adjusting
02889     end_subject so that we stop scanning at a newline. If the match fails at
02890     the newline, later code breaks this loop. */
02891 
02892     if (firstline)
02893       {
02894       USPTR t = current_subject;
02895 #ifdef SUPPORT_UTF8
02896       if (utf8)
02897         {
02898         while (t < md->end_subject && !IS_NEWLINE(t))
02899           {
02900           t++;
02901           while (t < end_subject && (*t & 0xc0) == 0x80) t++;
02902           }
02903         }
02904       else
02905 #endif
02906       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
02907       end_subject = t;
02908       }
02909 
02910     /* There are some optimizations that avoid running the match if a known
02911     starting point is not found. However, there is an option that disables
02912     these, for testing and for ensuring that all callouts do actually occur. */
02913 
02914     if ((options & PCRE_NO_START_OPTIMIZE) == 0)
02915       {
02916       /* Advance to a known first byte. */
02917 
02918       if (first_byte >= 0)
02919         {
02920         if (first_byte_caseless)
02921           while (current_subject < end_subject &&
02922                  lcc[*current_subject] != first_byte)
02923             current_subject++;
02924         else
02925           while (current_subject < end_subject &&
02926                  *current_subject != first_byte)
02927             current_subject++;
02928         }
02929 
02930       /* Or to just after a linebreak for a multiline match if possible */
02931 
02932       else if (startline)
02933         {
02934         if (current_subject > md->start_subject + start_offset)
02935           {
02936 #ifdef SUPPORT_UTF8
02937           if (utf8)
02938             {
02939             while (current_subject < end_subject &&
02940                    !WAS_NEWLINE(current_subject))
02941               {
02942               current_subject++;
02943               while(current_subject < end_subject &&
02944                     (*current_subject & 0xc0) == 0x80)
02945                 current_subject++;
02946               }
02947             }
02948           else
02949 #endif
02950           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
02951             current_subject++;
02952 
02953           /* If we have just passed a CR and the newline option is ANY or
02954           ANYCRLF, and we are now at a LF, advance the match position by one
02955           more character. */
02956 
02957           if (current_subject[-1] == CHAR_CR &&
02958                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
02959                current_subject < end_subject &&
02960                *current_subject == CHAR_NL)
02961             current_subject++;
02962           }
02963         }
02964 
02965       /* Or to a non-unique first char after study */
02966 
02967       else if (start_bits != NULL)
02968         {
02969         while (current_subject < end_subject)
02970           {
02971           register unsigned int c = *current_subject;
02972           if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
02973             else break;
02974           }
02975         }
02976       }
02977 
02978     /* Restore fudged end_subject */
02979 
02980     end_subject = save_end_subject;
02981 
02982     /* The following two optimizations are disabled for partial matching or if
02983     disabling is explicitly requested (and of course, by the test above, this
02984     code is not obeyed when restarting after a partial match). */
02985 
02986     if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
02987         (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
02988       {
02989       /* If the pattern was studied, a minimum subject length may be set. This
02990       is only a lower bound; there may be no string of that length that matches
02991       the pattern. Although the value is, strictly, in characters, we treat it as
02992       bytes to avoid spending too much time in this optimization. */
02993 
02994       if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
02995           end_subject - current_subject < study->minlength)
02996         return PCRE_ERROR_NOMATCH;
02997 
02998       /* If req_byte is set, we know that that character must appear in the
02999       subject for the match to succeed. If the first character is set, req_byte
03000       must be later in the subject; otherwise the test starts at the match
03001       point. This optimization can save a huge amount of work in patterns with
03002       nested unlimited repeats that aren't going to match. Writing separate
03003       code for cased/caseless versions makes it go faster, as does using an
03004       autoincrement and backing off on a match.
03005 
03006       HOWEVER: when the subject string is very, very long, searching to its end
03007       can take a long time, and give bad performance on quite ordinary
03008       patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
03009       string... so we don't do this when the string is sufficiently long. */
03010 
03011       if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
03012         {
03013         register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
03014 
03015         /* We don't need to repeat the search if we haven't yet reached the
03016         place we found it at last time. */
03017 
03018         if (p > req_byte_ptr)
03019           {
03020           if (req_byte_caseless)
03021             {
03022             while (p < end_subject)
03023               {
03024               register int pp = *p++;
03025               if (pp == req_byte || pp == req_byte2) { p--; break; }
03026               }
03027             }
03028           else
03029             {
03030             while (p < end_subject)
03031               {
03032               if (*p++ == req_byte) { p--; break; }
03033               }
03034             }
03035 
03036           /* If we can't find the required character, break the matching loop,
03037           which will cause a return of PCRE_ERROR_NOMATCH. */
03038 
03039           if (p >= end_subject) break;
03040 
03041           /* If we have found the required character, save the point where we
03042           found it, so that we don't search again next time round the loop if
03043           the start hasn't passed this character yet. */
03044 
03045           req_byte_ptr = p;
03046           }
03047         }
03048       }
03049     }   /* End of optimizations that are done when not restarting */
03050 
03051   /* OK, now we can do the business */
03052 
03053   md->start_used_ptr = current_subject;
03054 
03055   rc = internal_dfa_exec(
03056     md,                                /* fixed match data */
03057     md->start_code,                    /* this subexpression's code */
03058     current_subject,                   /* where we currently are */
03059     start_offset,                      /* start offset in subject */
03060     offsets,                           /* offset vector */
03061     offsetcount,                       /* size of same */
03062     workspace,                         /* workspace vector */
03063     wscount,                           /* size of same */
03064     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
03065     0,                                 /* function recurse level */
03066     0);                                /* regex recurse level */
03067 
03068   /* Anything other than "no match" means we are done, always; otherwise, carry
03069   on only if not anchored. */
03070 
03071   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
03072 
03073   /* Advance to the next subject character unless we are at the end of a line
03074   and firstline is set. */
03075 
03076   if (firstline && IS_NEWLINE(current_subject)) break;
03077   current_subject++;
03078   if (utf8)
03079     {
03080     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
03081       current_subject++;
03082     }
03083   if (current_subject > end_subject) break;   /* a try at the very end is allowed */
03084 
03085   /* If we have just passed a CR and we are now at a LF, and the pattern does
03086   not contain any explicit matches for \r or \n, and the newline option is CRLF
03087   or ANY or ANYCRLF, advance the match position by one more character. */
03088 
03089   if (current_subject[-1] == CHAR_CR &&
03090       current_subject < end_subject &&
03091       *current_subject == CHAR_NL &&
03092       (re->flags & PCRE_HASCRORLF) == 0 &&
03093         (md->nltype == NLTYPE_ANY ||
03094          md->nltype == NLTYPE_ANYCRLF ||
03095          md->nllen == 2))
03096     current_subject++;
03097 
03098   }   /* "Bumpalong" loop */
03099 
03100 return PCRE_ERROR_NOMATCH;
03101 }
03102 
03103 /* End of pcre_dfa_exec.c */
src/pcre/pcre_dfa_exec.c