Standard Portable Library: src/pcre/pcre

00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language.
00007 
00008                        Written by Philip Hazel
00009            Copyright (c) 1997-2009 University of Cambridge
00010 
00011 -----------------------------------------------------------------------------
00012 Redistribution and use in source and binary forms, with or without
00013 modification, are permitted provided that the following conditions are met:
00014 
00015     * Redistributions of source code must retain the above copyright notice,
00016       this list of conditions and the following disclaimer.
00017 
00018     * Redistributions in binary form must reproduce the above copyright
00019       notice, this list of conditions and the following disclaimer in the
00020       documentation and/or other materials provided with the distribution.
00021 
00022     * Neither the name of the University of Cambridge nor the names of its
00023       contributors may be used to endorse or promote products derived from
00024       this software without specific prior written permission.
00025 
00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00036 POSSIBILITY OF SUCH DAMAGE.
00037 -----------------------------------------------------------------------------
00038 */
00039 
00040 
00041 /* This module contains the external function pcre_compile(), along with
00042 supporting internal functions that are not used by other modules. */
00043 
/* Pull in the build configuration: the generated config.h when the build
system supplies one (HAVE_CONFIG_H), otherwise the bundled Windows
configuration on Windows targets. "#else if" is not a valid directive - the
preprocessor treats it as a plain #else and ignores the condition - so a real
#elif is used. _WIN32 is accepted as well as _WINDOWS so that Windows builds
that do not define _WINDOWS still get the Windows configuration. */

#ifdef HAVE_CONFIG_H
#include "config.h"
#elif defined(_WINDOWS) || defined(_WIN32)
#include <spl/configwin32.h>
#endif
00049 
00050 #define NLBLOCK cd             /* Block containing newline information */
00051 #define PSSTART start_pattern  /* Field containing processed string start */
00052 #define PSEND   end_pattern    /* Field containing processed string end */
00053 
00054 #include "pcre_internal.h"
00055 
00056 
00057 /* When DEBUG is defined, we need the pcre_printint() function, which is also
00058 used by pcretest. DEBUG is not defined when building a production library. */
00059 
00060 #ifdef DEBUG
00061 #include "pcre_printint.src"
00062 #endif
00063 
00064 
/* Macro for setting individual bits in class bitmaps: sets bit (b % 8) of
byte (b / 8) in the byte array a. The arguments and the whole expansion are
parenthesized so that expression arguments, such as SETBIT(map, c + 1), expand
as intended. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
00068 
00069 /* Maximum length value to check against when making sure that the integer that
00070 holds the compiled pattern length does not overflow. We make it a bit less than
00071 INT_MAX to allow for adding in group terminating bytes, so that we don't have
00072 to check them every time. */
00073 
00074 #define OFLOW_MAX (INT_MAX - 20)
00075 
00076 
00077 /*************************************************
00078 *      Code parameters and static tables         *
00079 *************************************************/
00080 
00081 /* This value specifies the size of stack workspace that is used during the
00082 first pre-compile phase that determines how much memory is required. The regex
00083 is partly compiled into this space, but the compiled parts are discarded as
00084 soon as they can be, so that hopefully there will never be an overrun. The code
00085 does, however, check for an overrun. The largest amount I've seen used is 218,
00086 so this number is very generous.
00087 
00088 The same workspace is used during the second, actual compile phase for
00089 remembering forward references to groups so that they can be filled in at the
00090 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
00091 is 4 there is plenty of room. */
00092 
00093 #define COMPILE_WORK_SIZE (4096)
00094 
00095 
00096 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
00097 are simple data values; negative values are for special things like \d and so
00098 on. Zero means further processing is needed (for things like \x), or the escape
00099 is invalid. */
00100 
00101 #ifndef EBCDIC
00102 
00103 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
00104 in UTF-8 mode. */
00105 
00106 static const short int escapes[] = {          /* indexed by (c - CHAR_0); two entries per row */
00107      0,                       0,              /* 0 1 */
00108      0,                       0,              /* 2 3 */
00109      0,                       0,              /* 4 5 */
00110      0,                       0,              /* 6 7 */
00111      0,                       0,              /* 8 9 */
00112      CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
00113      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
00114      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
00115      CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
00116      -ESC_B,                  -ESC_C,                     /* B C */
00117      -ESC_D,                  -ESC_E,                     /* D E */
00118      0,                       -ESC_G,                     /* F G */
00119      -ESC_H,                  0,                          /* H I */
00120      0,                       -ESC_K,                     /* J K */
00121      0,                       0,                          /* L M */
00122      0,                       0,                          /* N O */
00123      -ESC_P,                  -ESC_Q,                     /* P Q */
00124      -ESC_R,                  -ESC_S,                     /* R S */
00125      0,                       0,                          /* T U */
00126      -ESC_V,                  -ESC_W,                     /* V W */
00127      -ESC_X,                  0,                          /* X Y */
00128      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
00129      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* \ ] */
00130      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
00131      CHAR_GRAVE_ACCENT,       7,                          /* ` a (7 is the data value returned for \a) */
00132      -ESC_b,                  0,                          /* b c */
00133      -ESC_d,                  ESC_e,                      /* d e */
00134      ESC_f,                   0,                          /* f g */
00135      -ESC_h,                  0,                          /* h i */
00136      0,                       -ESC_k,                     /* j k */
00137      0,                       0,                          /* l m */
00138      ESC_n,                   0,                          /* n o */
00139      -ESC_p,                  0,                          /* p q */
00140      ESC_r,                   -ESC_s,                     /* r s */
00141      ESC_tee,                 0,                          /* t u */
00142      -ESC_v,                  -ESC_w,                     /* v w */
00143      0,                       0,                          /* x y */
00144      -ESC_z                                               /* z */
00145 };
00146 
00147 #else
00148 
00149 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
00150 
00151 static const short int escapes[] = {
00152 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
00153 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
00154 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
00155 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
00156 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
00157 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
00158 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
00159 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
00160 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
00161 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
00162 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
00163 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
00164 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
00165 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
00166 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
00167 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
00168 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
00169 /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
00170 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
00171 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
00172 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
00173 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
00174 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
00175 };
00176 #endif
00177 
00178 
00179 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
00180 searched linearly. Put all the names into a single string, in order to reduce
00181 the number of relocations when a shared library is dynamically linked. The
00182 string is built from string macros so that it works in UTF-8 mode on EBCDIC
00183 platforms. */
00184 
00185 typedef struct verbitem {
00186   int   len;                 /* length of the verb's name, excluding its terminating zero */
00187   int   op;                  /* the OP_xxx value that goes with the verb */
00188 } verbitem;
00189 
00190 static const char verbnames[] =     /* all names in one string, in the same order as verbs[] */
00191   STRING_ACCEPT0
00192   STRING_COMMIT0
00193   STRING_F0
00194   STRING_FAIL0
00195   STRING_PRUNE0
00196   STRING_SKIP0
00197   STRING_THEN;
00198 
00199 static const verbitem verbs[] = {   /* one entry per name in verbnames, in the same order */
00200   { 6, OP_ACCEPT },                 /* ACCEPT */
00201   { 6, OP_COMMIT },                 /* COMMIT */
00202   { 1, OP_FAIL },                   /* F - shares OP_FAIL with FAIL */
00203   { 4, OP_FAIL },                   /* FAIL */
00204   { 5, OP_PRUNE },                  /* PRUNE */
00205   { 4, OP_SKIP  },                  /* SKIP */
00206   { 4, OP_THEN  }                   /* THEN */
00207 };
00208 
00209 static const int verbcount = sizeof(verbs)/sizeof(verbitem);   /* number of entries in verbs[] */
00210 
00211 
00212 /* Tables of names of POSIX character classes and their lengths. The names are
00213 now all in a single string, to reduce the number of relocations when a shared
00214 library is dynamically loaded. The list of lengths is terminated by a zero
00215 length entry. The first three must be alpha, lower, upper, as this is assumed
00216 for handling case independence. */
00217 
00218 static const char posix_names[] =
00219   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
00220   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
00221   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
00222   STRING_word0  STRING_xdigit;
00223 
00224 static const uschar posix_name_lengths[] = {   /* one length per name in posix_names, in the same order */
00225   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };   /* alpha..space are 5 long, word is 4, xdigit is 6; the final 0 ends the list */
00226 
00227 /* Table of class bit maps for each POSIX class. Each class is formed from a
00228 base map, with an optional addition or removal of another map. Then, for some
00229 classes, there is some additional tweaking: for [:blank:] the vertical space
00230 characters are removed, and for [:alpha:] and [:alnum:] the underscore
00231 character is removed. The triples in the table consist of the base map offset,
00232 second map offset or -1 if no second map, and a non-negative value for map
00233 addition or a negative value for map subtraction (if there are two maps). The
00234 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
00235 remove vertical space characters, 2 => remove underscore. */
00236 
00237 static const int posix_class_maps[] = {   /* three ints per class, in posix_names order */
00238   cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, without underscore */
00239   cbit_lower, -1,          0,             /* lower */
00240   cbit_upper, -1,          0,             /* upper */
00241   cbit_word,  -1,          2,             /* alnum - word without underscore */
00242   cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
00243   cbit_space, -1,          1,             /* blank - a GNU extension; space without vertical space */
00244   cbit_cntrl, -1,          0,             /* cntrl */
00245   cbit_digit, -1,          0,             /* digit */
00246   cbit_graph, -1,          0,             /* graph */
00247   cbit_print, -1,          0,             /* print */
00248   cbit_punct, -1,          0,             /* punct */
00249   cbit_space, -1,          0,             /* space */
00250   cbit_word,  -1,          0,             /* word - a Perl extension */
00251   cbit_xdigit,-1,          0              /* xdigit */
00252 };
00253 
00254 
00255 #define STRING(a)  # a             /* turn the argument, exactly as written, into a string literal */
00256 #define XSTRING(s) STRING(s)       /* macro-expand s first, then stringify the result (used for inserts such as MAX_NAME_SIZE) */
00257 
00258 /* The texts of compile-time error messages. These are "char *" because they
00259 are passed to the outside world. Do not ever re-use any error number, because
00260 they are documented. Always add a new error instead. Messages marked DEAD below
00261 are no longer used. This used to be a table of strings, but in order to reduce
00262 the number of relocations needed when a shared library is loaded dynamically,
00263 it is now one long string. We cannot use a table of offsets, because the
00264 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
00265 simply count through to the one we want - this isn't a performance issue
00266 because these strings are used only when there is a compilation error. */
00267 
00268 static const char error_texts[] =     /* message n is the n-th zero-terminated string, counting from 0 */
00269   "no error\0"
00270   "\\ at end of pattern\0"
00271   "\\c at end of pattern\0"
00272   "unrecognized character follows \\\0"
00273   "numbers out of order in {} quantifier\0"
00274   /* 5 */
00275   "number too big in {} quantifier\0"
00276   "missing terminating ] for character class\0"
00277   "invalid escape sequence in character class\0"
00278   "range out of order in character class\0"
00279   "nothing to repeat\0"
00280   /* 10 */
00281   "operand of unlimited repeat could match the empty string\0"  /* NOTE(review): possibly one of the DEAD (no longer used) messages; marker seems to be missing - confirm */
00282   "internal error: unexpected repeat\0"
00283   "unrecognized character after (? or (?-\0"
00284   "POSIX named classes are supported only within a class\0"
00285   "missing )\0"
00286   /* 15 */
00287   "reference to non-existent subpattern\0"
00288   "erroffset passed as NULL\0"
00289   "unknown option bit(s) set\0"
00290   "missing ) after comment\0"
00291   "parentheses nested too deeply\0"  /* NOTE(review): possibly one of the DEAD (no longer used) messages; marker seems to be missing - confirm */
00292   /* 20 */
00293   "regular expression is too large\0"
00294   "failed to get memory\0"
00295   "unmatched parentheses\0"
00296   "internal error: code overflow\0"
00297   "unrecognized character after (?<\0"
00298   /* 25 */
00299   "lookbehind assertion is not fixed length\0"
00300   "malformed number or name after (?(\0"
00301   "conditional group contains more than two branches\0"
00302   "assertion expected after (?(\0"
00303   "(?R or (?[+-]digits must be followed by )\0"
00304   /* 30 */
00305   "unknown POSIX class name\0"
00306   "POSIX collating elements are not supported\0"
00307   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
00308   "spare error\0"  /* placeholder that keeps the numbering intact; presumably DEAD (no longer used) - confirm */
00309   "character value in \\x{...} sequence is too large\0"
00310   /* 35 */
00311   "invalid condition (?(0)\0"
00312   "\\C not allowed in lookbehind assertion\0"
00313   "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
00314   "number after (?C is > 255\0"
00315   "closing ) for (?C expected\0"
00316   /* 40 */
00317   "recursive call could loop indefinitely\0"
00318   "unrecognized character after (?P\0"
00319   "syntax error in subpattern name (missing terminator)\0"
00320   "two named subpatterns have the same name\0"
00321   "invalid UTF-8 string\0"
00322   /* 45 */
00323   "support for \\P, \\p, and \\X has not been compiled\0"
00324   "malformed \\P or \\p sequence\0"
00325   "unknown property name after \\P or \\p\0"
00326   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
00327   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
00328   /* 50 */
00329   "repeated subpattern is too long\0"    /* NOTE(review): possibly an unused message; marker seems to be missing - confirm */
00330   "octal value is greater than \\377 (not in UTF-8 mode)\0"
00331   "internal error: overran compiling workspace\0"
00332   "internal error: previously-checked referenced subpattern not found\0"
00333   "DEFINE group contains more than one branch\0"
00334   /* 55 */
00335   "repeating a DEFINE group is not allowed\0"
00336   "inconsistent NEWLINE options\0"
00337   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
00338   "a numbered reference must not be zero\0"
00339   "(*VERB) with an argument is not supported\0"
00340   /* 60 */
00341   "(*VERB) not recognized\0"
00342   "number is too big\0"
00343   "subpattern name expected\0"
00344   "digit expected after (?+\0"
00345   "] is an invalid data character in JavaScript compatibility mode\0"
00346   /* 65 */
00347   "different names for subpatterns of the same number are not allowed";   /* last entry: no explicit \0, the literal's own terminator ends it */
00348 
00349 
00350 /* Table to identify digits and hex digits. This is used when compiling
00351 patterns. Note that the tables in chartables are dependent on the locale, and
00352 may mark arbitrary characters as digits - but the PCRE compiling code expects
00353 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
00354 a private table here. It costs 256 bytes, but it is a lot faster than doing
00355 character value tests (at least in some simple cases I timed), and in some
00356 applications one wants PCRE to compile efficiently as well as match
00357 efficiently.
00358 
00359 For convenience, we use the same bit definitions as in chartables:
00360 
00361   0x04   decimal digit
00362   0x08   hexadecimal digit
00363 
00364 Then we can use ctype_digit and ctype_xdigit in the code. */
00365 
00366 #ifndef EBCDIC
00367 
00368 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
00369 UTF-8 mode. */
00370 
00371 static const unsigned char digitab[] =
00372   {
00373   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
00374   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
00375   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
00376   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
00377   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
00378   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
00379   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
00380   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
00381   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
00382   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
00383   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
00384   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
00385   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
00386   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
00387   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
00388   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
00389   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
00390   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
00391   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
00392   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
00393   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
00394   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
00395   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
00396   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
00397   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
00398   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
00399   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
00400   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
00401   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
00402   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
00403   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
00404   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
00405 
00406 #else
00407 
00408 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
00409 
00410 static const unsigned char digitab[] =
00411   {
00412   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
00413   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
00414   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
00415   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
00416   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
00417   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
00418   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
00419   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
00420   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
00421   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
00422   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
00423   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
00424   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
00425   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
00426   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
00427   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
00428   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
00429   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
00430   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
00431   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
00432   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
00433   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
00434   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
00435   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
00436   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
00437   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
00438   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
00439   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
00440   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
00441   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
00442   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
00443   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
00444 
00445 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
00446   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
00447   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
00448   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
00449   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
00450   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
00451   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
00452   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
00453   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
00454   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
00455   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
00456   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
00457   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
00458   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
00459   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
00460   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
00461   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
00462   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
00463   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
00464   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
00465   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
00466   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
00467   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
00468   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
00469   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
00470   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
00471   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
00472   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
00473   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
00474   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
00475   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
00476   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
00477   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
00478 #endif
00479 
00480 
00481 /* Definition to allow mutual recursion */
00482 
00483 static BOOL
00484   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
00485     int *, int *, branch_chain *, compile_data *, int *);
00486 
00487 
00488 
00489 /*************************************************
00490 *            Find an error text                  *
00491 *************************************************/
00492 
00493 /* The error texts are now all in one long string, to save on relocations. As
00494 some of the text is of unknown length, we can't use a table of offsets.
00495 Instead, just count through the strings. This is not a performance issue
00496 because it happens only when there has been a compilation error.
00497 
00498 Argument:   the error number
00499 Returns:    pointer to the error string
00500 */
00501 
00502 static const char *
00503 find_error_text(int n)
00504 {
00505 const char *s = error_texts;
00506 for (; n > 0; n--) while (*s++ != 0) {};
00507 return s;
00508 }
00509 
00510 
00511 /*************************************************
00512 *            Handle escapes                      *
00513 *************************************************/
00514 
00515 /* This function is called when a \ has been encountered. It either returns a
00516 positive value for a simple escape such as \n, or a negative value which
00517 encodes one of the more complicated things such as \d. A backreference to group
00518 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
00519 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
00520 ptr is pointing at the \. On exit, it is on the final character of the escape
00521 sequence.
00522 
00523 Arguments:
00524   ptrptr         points to the pattern position pointer
00525   errorcodeptr   points to the errorcode variable
00526   bracount       number of previous extracting brackets
00527   options        the options bits
00528   isclass        TRUE if inside a character class
00529 
00530 Returns:         zero or positive => a data character
00531                  negative => a special escape sequence
00532                  on error, errorcodeptr is set
00533 */
00534 
00535 static int
00536 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
00537   int options, BOOL isclass)
00538 {
00539 BOOL utf8 = (options & PCRE_UTF8) != 0;
00540 const uschar *ptr = *ptrptr + 1;
00541 int c, i;
00542 
00543 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
00544 ptr--;                            /* Set pointer back to the last byte */
00545 
00546 /* If backslash is at the end of the pattern, it's an error. */
00547 
00548 if (c == 0) *errorcodeptr = ERR1;
00549 
00550 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
00551 in a table. A non-zero result is something that can be returned immediately.
00552 Otherwise further processing may be required. */
00553 
00554 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
00555 else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
00556 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
00557 
00558 #else           /* EBCDIC coding */
00559 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
00560 else if ((i = escapes[c - 0x48]) != 0)  c = i;
00561 #endif
00562 
00563 /* Escapes that need further processing, or are illegal. */
00564 
00565 else
00566   {
00567   const uschar *oldptr;
00568   BOOL braced, negated;
00569 
00570   switch (c)
00571     {
00572     /* A number of Perl escapes are not handled by PCRE. We give an explicit
00573     error. */
00574 
00575     case CHAR_l:
00576     case CHAR_L:
00577     case CHAR_N:
00578     case CHAR_u:
00579     case CHAR_U:
00580     *errorcodeptr = ERR37;
00581     break;
00582 
00583     /* \g must be followed by one of a number of specific things:
00584 
00585     (1) A number, either plain or braced. If positive, it is an absolute
00586     backreference. If negative, it is a relative backreference. This is a Perl
00587     5.10 feature.
00588 
00589     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
00590     is part of Perl's movement towards a unified syntax for back references. As
00591     this is synonymous with \k{name}, we fudge it up by pretending it really
00592     was \k.
00593 
00594     (3) For Oniguruma compatibility we also support \g followed by a name or a
00595     number either in angle brackets or in single quotes. However, these are
00596     (possibly recursive) subroutine calls, _not_ backreferences. Just return
00597     the -ESC_g code (cf \k). */
00598 
00599     case CHAR_g:
00600     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
00601       {
00602       c = -ESC_g;
00603       break;
00604       }
00605 
00606     /* Handle the Perl-compatible cases */
00607 
00608     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
00609       {
00610       const uschar *p;
00611       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
00612         if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
00613       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
00614         {
00615         c = -ESC_k;
00616         break;
00617         }
00618       braced = TRUE;
00619       ptr++;
00620       }
00621     else braced = FALSE;
00622 
00623     if (ptr[1] == CHAR_MINUS)
00624       {
00625       negated = TRUE;
00626       ptr++;
00627       }
00628     else negated = FALSE;
00629 
00630     c = 0;
00631     while ((digitab[ptr[1]] & ctype_digit) != 0)
00632       c = c * 10 + *(++ptr) - CHAR_0;
00633 
00634     if (c < 0)   /* Integer overflow */
00635       {
00636       *errorcodeptr = ERR61;
00637       break;
00638       }
00639 
00640     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
00641       {
00642       *errorcodeptr = ERR57;
00643       break;
00644       }
00645 
00646     if (c == 0)
00647       {
00648       *errorcodeptr = ERR58;
00649       break;
00650       }
00651 
00652     if (negated)
00653       {
00654       if (c > bracount)
00655         {
00656         *errorcodeptr = ERR15;
00657         break;
00658         }
00659       c = bracount - (c - 1);
00660       }
00661 
00662     c = -(ESC_REF + c);
00663     break;
00664 
00665     /* The handling of escape sequences consisting of a string of digits
00666     starting with one that is not zero is not straightforward. By experiment,
00667     the way Perl works seems to be as follows:
00668 
00669     Outside a character class, the digits are read as a decimal number. If the
00670     number is less than 10, or if there are that many previous extracting
00671     left brackets, then it is a back reference. Otherwise, up to three octal
00672     digits are read to form an escaped byte. Thus \123 is likely to be octal
00673     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
00674     value is greater than 377, the least significant 8 bits are taken. Inside a
00675     character class, \ followed by a digit is always an octal number. */
00676 
00677     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
00678     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
00679 
00680     if (!isclass)
00681       {
00682       oldptr = ptr;
00683       c -= CHAR_0;
00684       while ((digitab[ptr[1]] & ctype_digit) != 0)
00685         c = c * 10 + *(++ptr) - CHAR_0;
00686       if (c < 0)    /* Integer overflow */
00687         {
00688         *errorcodeptr = ERR61;
00689         break;
00690         }
00691       if (c < 10 || c <= bracount)
00692         {
00693         c = -(ESC_REF + c);
00694         break;
00695         }
00696       ptr = oldptr;      /* Put the pointer back and fall through */
00697       }
00698 
00699     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
00700     generates a binary zero byte and treats the digit as a following literal.
00701     Thus we have to pull back the pointer by one. */
00702 
00703     if ((c = *ptr) >= CHAR_8)
00704       {
00705       ptr--;
00706       c = 0;
00707       break;
00708       }
00709 
00710     /* \0 always starts an octal number, but we may drop through to here with a
00711     larger first octal digit. The original code used just to take the least
00712     significant 8 bits of octal numbers (I think this is what early Perls used
00713     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
00714     than 3 octal digits. */
00715 
00716     case CHAR_0:
00717     c -= CHAR_0;
00718     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
00719         c = c * 8 + *(++ptr) - CHAR_0;
00720     if (!utf8 && c > 255) *errorcodeptr = ERR51;
00721     break;
00722 
00723     /* \x is complicated. \x{ddd} is a character number which can be greater
00724     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
00725     treated as a data character. */
00726 
00727     case CHAR_x:
00728     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
00729       {
00730       const uschar *pt = ptr + 2;
00731       int count = 0;
00732 
00733       c = 0;
00734       while ((digitab[*pt] & ctype_xdigit) != 0)
00735         {
00736         register int cc = *pt++;
00737         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
00738         count++;
00739 
00740 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
00741         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
00742         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
00743 #else           /* EBCDIC coding */
00744         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
00745         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
00746 #endif
00747         }
00748 
00749       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
00750         {
00751         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
00752         ptr = pt;
00753         break;
00754         }
00755 
00756       /* If the sequence of hex digits does not end with '}', then we don't
00757       recognize this construct; fall through to the normal \x handling. */
00758       }
00759 
00760     /* Read just a single-byte hex-defined char */
00761 
00762     c = 0;
00763     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
00764       {
00765       int cc;                                  /* Some compilers don't like */
00766       cc = *(++ptr);                           /* ++ in initializers */
00767 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
00768       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
00769       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
00770 #else           /* EBCDIC coding */
00771       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
00772       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
00773 #endif
00774       }
00775     break;
00776 
00777     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
00778     This coding is ASCII-specific, but then the whole concept of \cx is
00779     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
00780 
00781     case CHAR_c:
00782     c = *(++ptr);
00783     if (c == 0)
00784       {
00785       *errorcodeptr = ERR2;
00786       break;
00787       }
00788 
00789 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
00790     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
00791     c ^= 0x40;
00792 #else           /* EBCDIC coding */
00793     if (c >= CHAR_a && c <= CHAR_z) c += 64;
00794     c ^= 0xC0;
00795 #endif
00796     break;
00797 
00798     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
00799     other alphanumeric following \ is an error if PCRE_EXTRA was set;
00800     otherwise, for Perl compatibility, it is a literal. This code looks a bit
00801     odd, but there used to be some cases other than the default, and there may
00802     be again in future, so I haven't "optimized" it. */
00803 
00804     default:
00805     if ((options & PCRE_EXTRA) != 0) switch(c)
00806       {
00807       default:
00808       *errorcodeptr = ERR3;
00809       break;
00810       }
00811     break;
00812     }
00813   }
00814 
00815 *ptrptr = ptr;
00816 return c;
00817 }
00818 
00819 
00820 
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of known property names. The name is either a single character, or a string
enclosed in {} that may start with ^ to request negation. The table is searched
by binary chop using strcmp(), so its names are assumed to be held in strcmp()
order.

Arguments:
  ptrptr         points to the pattern position pointer; on entry it addresses
                   the P or p, on exit the last character that was consumed
  negptr         points to a boolean that is set TRUE if ^ was given, else
                   FALSE (it is left untouched when the pattern ends
                   immediately after the P or p)
  dptr           points to an int that receives the value field of the
                   matching table entry
  errorcodeptr   points to the error code variable; it is set to ERR46 when
                   the sequence is cut short or the name does not fit in the
                   local buffer, and to ERR47 when the name is not in the table

Returns:         the type field of the matching table entry, or -1 on error
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The short form: exactly one character names the property. */

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The long form: a name in {}, optionally preceded by ^ for negation. The
length test comes before the next character is fetched, so a name that fills
the buffer is rejected without looking at what follows it. */

else
  {
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  for (len = 0; ; len++)
    {
    if (len >= (int)sizeof(name) - 1) goto MALFORMED;
    ch = *(++ptr);
    if (ch == 0) goto MALFORMED;
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len] = ch;
    }
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop over the property name table */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0)
    lo = mid + 1;
  else if (cmp < 0)
    hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name was well formed, but it is not in the table */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* The sequence itself could not be read */

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
00910 
00911 
00912 
00913 
00914 /*************************************************
00915 *            Check for counted repeat            *
00916 *************************************************/
00917 
00918 /* This function is called when a '{' is encountered in a place where it might
00919 start a quantifier. It looks ahead to see if it really is a quantifier or not.
00920 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
00921 where the ddds are digits.
00922 
00923 Arguments:
00924   p         pointer to the first char after '{'
00925 
00926 Returns:    TRUE or FALSE
00927 */
00928 
00929 static BOOL
00930 is_counted_repeat(const uschar *p)
00931 {
00932 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
00933 while ((digitab[*p] & ctype_digit) != 0) p++;
00934 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
00935 
00936 if (*p++ != CHAR_COMMA) return FALSE;
00937 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
00938 
00939 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
00940 while ((digitab[*p] & ctype_digit) != 0) p++;
00941 
00942 return (*p == CHAR_RIGHT_CURLY_BRACKET);
00943 }
00944 
00945 
00946 
00947 /*************************************************
00948 *         Read repeat counts                     *
00949 *************************************************/
00950 
00951 /* Read an item of the form {n,m} and return the values. This is called only
00952 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
00953 so the syntax is guaranteed to be correct, but we need to check the values.
00954 
00955 Arguments:
00956   p              pointer to first char after '{'
00957   minp           pointer to int for min
00958   maxp           pointer to int for max
00959                  returned as -1 if no max
00960   errorcodeptr   points to error code variable
00961 
00962 Returns:         pointer to '}' on success;
00963                  current ptr on error, with errorcodeptr set non-zero
00964 */
00965 
00966 static const uschar *
00967 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
00968 {
00969 int min = 0;
00970 int max = -1;
00971 
00972 /* Read the minimum value and do a paranoid check: a negative value indicates
00973 an integer overflow. */
00974 
00975 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
00976 if (min < 0 || min > 65535)
00977   {
00978   *errorcodeptr = ERR5;
00979   return p;
00980   }
00981 
00982 /* Read the maximum value if there is one, and again do a paranoid on its size.
00983 Also, max must not be less than min. */
00984 
00985 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
00986   {
00987   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
00988     {
00989     max = 0;
00990     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
00991     if (max < 0 || max > 65535)
00992       {
00993       *errorcodeptr = ERR5;
00994       return p;
00995       }
00996     if (max < min)
00997       {
00998       *errorcodeptr = ERR4;
00999       return p;
01000       }
01001     }
01002   }
01003 
01004 /* Fill in the required variables, and pass back the pointer to the terminating
01005 '}'. */
01006 
01007 *minp = min;
01008 *maxp = max;
01009 return p;
01010 }
01011 
01012 
01013 
01014 /*************************************************
01015 *  Subroutine for finding forward reference      *
01016 *************************************************/
01017 
01018 /* This recursive function is called only from find_parens() below. The
01019 top-level call starts at the beginning of the pattern. All other calls must
01020 start at a parenthesis. It scans along a pattern's text looking for capturing
01021 subpatterns, and counting them. If it finds a named pattern that matches the
01022 name it is given, it returns its number. Alternatively, if the name is NULL, it
01023 returns when it reaches a given numbered subpattern. We know that if (?P< is
01024 encountered, the name will be terminated by '>' because that is checked in the
01025 first pass. Recursion is used to keep track of subpatterns that reset the
01026 capturing group numbers - the (?| feature.
01027 
01028 Arguments:
01029   ptrptr       address of the current character pointer (updated)
01030   cd           compile background data
01031   name         name to seek, or NULL if seeking a numbered subpattern
01032   lorn         name length, or subpattern number if name is NULL
01033   xmode        TRUE if we are in /x mode
01034   count        pointer to the current capturing subpattern number (updated)
01035 
01036 Returns:       the number of the named subpattern, or -1 if not found
01037 */
01038 
01039 static int
01040 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
01041   BOOL xmode, int *count)
01042 {
01043 uschar *ptr = *ptrptr;
01044 int start_count = *count;
01045 int hwm_count = start_count;
01046 BOOL dup_parens = FALSE;
01047 
01048 /* If the first character is a parenthesis, check on the type of group we are
01049 dealing with. The very first call may not start with a parenthesis. */
01050 
01051 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
01052   {
01053   if (ptr[1] == CHAR_QUESTION_MARK &&
01054       ptr[2] == CHAR_VERTICAL_LINE)
01055     {
01056     ptr += 3;
01057     dup_parens = TRUE;
01058     }
01059 
01060   /* Handle a normal, unnamed capturing parenthesis */
01061 
01062   else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
01063     {
01064     *count += 1;
01065     if (name == NULL && *count == lorn) return *count;
01066     ptr++;
01067     }
01068 
01069   /* Handle a condition. If it is an assertion, just carry on so that it
01070   is processed as normal. If not, skip to the closing parenthesis of the
01071   condition (there can't be any nested parens. */
01072 
01073   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
01074     {
01075     ptr += 2;
01076     if (ptr[1] != CHAR_QUESTION_MARK)
01077       {
01078       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
01079       if (*ptr != 0) ptr++;
01080       }
01081     }
01082 
01083   /* We have either (? or (* and not a condition */
01084 
01085   else
01086     {
01087     ptr += 2;
01088     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
01089 
01090     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
01091 
01092     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
01093         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
01094       {
01095       int term;
01096       const uschar *thisname;
01097       *count += 1;
01098       if (name == NULL && *count == lorn) return *count;
01099       term = *ptr++;
01100       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
01101       thisname = ptr;
01102       while (*ptr != term) ptr++;
01103       if (name != NULL && lorn == ptr - thisname &&
01104           strncmp((const char *)name, (const char *)thisname, lorn) == 0)
01105         return *count;
01106       term++;
01107       }
01108     }
01109   }
01110 
01111 /* Past any initial parenthesis handling, scan for parentheses or vertical
01112 bars. */
01113 
01114 for (; *ptr != 0; ptr++)
01115   {
01116   /* Skip over backslashed characters and also entire \Q...\E */
01117 
01118   if (*ptr == CHAR_BACKSLASH)
01119     {
01120     if (*(++ptr) == 0) goto FAIL_EXIT;
01121     if (*ptr == CHAR_Q) for (;;)
01122       {
01123       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
01124       if (*ptr == 0) goto FAIL_EXIT;
01125       if (*(++ptr) == CHAR_E) break;
01126       }
01127     continue;
01128     }
01129 
01130   /* Skip over character classes; this logic must be similar to the way they
01131   are handled for real. If the first character is '^', skip it. Also, if the
01132   first few characters (either before or after ^) are \Q\E or \E we skip them
01133   too. This makes for compatibility with Perl. Note the use of STR macros to
01134   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
01135 
01136   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
01137     {
01138     BOOL negate_class = FALSE;
01139     for (;;)
01140       {
01141       if (ptr[1] == CHAR_BACKSLASH)
01142         {
01143         if (ptr[2] == CHAR_E)
01144           ptr+= 2;
01145         else if (strncmp((const char *)ptr+2,
01146                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
01147           ptr += 4;
01148         else
01149           break;
01150         }
01151       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
01152         {
01153         negate_class = TRUE;
01154         ptr++;
01155         }
01156       else break;
01157       }
01158 
01159     /* If the next character is ']', it is a data character that must be
01160     skipped, except in JavaScript compatibility mode. */
01161 
01162     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
01163         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
01164       ptr++;
01165 
01166     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
01167       {
01168       if (*ptr == 0) return -1;
01169       if (*ptr == CHAR_BACKSLASH)
01170         {
01171         if (*(++ptr) == 0) goto FAIL_EXIT;
01172         if (*ptr == CHAR_Q) for (;;)
01173           {
01174           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
01175           if (*ptr == 0) goto FAIL_EXIT;
01176           if (*(++ptr) == CHAR_E) break;
01177           }
01178         continue;
01179         }
01180       }
01181     continue;
01182     }
01183 
01184   /* Skip comments in /x mode */
01185 
01186   if (xmode && *ptr == CHAR_NUMBER_SIGN)
01187     {
01188     while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
01189     if (*ptr == 0) goto FAIL_EXIT;
01190     continue;
01191     }
01192 
01193   /* Check for the special metacharacters */
01194 
01195   if (*ptr == CHAR_LEFT_PARENTHESIS)
01196     {
01197     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
01198     if (rc > 0) return rc;
01199     if (*ptr == 0) goto FAIL_EXIT;
01200     }
01201 
01202   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
01203     {
01204     if (dup_parens && *count < hwm_count) *count = hwm_count;
01205     *ptrptr = ptr;
01206     return -1;
01207     }
01208 
01209   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
01210     {
01211     if (*count > hwm_count) hwm_count = *count;
01212     *count = start_count;
01213     }
01214   }
01215 
01216 FAIL_EXIT:
01217 *ptrptr = ptr;
01218 return -1;
01219 }
01220 
01221 
01222 
01223 
01224 /*************************************************
01225 *       Find forward referenced subpattern       *
01226 *************************************************/
01227 
01228 /* This function scans along a pattern's text looking for capturing
01229 subpatterns, and counting them. If it finds a named pattern that matches the
01230 name it is given, it returns its number. Alternatively, if the name is NULL, it
01231 returns when it reaches a given numbered subpattern. This is used for forward
01232 references to subpatterns. We used to be able to start this scan from the
01233 current compiling point, using the current count value from cd->bracount, and
01234 do it all in a single loop, but the addition of the possibility of duplicate
01235 subpattern numbers means that we have to scan from the very start, in order to
01236 take account of such duplicates, and to use a recursive function to keep track
01237 of the different types of group.
01238 
01239 Arguments:
01240   cd           compile background data
01241   name         name to seek, or NULL if seeking a numbered subpattern
01242   lorn         name length, or subpattern number if name is NULL
01243   xmode        TRUE if we are in /x mode
01244 
01245 Returns:       the number of the found subpattern, or -1 if not found
01246 */
01247 
01248 static int
01249 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
01250 {
01251 uschar *ptr = (uschar *)cd->start_pattern;
01252 int count = 0;
01253 int rc;
01254 
01255 /* If the pattern does not start with an opening parenthesis, the first call
01256 to find_parens_sub() will scan right to the end (if necessary). However, if it
01257 does start with a parenthesis, find_parens_sub() will return when it hits the
01258 matching closing parens. That is why we have to have a loop. */
01259 
01260 for (;;)
01261   {
01262   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
01263   if (rc > 0 || *ptr++ == 0) break;
01264   }
01265 
01266 return rc;
01267 }
01268 
01269 
01270 
01271 
01272 /*************************************************
01273 *      Find first significant op code            *
01274 *************************************************/
01275 
01276 /* This is called by several functions that scan a compiled expression looking
01277 for a fixed first character, or an anchoring op code etc. It skips over things
01278 that do not influence this. For some calls, a change of option is important.
01279 For some calls, it makes sense to skip negative forward and all backward
01280 assertions, and also the \b assertion; for others it does not.
01281 
01282 Arguments:
01283   code         pointer to the start of the group
01284   options      pointer to external options
01285   optbit       the option bit whose changing is significant, or
01286                  zero if none are
01287   skipassert   TRUE if certain assertions are to be skipped
01288 
01289 Returns:       pointer to the first significant opcode
01290 */
01291 
01292 static const uschar*
01293 first_significant_code(const uschar *code, int *options, int optbit,
01294   BOOL skipassert)
01295 {
01296 for (;;)
01297   {
01298   switch ((int)*code)
01299     {
01300     case OP_OPT:
01301     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
01302       *options = (int)code[1];
01303     code += 2;
01304     break;
01305 
01306     case OP_ASSERT_NOT:
01307     case OP_ASSERTBACK:
01308     case OP_ASSERTBACK_NOT:
01309     if (!skipassert) return code;
01310     do code += GET(code, 1); while (*code == OP_ALT);
01311     code += _pcre_OP_lengths[*code];
01312     break;
01313 
01314     case OP_WORD_BOUNDARY:
01315     case OP_NOT_WORD_BOUNDARY:
01316     if (!skipassert) return code;
01317     /* Fall through */
01318 
01319     case OP_CALLOUT:
01320     case OP_CREF:
01321     case OP_NCREF:
01322     case OP_RREF:
01323     case OP_NRREF:
01324     case OP_DEF:
01325     code += _pcre_OP_lengths[*code];
01326     break;
01327 
01328     default:
01329     return code;
01330     }
01331   }
01332 /* Control never reaches here */
01333 }
01334 
01335 
01336 
01337 
01338 /*************************************************
01339 *        Find the fixed length of a branch       *
01340 *************************************************/
01341 
01342 /* Scan a branch and compute the fixed length of subject that will match it,
01343 if the length is fixed. This is needed for dealing with backward assertions.
01344 In UTF8 mode, the result is in characters rather than bytes. The branch is
01345 temporarily terminated with OP_END when this function is called.
01346 
01347 This function is called when a backward assertion is encountered, so that if it
01348 fails, the error message can point to the correct place in the pattern.
01349 However, we cannot do this when the assertion contains subroutine calls,
01350 because they can be forward references. We solve this by remembering this case
01351 and doing the check at the end; a flag specifies which mode we are running in.
01352 
01353 Arguments:
01354   code     points to the start of the pattern (the bracket)
01355   options  the compiling options
01356   atend    TRUE if called when the pattern is complete
01357   cd       the "compile data" structure
01358 
01359 Returns:   the fixed length,
01360              or -1 if there is no fixed length,
01361              or -2 if \C was encountered
01362              or -3 if an OP_RECURSE item was encountered and atend is FALSE
01363 */
01364 
01365 static int
01366 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
01367 {
01368 int length = -1;
01369 
01370 register int branchlength = 0;
01371 register uschar *cc = code + 1 + LINK_SIZE;
01372 
01373 /* Scan along the opcodes for this branch. If we get to the end of the
01374 branch, check the length against that of the other branches. */
01375 
01376 for (;;)
01377   {
01378   int d;
01379   uschar *ce, *cs;
01380   register int op = *cc;
01381   switch (op)
01382     {
01383     case OP_CBRA:
01384     case OP_BRA:
01385     case OP_ONCE:
01386     case OP_COND:
01387     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
01388     if (d < 0) return d;
01389     branchlength += d;
01390     do cc += GET(cc, 1); while (*cc == OP_ALT);
01391     cc += 1 + LINK_SIZE;
01392     break;
01393 
01394     /* Reached end of a branch; if it's a ket it is the end of a nested
01395     call. If it's ALT it is an alternation in a nested call. If it is
01396     END it's the end of the outer call. All can be handled by the same code. */
01397 
01398     case OP_ALT:
01399     case OP_KET:
01400     case OP_KETRMAX:
01401     case OP_KETRMIN:
01402     case OP_END:
01403     if (length < 0) length = branchlength;
01404       else if (length != branchlength) return -1;
01405     if (*cc != OP_ALT) return length;
01406     cc += 1 + LINK_SIZE;
01407     branchlength = 0;
01408     break;
01409 
01410     /* A true recursion implies not fixed length, but a subroutine call may
01411     be OK. If the subroutine is a forward reference, we can't deal with
01412     it until the end of the pattern, so return -3. */
01413 
01414     case OP_RECURSE:
01415     if (!atend) return -3;
01416     cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
01417     do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
01418     if (cc > cs && cc < ce) return -1;                /* Recursion */
01419     d = find_fixedlength(cs + 2, options, atend, cd);
01420     if (d < 0) return d;
01421     branchlength += d;
01422     cc += 1 + LINK_SIZE;
01423     break;
01424 
01425     /* Skip over assertive subpatterns */
01426 
01427     case OP_ASSERT:
01428     case OP_ASSERT_NOT:
01429     case OP_ASSERTBACK:
01430     case OP_ASSERTBACK_NOT:
01431     do cc += GET(cc, 1); while (*cc == OP_ALT);
01432     /* Fall through */
01433 
01434     /* Skip over things that don't match chars */
01435 
01436     case OP_REVERSE:
01437     case OP_CREF:
01438     case OP_NCREF:
01439     case OP_RREF:
01440     case OP_NRREF:
01441     case OP_DEF:
01442     case OP_OPT:
01443     case OP_CALLOUT:
01444     case OP_SOD:
01445     case OP_SOM:
01446     case OP_EOD:
01447     case OP_EODN:
01448     case OP_CIRC:
01449     case OP_DOLL:
01450     case OP_NOT_WORD_BOUNDARY:
01451     case OP_WORD_BOUNDARY:
01452     cc += _pcre_OP_lengths[*cc];
01453     break;
01454 
01455     /* Handle literal characters */
01456 
01457     case OP_CHAR:
01458     case OP_CHARNC:
01459     case OP_NOT:
01460     branchlength++;
01461     cc += 2;
01462 #ifdef SUPPORT_UTF8
01463     if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
01464       cc += _pcre_utf8_table4[cc[-1] & 0x3f];
01465 #endif
01466     break;
01467 
01468     /* Handle exact repetitions. The count is already in characters, but we
01469     need to skip over a multibyte character in UTF8 mode.  */
01470 
01471     case OP_EXACT:
01472     branchlength += GET2(cc,1);
01473     cc += 4;
01474 #ifdef SUPPORT_UTF8
01475     if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
01476       cc += _pcre_utf8_table4[cc[-1] & 0x3f];
01477 #endif
01478     break;
01479 
01480     case OP_TYPEEXACT:
01481     branchlength += GET2(cc,1);
01482     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
01483     cc += 4;
01484     break;
01485 
01486     /* Handle single-char matchers */
01487 
01488     case OP_PROP:
01489     case OP_NOTPROP:
01490     cc += 2;
01491     /* Fall through */
01492 
01493     case OP_NOT_DIGIT:
01494     case OP_DIGIT:
01495     case OP_NOT_WHITESPACE:
01496     case OP_WHITESPACE:
01497     case OP_NOT_WORDCHAR:
01498     case OP_WORDCHAR:
01499     case OP_ANY:
01500     case OP_ALLANY:
01501     branchlength++;
01502     cc++;
01503     break;
01504 
01505     /* The single-byte matcher isn't allowed */
01506 
01507     case OP_ANYBYTE:
01508     return -2;
01509 
01510     /* Check a class for variable quantification */
01511 
01512 #ifdef SUPPORT_UTF8
01513     case OP_XCLASS:
01514     cc += GET(cc, 1) - 33;
01515     /* Fall through */
01516 #endif
01517 
01518     case OP_CLASS:
01519     case OP_NCLASS:
01520     cc += 33;
01521 
01522     switch (*cc)
01523       {
01524       case OP_CRSTAR:
01525       case OP_CRMINSTAR:
01526       case OP_CRQUERY:
01527       case OP_CRMINQUERY:
01528       return -1;
01529 
01530       case OP_CRRANGE:
01531       case OP_CRMINRANGE:
01532       if (GET2(cc,1) != GET2(cc,3)) return -1;
01533       branchlength += GET2(cc,1);
01534       cc += 5;
01535       break;
01536 
01537       default:
01538       branchlength++;
01539       }
01540     break;
01541 
01542     /* Anything else is variable length */
01543 
01544     default:
01545     return -1;
01546     }
01547   }
01548 /* Control never gets here */
01549 }
01550 
01551 
01552 
01553 
01554 /*************************************************
01555 *    Scan compiled regex for specific bracket    *
01556 *************************************************/
01557 
01558 /* This little function scans through a compiled pattern until it finds a
01559 capturing bracket with the given number, or, if the number is negative, an
01560 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
01561 so that it can be called from pcre_study() when finding the minimum matching
01562 length.
01563 
01564 Arguments:
01565   code        points to start of expression
01566   utf8        TRUE in UTF-8 mode
01567   number      the required bracket number or negative to find a lookbehind
01568 
01569 Returns:      pointer to the opcode for the bracket, or NULL if not found
01570 */
01571 
01572 const uschar *
01573 _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
01574 {
01575 for (;;)
01576   {
01577   register int c = *code;
01578   if (c == OP_END) return NULL;
01579 
01580   /* XCLASS is used for classes that cannot be represented just by a bit
01581   map. This includes negated single high-valued characters. The length in
01582   the table is zero; the actual length is stored in the compiled code. */
01583 
01584   if (c == OP_XCLASS) code += GET(code, 1);
01585 
01586   /* Handle recursion */
01587 
01588   else if (c == OP_REVERSE)
01589     {
01590     if (number < 0) return (uschar *)code;
01591     code += _pcre_OP_lengths[c];
01592     }
01593 
01594   /* Handle capturing bracket */
01595 
01596   else if (c == OP_CBRA)
01597     {
01598     int n = GET2(code, 1+LINK_SIZE);
01599     if (n == number) return (uschar *)code;
01600     code += _pcre_OP_lengths[c];
01601     }
01602 
01603   /* Otherwise, we can get the item's length from the table, except that for
01604   repeated character types, we have to test for \p and \P, which have an extra
01605   two bytes of parameters. */
01606 
01607   else
01608     {
01609     switch(c)
01610       {
01611       case OP_TYPESTAR:
01612       case OP_TYPEMINSTAR:
01613       case OP_TYPEPLUS:
01614       case OP_TYPEMINPLUS:
01615       case OP_TYPEQUERY:
01616       case OP_TYPEMINQUERY:
01617       case OP_TYPEPOSSTAR:
01618       case OP_TYPEPOSPLUS:
01619       case OP_TYPEPOSQUERY:
01620       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
01621       break;
01622 
01623       case OP_TYPEUPTO:
01624       case OP_TYPEMINUPTO:
01625       case OP_TYPEEXACT:
01626       case OP_TYPEPOSUPTO:
01627       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
01628       break;
01629       }
01630 
01631     /* Add in the fixed length from the table */
01632 
01633     code += _pcre_OP_lengths[c];
01634 
01635   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
01636   a multi-byte character. The length in the table is a minimum, so we have to
01637   arrange to skip the extra bytes. */
01638 
01639 #ifdef SUPPORT_UTF8
01640     if (utf8) switch(c)
01641       {
01642       case OP_CHAR:
01643       case OP_CHARNC:
01644       case OP_EXACT:
01645       case OP_UPTO:
01646       case OP_MINUPTO:
01647       case OP_POSUPTO:
01648       case OP_STAR:
01649       case OP_MINSTAR:
01650       case OP_POSSTAR:
01651       case OP_PLUS:
01652       case OP_MINPLUS:
01653       case OP_POSPLUS:
01654       case OP_QUERY:
01655       case OP_MINQUERY:
01656       case OP_POSQUERY:
01657       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
01658       break;
01659       }
01660 #else
01661     (void)(utf8);  /* Keep compiler happy by referencing function argument */
01662 #endif
01663     }
01664   }
01665 }
01666 
01667 
01668 
01669 /*************************************************
01670 *   Scan compiled regex for recursion reference  *
01671 *************************************************/
01672 
01673 /* This little function scans through a compiled pattern until it finds an
01674 instance of OP_RECURSE.
01675 
01676 Arguments:
01677   code        points to start of expression
01678   utf8        TRUE in UTF-8 mode
01679 
01680 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
01681 */
01682 
01683 static const uschar *
01684 find_recurse(const uschar *code, BOOL utf8)
01685 {
01686 for (;;)
01687   {
01688   register int c = *code;
01689   if (c == OP_END) return NULL;
01690   if (c == OP_RECURSE) return code;
01691 
01692   /* XCLASS is used for classes that cannot be represented just by a bit
01693   map. This includes negated single high-valued characters. The length in
01694   the table is zero; the actual length is stored in the compiled code. */
01695 
01696   if (c == OP_XCLASS) code += GET(code, 1);
01697 
01698   /* Otherwise, we can get the item's length from the table, except that for
01699   repeated character types, we have to test for \p and \P, which have an extra
01700   two bytes of parameters. */
01701 
01702   else
01703     {
01704     switch(c)
01705       {
01706       case OP_TYPESTAR:
01707       case OP_TYPEMINSTAR:
01708       case OP_TYPEPLUS:
01709       case OP_TYPEMINPLUS:
01710       case OP_TYPEQUERY:
01711       case OP_TYPEMINQUERY:
01712       case OP_TYPEPOSSTAR:
01713       case OP_TYPEPOSPLUS:
01714       case OP_TYPEPOSQUERY:
01715       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
01716       break;
01717 
01718       case OP_TYPEPOSUPTO:
01719       case OP_TYPEUPTO:
01720       case OP_TYPEMINUPTO:
01721       case OP_TYPEEXACT:
01722       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
01723       break;
01724       }
01725 
01726     /* Add in the fixed length from the table */
01727 
01728     code += _pcre_OP_lengths[c];
01729 
01730     /* In UTF-8 mode, opcodes that are followed by a character may be followed
01731     by a multi-byte character. The length in the table is a minimum, so we have
01732     to arrange to skip the extra bytes. */
01733 
01734 #ifdef SUPPORT_UTF8
01735     if (utf8) switch(c)
01736       {
01737       case OP_CHAR:
01738       case OP_CHARNC:
01739       case OP_EXACT:
01740       case OP_UPTO:
01741       case OP_MINUPTO:
01742       case OP_POSUPTO:
01743       case OP_STAR:
01744       case OP_MINSTAR:
01745       case OP_POSSTAR:
01746       case OP_PLUS:
01747       case OP_MINPLUS:
01748       case OP_POSPLUS:
01749       case OP_QUERY:
01750       case OP_MINQUERY:
01751       case OP_POSQUERY:
01752       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
01753       break;
01754       }
01755 #else
01756     (void)(utf8);  /* Keep compiler happy by referencing function argument */
01757 #endif
01758     }
01759   }
01760 }
01761 
01762 
01763 
01764 /*************************************************
01765 *    Scan compiled branch for non-emptiness      *
01766 *************************************************/
01767 
01768 /* This function scans through a branch of a compiled pattern to see whether it
01769 can match the empty string or not. It is called from could_be_empty()
01770 below and from compile_branch() when checking for an unlimited repeat of a
01771 group that can match nothing. Note that first_significant_code() skips over
01772 backward and negative forward assertions when its final argument is TRUE. If we
01773 hit an unclosed bracket, we return "empty" - this means we've struck an inner
01774 bracket whose current branch will already have been scanned.
01775 
01776 Arguments:
01777   code        points to start of search
01778   endcode     points to where to stop
01779   utf8        TRUE if in UTF8 mode
01780 
01781 Returns:      TRUE if what is matched could be empty
01782 */
01783 
01784 static BOOL
01785 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
01786 {
01787 register int c;
01788 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
01789      code < endcode;
01790      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
01791   {
01792   const uschar *ccode;
01793 
01794   c = *code;
01795 
01796   /* Skip over forward assertions; the other assertions are skipped by
01797   first_significant_code() with a TRUE final argument. */
01798 
01799   if (c == OP_ASSERT)
01800     {
01801     do code += GET(code, 1); while (*code == OP_ALT);
01802     c = *code;
01803     continue;
01804     }
01805 
01806   /* Groups with zero repeats can of course be empty; skip them. */
01807 
01808   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
01809     {
01810     code += _pcre_OP_lengths[c];
01811     do code += GET(code, 1); while (*code == OP_ALT);
01812     c = *code;
01813     continue;
01814     }
01815 
01816   /* For other groups, scan the branches. */
01817 
01818   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
01819     {
01820     BOOL empty_branch;
01821     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
01822 
01823     /* If a conditional group has only one branch, there is a second, implied,
01824     empty branch, so just skip over the conditional, because it could be empty.
01825     Otherwise, scan the individual branches of the group. */
01826 
01827     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
01828       code += GET(code, 1);
01829     else
01830       {
01831       empty_branch = FALSE;
01832       do
01833         {
01834         if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
01835           empty_branch = TRUE;
01836         code += GET(code, 1);
01837         }
01838       while (*code == OP_ALT);
01839       if (!empty_branch) return FALSE;   /* All branches are non-empty */
01840       }
01841 
01842     c = *code;
01843     continue;
01844     }
01845 
01846   /* Handle the other opcodes */
01847 
01848   switch (c)
01849     {
01850     /* Check for quantifiers after a class. XCLASS is used for classes that
01851     cannot be represented just by a bit map. This includes negated single
01852     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
01853     actual length is stored in the compiled code, so we must update "code"
01854     here. */
01855 
01856 #ifdef SUPPORT_UTF8
01857     case OP_XCLASS:
01858     ccode = code += GET(code, 1);
01859     goto CHECK_CLASS_REPEAT;
01860 #endif
01861 
01862     case OP_CLASS:
01863     case OP_NCLASS:
01864     ccode = code + 33;
01865 
01866 #ifdef SUPPORT_UTF8
01867     CHECK_CLASS_REPEAT:
01868 #endif
01869 
01870     switch (*ccode)
01871       {
01872       case OP_CRSTAR:            /* These could be empty; continue */
01873       case OP_CRMINSTAR:
01874       case OP_CRQUERY:
01875       case OP_CRMINQUERY:
01876       break;
01877 
01878       default:                   /* Non-repeat => class must match */
01879       case OP_CRPLUS:            /* These repeats aren't empty */
01880       case OP_CRMINPLUS:
01881       return FALSE;
01882 
01883       case OP_CRRANGE:
01884       case OP_CRMINRANGE:
01885       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
01886       break;
01887       }
01888     break;
01889 
01890     /* Opcodes that must match a character */
01891 
01892     case OP_PROP:
01893     case OP_NOTPROP:
01894     case OP_EXTUNI:
01895     case OP_NOT_DIGIT:
01896     case OP_DIGIT:
01897     case OP_NOT_WHITESPACE:
01898     case OP_WHITESPACE:
01899     case OP_NOT_WORDCHAR:
01900     case OP_WORDCHAR:
01901     case OP_ANY:
01902     case OP_ALLANY:
01903     case OP_ANYBYTE:
01904     case OP_CHAR:
01905     case OP_CHARNC:
01906     case OP_NOT:
01907     case OP_PLUS:
01908     case OP_MINPLUS:
01909     case OP_POSPLUS:
01910     case OP_EXACT:
01911     case OP_NOTPLUS:
01912     case OP_NOTMINPLUS:
01913     case OP_NOTPOSPLUS:
01914     case OP_NOTEXACT:
01915     case OP_TYPEPLUS:
01916     case OP_TYPEMINPLUS:
01917     case OP_TYPEPOSPLUS:
01918     case OP_TYPEEXACT:
01919     return FALSE;
01920 
01921     /* These are going to continue, as they may be empty, but we have to
01922     fudge the length for the \p and \P cases. */
01923 
01924     case OP_TYPESTAR:
01925     case OP_TYPEMINSTAR:
01926     case OP_TYPEPOSSTAR:
01927     case OP_TYPEQUERY:
01928     case OP_TYPEMINQUERY:
01929     case OP_TYPEPOSQUERY:
01930     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
01931     break;
01932 
01933     /* Same for these */
01934 
01935     case OP_TYPEUPTO:
01936     case OP_TYPEMINUPTO:
01937     case OP_TYPEPOSUPTO:
01938     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
01939     break;
01940 
01941     /* End of branch */
01942 
01943     case OP_KET:
01944     case OP_KETRMAX:
01945     case OP_KETRMIN:
01946     case OP_ALT:
01947     return TRUE;
01948 
01949     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
01950     MINUPTO, and POSUPTO may be followed by a multibyte character */
01951 
01952 #ifdef SUPPORT_UTF8
01953     case OP_STAR:
01954     case OP_MINSTAR:
01955     case OP_POSSTAR:
01956     case OP_QUERY:
01957     case OP_MINQUERY:
01958     case OP_POSQUERY:
01959     if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
01960     break;
01961 
01962     case OP_UPTO:
01963     case OP_MINUPTO:
01964     case OP_POSUPTO:
01965     if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
01966     break;
01967 #endif
01968     }
01969   }
01970 
01971 return TRUE;
01972 }
01973 
01974 
01975 
01976 /*************************************************
01977 *    Scan compiled regex for non-emptiness       *
01978 *************************************************/
01979 
01980 /* This function is called to check for left recursive calls. We want to check
01981 the current branch of the current pattern to see if it could match the empty
01982 string. If it could, we must look outwards for branches at other levels,
01983 stopping when we pass beyond the bracket which is the subject of the recursion.
01984 
01985 Arguments:
01986   code        points to start of the recursion
01987   endcode     points to where to stop (current RECURSE item)
01988   bcptr       points to the chain of current (unclosed) branch starts
01989   utf8        TRUE if in UTF-8 mode
01990 
01991 Returns:      TRUE if what is matched could be empty
01992 */
01993 
01994 static BOOL
01995 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
01996   BOOL utf8)
01997 {
01998 while (bcptr != NULL && bcptr->current >= code)
01999   {
02000   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
02001   bcptr = bcptr->outer;
02002   }
02003 return TRUE;
02004 }
02005 
02006 
02007 
02008 /*************************************************
02009 *           Check for POSIX class syntax         *
02010 *************************************************/
02011 
02012 /* This function is called when the sequence "[:" or "[." or "[=" is
02013 encountered in a character class. It checks whether this is followed by a
02014 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
02015 reach an unescaped ']' without the special preceding character, return FALSE.
02016 
02017 Originally, this function only recognized a sequence of letters between the
02018 terminators, but it seems that Perl recognizes any sequence of characters,
02019 though of course unknown POSIX names are subsequently rejected. Perl gives an
02020 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
02021 didn't consider this to be a POSIX class. Likewise for [:1234:].
02022 
02023 The problem in trying to be exactly like Perl is in the handling of escapes. We
02024 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
02025 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
02026 below handles the special case of \], but does not try to do any other escape
02027 processing. This makes it different from Perl for cases such as [:l\ower:]
02028 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
02029 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
02030 I think.
02031 
02032 Arguments:
02033   ptr      pointer to the initial [
02034   endptr   where to return the end pointer
02035 
02036 Returns:   TRUE or FALSE
02037 */
02038 
02039 static BOOL
02040 check_posix_syntax(const uschar *ptr, const uschar **endptr)
02041 {
02042 int terminator;          /* Don't combine these lines; the Solaris cc */
02043 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
02044 for (++ptr; *ptr != 0; ptr++)
02045   {
02046   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
02047     {
02048     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
02049     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
02050       {
02051       *endptr = ptr;
02052       return TRUE;
02053       }
02054     }
02055   }
02056 return FALSE;
02057 }
02058 
02059 
02060 
02061 
02062 /*************************************************
02063 *          Check POSIX class name                *
02064 *************************************************/
02065 
02066 /* This function is called to check the name given in a POSIX-style class entry
02067 such as [:alnum:].
02068 
02069 Arguments:
02070   ptr        points to the first letter
02071   len        the length of the name
02072 
02073 Returns:     a value representing the name, or -1 if unknown
02074 */
02075 
02076 static int
02077 check_posix_name(const uschar *ptr, int len)
02078 {
02079 const char *pn = posix_names;
02080 register int yield = 0;
02081 while (posix_name_lengths[yield] != 0)
02082   {
02083   if (len == posix_name_lengths[yield] &&
02084     strncmp((const char *)ptr, pn, len) == 0) return yield;
02085   pn += posix_name_lengths[yield] + 1;
02086   yield++;
02087   }
02088 return -1;
02089 }
02090 
02091 
02092 /*************************************************
02093 *    Adjust OP_RECURSE items in repeated group   *
02094 *************************************************/
02095 
02096 /* OP_RECURSE items contain an offset from the start of the regex to the group
02097 that is referenced. This means that groups can be replicated for fixed
02098 repetition simply by copying (because the recursion is allowed to refer to
02099 earlier groups that are outside the current group). However, when a group is
02100 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
02101 inserted before it, after it has been compiled. This means that any OP_RECURSE
02102 items within it that refer to the group itself or any contained groups have to
02103 have their offsets adjusted. That is one of the jobs of this function. Before it
02104 is called, the partially compiled regex must be temporarily terminated with
02105 OP_END.
02106 
02107 This function has been extended with the possibility of forward references for
02108 recursions and subroutine calls. It must also check the list of such references
02109 for the group we are dealing with. If it finds that one of the recursions in
02110 the current group is on this list, it adjusts the offset in the list, not the
02111 value in the reference (which is a group number).
02112 
02113 Arguments:
02114   group      points to the start of the group
02115   adjust     the amount by which the group is to be moved
02116   utf8       TRUE in UTF-8 mode
02117   cd         contains pointers to tables etc.
02118   save_hwm   the hwm forward reference pointer at the start of the group
02119 
02120 Returns:     nothing
02121 */
02122 
02123 static void
02124 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
02125   uschar *save_hwm)
02126 {
02127 uschar *ptr = group;
02128 
02129 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
02130   {
02131   int offset;
02132   uschar *hc;
02133 
02134   /* See if this recursion is on the forward reference list. If so, adjust the
02135   reference. */
02136 
02137   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
02138     {
02139     offset = GET(hc, 0);
02140     if (cd->start_code + offset == ptr + 1)
02141       {
02142       PUT(hc, 0, offset + adjust);
02143       break;
02144       }
02145     }
02146 
02147   /* Otherwise, adjust the recursion offset if it's after the start of this
02148   group. */
02149 
02150   if (hc >= cd->hwm)
02151     {
02152     offset = GET(ptr, 1);
02153     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
02154     }
02155 
02156   ptr += 1 + LINK_SIZE;
02157   }
02158 }
02159 
02160 
02161 
02162 /*************************************************
02163 *        Insert an automatic callout point       *
02164 *************************************************/
02165 
02166 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
02167 callout points before each pattern item.
02168 
02169 Arguments:
02170   code           current code pointer
02171   ptr            current pattern pointer
02172   cd             pointers to tables etc
02173 
02174 Returns:         new code pointer
02175 */
02176 
02177 static uschar *
02178 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
02179 {
02180 *code++ = OP_CALLOUT;
02181 *code++ = 255;
02182 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
02183 PUT(code, LINK_SIZE, 0);                /* Default length */
02184 return code + 2*LINK_SIZE;
02185 }
02186 
02187 
02188 
02189 /*************************************************
02190 *         Complete a callout item                *
02191 *************************************************/
02192 
02193 /* A callout item contains the length of the next item in the pattern, which
02194 we can't fill in till after we have reached the relevant point. This is used
02195 for both automatic and manual callouts.
02196 
02197 Arguments:
02198   previous_callout   points to previous callout item
02199   ptr                current pattern pointer
02200   cd                 pointers to tables etc
02201 
02202 Returns:             nothing
02203 */
02204 
02205 static void
02206 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
02207 {
02208 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
02209 PUT(previous_callout, 2 + LINK_SIZE, length);
02210 }
02211 
02212 
02213 
02214 #ifdef SUPPORT_UCP
02215 /*************************************************
02216 *           Get othercase range                  *
02217 *************************************************/
02218 
02219 /* This function is passed the start and end of a class range, in UTF-8 mode
02220 with UCP support. It searches up the characters, looking for internal ranges of
02221 characters in the "other" case. Each call returns the next one, updating the
02222 start address.
02223 
02224 Arguments:
02225   cptr        points to starting character value; updated
02226   d           end value
02227   ocptr       where to put start of othercase range
02228   odptr       where to put end of othercase range
02229 
02230 Yield:        TRUE when range returned; FALSE when no more
02231 */
02232 
02233 static BOOL
02234 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
02235   unsigned int *odptr)
02236 {
02237 unsigned int c, othercase, next;
02238 
02239 for (c = *cptr; c <= d; c++)
02240   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
02241 
02242 if (c > d) return FALSE;
02243 
02244 *ocptr = othercase;
02245 next = othercase + 1;
02246 
02247 for (++c; c <= d; c++)
02248   {
02249   if (UCD_OTHERCASE(c) != next) break;
02250   next++;
02251   }
02252 
02253 *odptr = next - 1;
02254 *cptr = c;
02255 
02256 return TRUE;
02257 }
02258 #endif  /* SUPPORT_UCP */
02259 
02260 
02261 
02262 /*************************************************
02263 *     Check if auto-possessifying is possible    *
02264 *************************************************/
02265 
02266 /* This function is called for unlimited repeats of certain items, to see
02267 whether the next thing could possibly match the repeated item. If not, it makes
02268 sense to automatically possessify the repeated item.
02269 
02270 Arguments:
02271   op_code       the repeated op code
02272   this          data for this item, depends on the opcode
02273   utf8          TRUE in UTF-8 mode
02274   utf8_char     used for utf8 character bytes, NULL if not relevant
02275   ptr           next character in pattern
02276   options       options bits
02277   cd            contains pointers to tables etc.
02278 
02279 Returns:        TRUE if possessifying is wanted
02280 */
02281 
02282 static BOOL
02283 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
02284   const uschar *ptr, int options, compile_data *cd)
02285 {
02286 int next;
02287 
02288 /* Skip whitespace and comments in extended mode */
02289 
02290 if ((options & PCRE_EXTENDED) != 0)
02291   {
02292   for (;;)
02293     {
02294     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
02295     if (*ptr == CHAR_NUMBER_SIGN)
02296       {
02297       while (*(++ptr) != 0)
02298         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
02299       }
02300     else break;
02301     }
02302   }
02303 
02304 /* If the next item is one that we can handle, get its value. A non-negative
02305 value is a character, a negative value is an escape value. */
02306 
02307 if (*ptr == CHAR_BACKSLASH)
02308   {
02309   int temperrorcode = 0;
02310   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
02311   if (temperrorcode != 0) return FALSE;
02312   ptr++;    /* Point after the escape sequence */
02313   }
02314 
02315 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
02316   {
02317 #ifdef SUPPORT_UTF8
02318   if (utf8) { GETCHARINC(next, ptr); } else
02319 #endif
02320   next = *ptr++;
02321   }
02322 
02323 else return FALSE;
02324 
02325 /* Skip whitespace and comments in extended mode */
02326 
02327 if ((options & PCRE_EXTENDED) != 0)
02328   {
02329   for (;;)
02330     {
02331     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
02332     if (*ptr == CHAR_NUMBER_SIGN)
02333       {
02334       while (*(++ptr) != 0)
02335         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
02336       }
02337     else break;
02338     }
02339   }
02340 
02341 /* If the next thing is itself optional, we have to give up. */
02342 
02343 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
02344   strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
02345     return FALSE;
02346 
02347 /* Now compare the next item with the previous opcode. If the previous is a
02348 positive single character match, "item" either contains the character or, if
02349 "item" is greater than 127 in utf8 mode, the character's bytes are in
02350 utf8_char. */
02351 
02352 
02353 /* Handle cases when the next item is a character. */
02354 
02355 if (next >= 0) switch(op_code)
02356   {
02357   case OP_CHAR:
02358 #ifdef SUPPORT_UTF8
02359   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
02360 #else
02361   (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
02362 #endif
02363   return item != next;
02364 
02365   /* For CHARNC (caseless character) we must check the other case. If we have
02366   Unicode property support, we can use it to test the other case of
02367   high-valued characters. */
02368 
02369   case OP_CHARNC:
02370 #ifdef SUPPORT_UTF8
02371   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
02372 #endif
02373   if (item == next) return FALSE;
02374 #ifdef SUPPORT_UTF8
02375   if (utf8)
02376     {
02377     unsigned int othercase;
02378     if (next < 128) othercase = cd->fcc[next]; else
02379 #ifdef SUPPORT_UCP
02380     othercase = UCD_OTHERCASE((unsigned int)next);
02381 #else
02382     othercase = NOTACHAR;
02383 #endif
02384     return (unsigned int)item != othercase;
02385     }
02386   else
02387 #endif  /* SUPPORT_UTF8 */
02388   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
02389 
02390   /* For OP_NOT, "item" must be a single-byte character. */
02391 
02392   case OP_NOT:
02393   if (item == next) return TRUE;
02394   if ((options & PCRE_CASELESS) == 0) return FALSE;
02395 #ifdef SUPPORT_UTF8
02396   if (utf8)
02397     {
02398     unsigned int othercase;
02399     if (next < 128) othercase = cd->fcc[next]; else
02400 #ifdef SUPPORT_UCP
02401     othercase = UCD_OTHERCASE(next);
02402 #else
02403     othercase = NOTACHAR;
02404 #endif
02405     return (unsigned int)item == othercase;
02406     }
02407   else
02408 #endif  /* SUPPORT_UTF8 */
02409   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
02410 
02411   case OP_DIGIT:
02412   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
02413 
02414   case OP_NOT_DIGIT:
02415   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
02416 
02417   case OP_WHITESPACE:
02418   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
02419 
02420   case OP_NOT_WHITESPACE:
02421   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
02422 
02423   case OP_WORDCHAR:
02424   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
02425 
02426   case OP_NOT_WORDCHAR:
02427   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
02428 
02429   case OP_HSPACE:
02430   case OP_NOT_HSPACE:
02431   switch(next)
02432     {
02433     case 0x09:
02434     case 0x20:
02435     case 0xa0:
02436     case 0x1680:
02437     case 0x180e:
02438     case 0x2000:
02439     case 0x2001:
02440     case 0x2002:
02441     case 0x2003:
02442     case 0x2004:
02443     case 0x2005:
02444     case 0x2006:
02445     case 0x2007:
02446     case 0x2008:
02447     case 0x2009:
02448     case 0x200A:
02449     case 0x202f:
02450     case 0x205f:
02451     case 0x3000:
02452     return op_code != OP_HSPACE;
02453     default:
02454     return op_code == OP_HSPACE;
02455     }
02456 
02457   case OP_VSPACE:
02458   case OP_NOT_VSPACE:
02459   switch(next)
02460     {
02461     case 0x0a:
02462     case 0x0b:
02463     case 0x0c:
02464     case 0x0d:
02465     case 0x85:
02466     case 0x2028:
02467     case 0x2029:
02468     return op_code != OP_VSPACE;
02469     default:
02470     return op_code == OP_VSPACE;
02471     }
02472 
02473   default:
02474   return FALSE;
02475   }
02476 
02477 
02478 /* Handle the case when the next item is \d, \s, etc. */
02479 
02480 switch(op_code)
02481   {
02482   case OP_CHAR:
02483   case OP_CHARNC:
02484 #ifdef SUPPORT_UTF8
02485   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
02486 #endif
02487   switch(-next)
02488     {
02489     case ESC_d:
02490     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
02491 
02492     case ESC_D:
02493     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
02494 
02495     case ESC_s:
02496     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
02497 
02498     case ESC_S:
02499     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
02500 
02501     case ESC_w:
02502     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
02503 
02504     case ESC_W:
02505     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
02506 
02507     case ESC_h:
02508     case ESC_H:
02509     switch(item)
02510       {
02511       case 0x09:
02512       case 0x20:
02513       case 0xa0:
02514       case 0x1680:
02515       case 0x180e:
02516       case 0x2000:
02517       case 0x2001:
02518       case 0x2002:
02519       case 0x2003:
02520       case 0x2004:
02521       case 0x2005:
02522       case 0x2006:
02523       case 0x2007:
02524       case 0x2008:
02525       case 0x2009:
02526       case 0x200A:
02527       case 0x202f:
02528       case 0x205f:
02529       case 0x3000:
02530       return -next != ESC_h;
02531       default:
02532       return -next == ESC_h;
02533       }
02534 
02535     case ESC_v:
02536     case ESC_V:
02537     switch(item)
02538       {
02539       case 0x0a:
02540       case 0x0b:
02541       case 0x0c:
02542       case 0x0d:
02543       case 0x85:
02544       case 0x2028:
02545       case 0x2029:
02546       return -next != ESC_v;
02547       default:
02548       return -next == ESC_v;
02549       }
02550 
02551     default:
02552     return FALSE;
02553     }
02554 
02555   case OP_DIGIT:
02556   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
02557          next == -ESC_h || next == -ESC_v;
02558 
02559   case OP_NOT_DIGIT:
02560   return next == -ESC_d;
02561 
02562   case OP_WHITESPACE:
02563   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
02564 
02565   case OP_NOT_WHITESPACE:
02566   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
02567 
02568   case OP_HSPACE:
02569   return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
02570 
02571   case OP_NOT_HSPACE:
02572   return next == -ESC_h;
02573 
02574   /* Can't have \S in here because VT matches \S (Perl anomaly) */
02575   case OP_VSPACE:
02576   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
02577 
02578   case OP_NOT_VSPACE:
02579   return next == -ESC_v;
02580 
02581   case OP_WORDCHAR:
02582   return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
02583 
02584   case OP_NOT_WORDCHAR:
02585   return next == -ESC_w || next == -ESC_d;
02586 
02587   default:
02588   return FALSE;
02589   }
02590 
02591 /* Control does not reach here */
02592 }
02593 
02594 
02595 
02596 /*************************************************
02597 *           Compile one branch                   *
02598 *************************************************/
02599 
02600 /* Scan the pattern, compiling it into a vector. If the options are
02601 changed during the branch, the pointer is used to change the external options
02602 bits. This function is used during the pre-compile phase when we are trying
02603 to find out the amount of memory needed, as well as during the real compile
02604 phase. The value of lengthptr distinguishes the two phases.
02605 
02606 Arguments:
02607   optionsptr     pointer to the option bits
02608   codeptr        points to the pointer to the current code point
02609   ptrptr         points to the current pattern pointer
02610   errorcodeptr   points to error code variable
02611   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
02612   reqbyteptr     set to the last literal character required, else < 0
02613   bcptr          points to current branch chain
02614   cd             contains pointers to tables etc.
02615   lengthptr      NULL during the real compile phase
02616                  points to length accumulator during pre-compile phase
02617 
02618 Returns:         TRUE on success
02619                  FALSE, with *errorcodeptr set non-zero on error
02620 */
02621 
02622 static BOOL
02623 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
02624   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
02625   compile_data *cd, int *lengthptr)
02626 {
02627 int repeat_type, op_type;
02628 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
02629 int bravalue = 0;
02630 int greedy_default, greedy_non_default;
02631 int firstbyte, reqbyte;
02632 int zeroreqbyte, zerofirstbyte;
02633 int req_caseopt, reqvary, tempreqvary;
02634 int options = *optionsptr;
02635 int after_manual_callout = 0;
02636 int length_prevgroup = 0;
02637 register int c;
02638 register uschar *code = *codeptr;
02639 uschar *last_code = code;
02640 uschar *orig_code = code;
02641 uschar *tempcode;
02642 BOOL inescq = FALSE;
02643 BOOL groupsetfirstbyte = FALSE;
02644 const uschar *ptr = *ptrptr;
02645 const uschar *tempptr;
02646 uschar *previous = NULL;
02647 uschar *previous_callout = NULL;
02648 uschar *save_hwm = NULL;
02649 uschar classbits[32];
02650 
02651 #ifdef SUPPORT_UTF8
02652 BOOL class_utf8;
02653 BOOL utf8 = (options & PCRE_UTF8) != 0;
02654 uschar *class_utf8data;
02655 uschar *class_utf8data_base;
02656 uschar utf8_char[6];
02657 #else
02658 BOOL utf8 = FALSE;
02659 uschar *utf8_char = NULL;
02660 #endif
02661 
02662 #ifdef DEBUG
02663 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
02664 #endif
02665 
02666 /* Set up the default and non-default settings for greediness */
02667 
02668 greedy_default = ((options & PCRE_UNGREEDY) != 0);
02669 greedy_non_default = greedy_default ^ 1;
02670 
02671 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
02672 matching encountered yet". It gets changed to REQ_NONE if we hit something that
02673 matches a non-fixed first char; reqbyte just remains unset if we never
02674 find one.
02675 
02676 When we hit a repeat whose minimum is zero, we may have to adjust these values
02677 to take the zero repeat into account. This is implemented by setting them to
02678 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
02679 item types that can be repeated set these backoff variables appropriately. */
02680 
02681 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
02682 
02683 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
02684 according to the current setting of the caseless flag. REQ_CASELESS is a bit
02685 value > 255. It is added into the firstbyte or reqbyte variables to record the
02686 case status of the value. This is used only for ASCII characters. */
02687 
02688 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
02689 
02690 /* Switch on next character until the end of the branch */
02691 
02692 for (;; ptr++)
02693   {
02694   BOOL negate_class;
02695   BOOL should_flip_negation;
02696   BOOL possessive_quantifier;
02697   BOOL is_quantifier;
02698   BOOL is_recurse;
02699   BOOL reset_bracount;
02700   int class_charcount;
02701   int class_lastchar;
02702   int newoptions;
02703   int recno;
02704   int refsign;
02705   int skipbytes;
02706   int subreqbyte;
02707   int subfirstbyte;
02708   int terminator;
02709   int mclength;
02710   uschar mcbuffer[8];
02711 
02712   /* Get next byte in the pattern */
02713 
02714   c = *ptr;
02715 
02716   /* If we are in the pre-compile phase, accumulate the length used for the
02717   previous cycle of this loop. */
02718 
02719   if (lengthptr != NULL)
02720     {
02721 #ifdef DEBUG
02722     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
02723 #endif
02724     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
02725       {
02726       *errorcodeptr = ERR52;
02727       goto FAILED;
02728       }
02729 
02730     /* There is at least one situation where code goes backwards: this is the
02731     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
02732     the class is simply eliminated. However, it is created first, so we have to
02733     allow memory for it. Therefore, don't ever reduce the length at this point.
02734     */
02735 
02736     if (code < last_code) code = last_code;
02737 
02738     /* Paranoid check for integer overflow */
02739 
02740     if (OFLOW_MAX - *lengthptr < code - last_code)
02741       {
02742       *errorcodeptr = ERR20;
02743       goto FAILED;
02744       }
02745 
02746     *lengthptr += code - last_code;
02747     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
02748 
02749     /* If "previous" is set and it is not at the start of the work space, move
02750     it back to there, in order to avoid filling up the work space. Otherwise,
02751     if "previous" is NULL, reset the current code pointer to the start. */
02752 
02753     if (previous != NULL)
02754       {
02755       if (previous > orig_code)
02756         {
02757         memmove(orig_code, previous, code - previous);
02758         code -= previous - orig_code;
02759         previous = orig_code;
02760         }
02761       }
02762     else code = orig_code;
02763 
02764     /* Remember where this code item starts so we can pick up the length
02765     next time round. */
02766 
02767     last_code = code;
02768     }
02769 
02770   /* In the real compile phase, just check the workspace used by the forward
02771   reference list. */
02772 
02773   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
02774     {
02775     *errorcodeptr = ERR52;
02776     goto FAILED;
02777     }
02778 
02779   /* If in \Q...\E, check for the end; if not, we have a literal */
02780 
02781   if (inescq && c != 0)
02782     {
02783     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
02784       {
02785       inescq = FALSE;
02786       ptr++;
02787       continue;
02788       }
02789     else
02790       {
02791       if (previous_callout != NULL)
02792         {
02793         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
02794           complete_callout(previous_callout, ptr, cd);
02795         previous_callout = NULL;
02796         }
02797       if ((options & PCRE_AUTO_CALLOUT) != 0)
02798         {
02799         previous_callout = code;
02800         code = auto_callout(code, ptr, cd);
02801         }
02802       goto NORMAL_CHAR;
02803       }
02804     }
02805 
02806   /* Fill in length of a previous callout, except when the next thing is
02807   a quantifier. */
02808 
02809   is_quantifier =
02810     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
02811     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
02812 
02813   if (!is_quantifier && previous_callout != NULL &&
02814        after_manual_callout-- <= 0)
02815     {
02816     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
02817       complete_callout(previous_callout, ptr, cd);
02818     previous_callout = NULL;
02819     }
02820 
02821   /* In extended mode, skip white space and comments */
02822 
02823   if ((options & PCRE_EXTENDED) != 0)
02824     {
02825     if ((cd->ctypes[c] & ctype_space) != 0) continue;
02826     if (c == CHAR_NUMBER_SIGN)
02827       {
02828       while (*(++ptr) != 0)
02829         {
02830         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
02831         }
02832       if (*ptr != 0) continue;
02833 
02834       /* Else fall through to handle end of string */
02835       c = 0;
02836       }
02837     }
02838 
02839   /* No auto callout for quantifiers. */
02840 
02841   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
02842     {
02843     previous_callout = code;
02844     code = auto_callout(code, ptr, cd);
02845     }
02846 
02847   switch(c)
02848     {
02849     /* ===================================================================*/
02850     case 0:                        /* The branch terminates at string end */
02851     case CHAR_VERTICAL_LINE:       /* or | or ) */
02852     case CHAR_RIGHT_PARENTHESIS:
02853     *firstbyteptr = firstbyte;
02854     *reqbyteptr = reqbyte;
02855     *codeptr = code;
02856     *ptrptr = ptr;
02857     if (lengthptr != NULL)
02858       {
02859       if (OFLOW_MAX - *lengthptr < code - last_code)
02860         {
02861         *errorcodeptr = ERR20;
02862         goto FAILED;
02863         }
02864       *lengthptr += code - last_code;   /* To include callout length */
02865       DPRINTF((">> end branch\n"));
02866       }
02867     return TRUE;
02868 
02869 
02870     /* ===================================================================*/
02871     /* Handle single-character metacharacters. In multiline mode, ^ disables
02872     the setting of any following char as a first character. */
02873 
02874     case CHAR_CIRCUMFLEX_ACCENT:
02875     if ((options & PCRE_MULTILINE) != 0)
02876       {
02877       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02878       }
02879     previous = NULL;
02880     *code++ = OP_CIRC;
02881     break;
02882 
02883     case CHAR_DOLLAR_SIGN:
02884     previous = NULL;
02885     *code++ = OP_DOLL;
02886     break;
02887 
02888     /* There can never be a first char if '.' is first, whatever happens about
02889     repeats. The value of reqbyte doesn't change either. */
02890 
02891     case CHAR_DOT:
02892     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02893     zerofirstbyte = firstbyte;
02894     zeroreqbyte = reqbyte;
02895     previous = code;
02896     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
02897     break;
02898 
02899 
02900     /* ===================================================================*/
02901     /* Character classes. If the included characters are all < 256, we build a
02902     32-byte bitmap of the permitted characters, except in the special case
02903     where there is only one such character. For negated classes, we build the
02904     map as usual, then invert it at the end. However, we use a different opcode
02905     so that data characters > 255 can be handled correctly.
02906 
02907     If the class contains characters outside the 0-255 range, a different
02908     opcode is compiled. It may optionally have a bit map for characters < 256,
02909     but those above are explicitly listed afterwards. A flag byte tells
02910     whether the bitmap is present, and whether this is a negated class or not.
02911 
02912     In JavaScript compatibility mode, an isolated ']' causes an error. In
02913     default (Perl) mode, it is treated as a data character. */
02914 
02915     case CHAR_RIGHT_SQUARE_BRACKET:
02916     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
02917       {
02918       *errorcodeptr = ERR64;
02919       goto FAILED;
02920       }
02921     goto NORMAL_CHAR;
02922 
02923     case CHAR_LEFT_SQUARE_BRACKET:
02924     previous = code;
02925 
02926     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
02927     they are encountered at the top level, so we'll do that too. */
02928 
02929     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
02930          ptr[1] == CHAR_EQUALS_SIGN) &&
02931         check_posix_syntax(ptr, &tempptr))
02932       {
02933       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
02934       goto FAILED;
02935       }
02936 
02937     /* If the first character is '^', set the negation flag and skip it. Also,
02938     if the first few characters (either before or after ^) are \Q\E or \E we
02939     skip them too. This makes for compatibility with Perl. */
02940 
02941     negate_class = FALSE;
02942     for (;;)
02943       {
02944       c = *(++ptr);
02945       if (c == CHAR_BACKSLASH)
02946         {
02947         if (ptr[1] == CHAR_E)
02948           ptr++;
02949         else if (strncmp((const char *)ptr+1,
02950                           STR_Q STR_BACKSLASH STR_E, 3) == 0)
02951           ptr += 3;
02952         else
02953           break;
02954         }
02955       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
02956         negate_class = TRUE;
02957       else break;
02958       }
02959 
02960     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
02961     an initial ']' is taken as a data character -- the code below handles
02962     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
02963     [^] must match any character, so generate OP_ALLANY. */
02964 
02965     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
02966         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
02967       {
02968       *code++ = negate_class? OP_ALLANY : OP_FAIL;
02969       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02970       zerofirstbyte = firstbyte;
02971       break;
02972       }
02973 
02974     /* If a class contains a negative special such as \S, we need to flip the
02975     negation flag at the end, so that support for characters > 255 works
02976     correctly (they are all included in the class). */
02977 
02978     should_flip_negation = FALSE;
02979 
02980     /* Keep a count of chars with values < 256 so that we can optimize the case
02981     of just a single character (as long as it's < 256). However, for higher
02982     valued UTF-8 characters, we don't yet do any optimization. */
02983 
02984     class_charcount = 0;
02985     class_lastchar = -1;
02986 
02987     /* Initialize the 32-char bit map to all zeros. We build the map in a
02988     temporary bit of memory, in case the class contains only 1 character (less
02989     than 256), because in that case the compiled code doesn't use the bit map.
02990     */
02991 
02992     memset(classbits, 0, 32 * sizeof(uschar));
02993 
02994 #ifdef SUPPORT_UTF8
02995     class_utf8 = FALSE;                       /* No chars >= 256 */
02996     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
02997     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
02998 #endif
02999 
03000     /* Process characters until ] is reached. By writing this as a "do" it
03001     means that an initial ] is taken as a data character. At the start of the
03002     loop, c contains the first byte of the character. */
03003 
03004     if (c != 0) do
03005       {
03006       const uschar *oldptr;
03007 
03008 #ifdef SUPPORT_UTF8
03009       if (utf8 && c > 127)
03010         {                           /* Braces are required because the */
03011         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
03012         }
03013 
03014       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
03015       data and reset the pointer. This is so that very large classes that
03016       contain a zillion UTF-8 characters no longer overwrite the work space
03017       (which is on the stack). */
03018 
03019       if (lengthptr != NULL)
03020         {
03021         *lengthptr += class_utf8data - class_utf8data_base;
03022         class_utf8data = class_utf8data_base;
03023         }
03024 
03025 #endif
03026 
03027       /* Inside \Q...\E everything is literal except \E */
03028 
03029       if (inescq)
03030         {
03031         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
03032           {
03033           inescq = FALSE;                   /* Reset literal state */
03034           ptr++;                            /* Skip the 'E' */
03035           continue;                         /* Carry on with next */
03036           }
03037         goto CHECK_RANGE;                   /* Could be range if \E follows */
03038         }
03039 
03040       /* Handle POSIX class names. Perl allows a negation extension of the
03041       form [:^name:]. A square bracket that doesn't match the syntax is
03042       treated as a literal. We also recognize the POSIX constructions
03043       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
03044       5.6 and 5.8 do. */
03045 
03046       if (c == CHAR_LEFT_SQUARE_BRACKET &&
03047           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
03048            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
03049         {
03050         BOOL local_negate = FALSE;
03051         int posix_class, taboffset, tabopt;
03052         register const uschar *cbits = cd->cbits;
03053         uschar pbits[32];
03054 
03055         if (ptr[1] != CHAR_COLON)
03056           {
03057           *errorcodeptr = ERR31;
03058           goto FAILED;
03059           }
03060 
03061         ptr += 2;
03062         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
03063           {
03064           local_negate = TRUE;
03065           should_flip_negation = TRUE;  /* Note negative special */
03066           ptr++;
03067           }
03068 
03069         posix_class = check_posix_name(ptr, tempptr - ptr);
03070         if (posix_class < 0)
03071           {
03072           *errorcodeptr = ERR30;
03073           goto FAILED;
03074           }
03075 
03076         /* If matching is caseless, upper and lower are converted to
03077         alpha. This relies on the fact that the class table starts with
03078         alpha, lower, upper as the first 3 entries. */
03079 
03080         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
03081           posix_class = 0;
03082 
03083         /* We build the bit map for the POSIX class in a chunk of local store
03084         because we may be adding and subtracting from it, and we don't want to
03085         subtract bits that may be in the main map already. At the end we or the
03086         result into the bit map that is being built. */
03087 
03088         posix_class *= 3;
03089 
03090         /* Copy in the first table (always present) */
03091 
03092         memcpy(pbits, cbits + posix_class_maps[posix_class],
03093           32 * sizeof(uschar));
03094 
03095         /* If there is a second table, add or remove it as required. */
03096 
03097         taboffset = posix_class_maps[posix_class + 1];
03098         tabopt = posix_class_maps[posix_class + 2];
03099 
03100         if (taboffset >= 0)
03101           {
03102           if (tabopt >= 0)
03103             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
03104           else
03105             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
03106           }
03107 
03108         /* Now see if we need to remove any special characters. An option
03109         value of 1 removes vertical space and 2 removes underscore. */
03110 
03111         if (tabopt < 0) tabopt = -tabopt;
03112         if (tabopt == 1) pbits[1] &= ~0x3c;
03113           else if (tabopt == 2) pbits[11] &= 0x7f;
03114 
03115         /* Add the POSIX table or its complement into the main table that is
03116         being built and we are done. */
03117 
03118         if (local_negate)
03119           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
03120         else
03121           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
03122 
03123         ptr = tempptr + 1;
03124         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
03125         continue;    /* End of POSIX syntax handling */
03126         }
03127 
03128       /* Backslash may introduce a single character, or it may introduce one
03129       of the specials, which just set a flag. The sequence \b is a special
03130       case. Inside a class (and only there) it is treated as backspace.
03131       Elsewhere it marks a word boundary. Other escapes have preset maps ready
03132       to 'or' into the one we are building. We assume they have more than one
03133       character in them, so set class_charcount bigger than one. */
03134 
03135       if (c == CHAR_BACKSLASH)
03136         {
03137         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
03138         if (*errorcodeptr != 0) goto FAILED;
03139 
03140         if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
03141         else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
03142         else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
03143         else if (-c == ESC_Q)            /* Handle start of quoted string */
03144           {
03145           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
03146             {
03147             ptr += 2; /* avoid empty string */
03148             }
03149           else inescq = TRUE;
03150           continue;
03151           }
03152         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
03153 
03154         if (c < 0)
03155           {
03156           register const uschar *cbits = cd->cbits;
03157           class_charcount += 2;     /* Greater than 1 is what matters */
03158 
03159           /* Save time by not doing this in the pre-compile phase. */
03160 
03161           if (lengthptr == NULL) switch (-c)
03162             {
03163             case ESC_d:
03164             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
03165             continue;
03166 
03167             case ESC_D:
03168             should_flip_negation = TRUE;
03169             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
03170             continue;
03171 
03172             case ESC_w:
03173             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
03174             continue;
03175 
03176             case ESC_W:
03177             should_flip_negation = TRUE;
03178             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
03179             continue;
03180 
03181             case ESC_s:
03182             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
03183             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
03184             continue;
03185 
03186             case ESC_S:
03187             should_flip_negation = TRUE;
03188             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
03189             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
03190             continue;
03191 
03192             default:    /* Not recognized; fall through */
03193             break;      /* Need "default" setting to stop compiler warning. */
03194             }
03195 
03196           /* In the pre-compile phase, just do the recognition. */
03197 
03198           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
03199                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
03200 
03201           /* We need to deal with \H, \h, \V, and \v in both phases because
03202           they use extra memory. */
03203 
03204           if (-c == ESC_h)
03205             {
03206             SETBIT(classbits, 0x09); /* HT */
03207             SETBIT(classbits, 0x20); /* SPACE */
03208             SETBIT(classbits, 0xa0); /* NBSP */
03209 #ifdef SUPPORT_UTF8
03210             if (utf8)
03211               {
03212               class_utf8 = TRUE;
03213               *class_utf8data++ = XCL_SINGLE;
03214               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
03215               *class_utf8data++ = XCL_SINGLE;
03216               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
03217               *class_utf8data++ = XCL_RANGE;
03218               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
03219               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
03220               *class_utf8data++ = XCL_SINGLE;
03221               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
03222               *class_utf8data++ = XCL_SINGLE;
03223               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
03224               *class_utf8data++ = XCL_SINGLE;
03225               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
03226               }
03227 #endif
03228             continue;
03229             }
03230 
03231           if (-c == ESC_H)
03232             {
03233             for (c = 0; c < 32; c++)
03234               {
03235               int x = 0xff;
03236               switch (c)
03237                 {
03238                 case 0x09/8: x ^= 1 << (0x09%8); break;
03239                 case 0x20/8: x ^= 1 << (0x20%8); break;
03240                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
03241                 default: break;
03242                 }
03243               classbits[c] |= x;
03244               }
03245 
03246 #ifdef SUPPORT_UTF8
03247             if (utf8)
03248               {
03249               class_utf8 = TRUE;
03250               *class_utf8data++ = XCL_RANGE;
03251               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
03252               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
03253               *class_utf8data++ = XCL_RANGE;
03254               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
03255               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
03256               *class_utf8data++ = XCL_RANGE;
03257               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
03258               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
03259               *class_utf8data++ = XCL_RANGE;
03260               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
03261               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
03262               *class_utf8data++ = XCL_RANGE;
03263               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
03264               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
03265               *class_utf8data++ = XCL_RANGE;
03266               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
03267               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
03268               *class_utf8data++ = XCL_RANGE;
03269               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
03270               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
03271               }
03272 #endif
03273             continue;
03274             }
03275 
03276           if (-c == ESC_v)
03277             {
03278             SETBIT(classbits, 0x0a); /* LF */
03279             SETBIT(classbits, 0x0b); /* VT */
03280             SETBIT(classbits, 0x0c); /* FF */
03281             SETBIT(classbits, 0x0d); /* CR */
03282             SETBIT(classbits, 0x85); /* NEL */
03283 #ifdef SUPPORT_UTF8
03284             if (utf8)
03285               {
03286               class_utf8 = TRUE;
03287               *class_utf8data++ = XCL_RANGE;
03288               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
03289               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
03290               }
03291 #endif
03292             continue;
03293             }
03294 
03295           if (-c == ESC_V)
03296             {
03297             for (c = 0; c < 32; c++)
03298               {
03299               int x = 0xff;
03300               switch (c)
03301                 {
03302                 case 0x0a/8: x ^= 1 << (0x0a%8);
03303                              x ^= 1 << (0x0b%8);
03304                              x ^= 1 << (0x0c%8);
03305                              x ^= 1 << (0x0d%8);
03306                              break;
03307                 case 0x85/8: x ^= 1 << (0x85%8); break;
03308                 default: break;
03309                 }
03310               classbits[c] |= x;
03311               }
03312 
03313 #ifdef SUPPORT_UTF8
03314             if (utf8)
03315               {
03316               class_utf8 = TRUE;
03317               *class_utf8data++ = XCL_RANGE;
03318               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
03319               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
03320               *class_utf8data++ = XCL_RANGE;
03321               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
03322               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
03323               }
03324 #endif
03325             continue;
03326             }
03327 
03328           /* We need to deal with \P and \p in both phases. */
03329 
03330 #ifdef SUPPORT_UCP
03331           if (-c == ESC_p || -c == ESC_P)
03332             {
03333             BOOL negated;
03334             int pdata;
03335             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
03336             if (ptype < 0) goto FAILED;
03337             class_utf8 = TRUE;
03338             *class_utf8data++ = ((-c == ESC_p) != negated)?
03339               XCL_PROP : XCL_NOTPROP;
03340             *class_utf8data++ = ptype;
03341             *class_utf8data++ = pdata;
03342             class_charcount -= 2;   /* Not a < 256 character */
03343             continue;
03344             }
03345 #endif
03346           /* Unrecognized escapes are faulted if PCRE is running in its
03347           strict mode. By default, for compatibility with Perl, they are
03348           treated as literals. */
03349 
03350           if ((options & PCRE_EXTRA) != 0)
03351             {
03352             *errorcodeptr = ERR7;
03353             goto FAILED;
03354             }
03355 
03356           class_charcount -= 2;  /* Undo the default count from above */
03357           c = *ptr;              /* Get the final character and fall through */
03358           }
03359 
03360         /* Fall through if we have a single character (c >= 0). This may be
03361         greater than 256 in UTF-8 mode. */
03362 
03363         }   /* End of backslash handling */
03364 
03365       /* A single character may be followed by '-' to form a range. However,
03366       Perl does not permit ']' to be the end of the range. A '-' character
03367       at the end is treated as a literal. Perl ignores orphaned \E sequences
03368       entirely. The code for handling \Q and \E is messy. */
03369 
03370       CHECK_RANGE:
03371       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
03372         {
03373         inescq = FALSE;
03374         ptr += 2;
03375         }
03376 
03377       oldptr = ptr;
03378 
03379       /* Remember \r or \n */
03380 
03381       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
03382 
03383       /* Check for range */
03384 
03385       if (!inescq && ptr[1] == CHAR_MINUS)
03386         {
03387         int d;
03388         ptr += 2;
03389         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
03390 
03391         /* If we hit \Q (not followed by \E) at this point, go into escaped
03392         mode. */
03393 
03394         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
03395           {
03396           ptr += 2;
03397           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
03398             { ptr += 2; continue; }
03399           inescq = TRUE;
03400           break;
03401           }
03402 
03403         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
03404           {
03405           ptr = oldptr;
03406           goto LONE_SINGLE_CHARACTER;
03407           }
03408 
03409 #ifdef SUPPORT_UTF8
03410         if (utf8)
03411           {                           /* Braces are required because the */
03412           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
03413           }
03414         else
03415 #endif
03416         d = *ptr;  /* Not UTF-8 mode */
03417 
03418         /* The second part of a range can be a single-character escape, but
03419         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
03420         in such circumstances. */
03421 
03422         if (!inescq && d == CHAR_BACKSLASH)
03423           {
03424           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
03425           if (*errorcodeptr != 0) goto FAILED;
03426 
03427           /* \b is backspace; \X is literal X; \R is literal R; any other
03428           special means the '-' was literal */
03429 
03430           if (d < 0)
03431             {
03432             if (d == -ESC_b) d = CHAR_BS;
03433             else if (d == -ESC_X) d = CHAR_X;
03434             else if (d == -ESC_R) d = CHAR_R; else
03435               {
03436               ptr = oldptr;
03437               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
03438               }
03439             }
03440           }
03441 
03442         /* Check that the two values are in the correct order. Optimize
03443         one-character ranges */
03444 
03445         if (d < c)
03446           {
03447           *errorcodeptr = ERR8;
03448           goto FAILED;
03449           }
03450 
03451         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
03452 
03453         /* Remember \r or \n */
03454 
03455         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
03456 
03457         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
03458         matching, we have to use an XCLASS with extra data items. Caseless
03459         matching for characters > 127 is available only if UCP support is
03460         available. */
03461 
03462 #ifdef SUPPORT_UTF8
03463         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
03464           {
03465           class_utf8 = TRUE;
03466 
03467           /* With UCP support, we can find the other case equivalents of
03468           the relevant characters. There may be several ranges. Optimize how
03469           they fit with the basic range. */
03470 
03471 #ifdef SUPPORT_UCP
03472           if ((options & PCRE_CASELESS) != 0)
03473             {
03474             unsigned int occ, ocd;
03475             unsigned int cc = c;
03476             unsigned int origd = d;
03477             while (get_othercase_range(&cc, origd, &occ, &ocd))
03478               {
03479               if (occ >= (unsigned int)c &&
03480                   ocd <= (unsigned int)d)
03481                 continue;                          /* Skip embedded ranges */
03482 
03483               if (occ < (unsigned int)c  &&
03484                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
03485                 {                                  /* if there is overlap,   */
03486                 c = occ;                           /* noting that if occ < c */
03487                 continue;                          /* we can't have ocd > d  */
03488                 }                                  /* because a subrange is  */
03489               if (ocd > (unsigned int)d &&
03490                   occ <= (unsigned int)d + 1)      /* always shorter than    */
03491                 {                                  /* the basic range.       */
03492                 d = ocd;
03493                 continue;
03494                 }
03495 
03496               if (occ == ocd)
03497                 {
03498                 *class_utf8data++ = XCL_SINGLE;
03499                 }
03500               else
03501                 {
03502                 *class_utf8data++ = XCL_RANGE;
03503                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
03504                 }
03505               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
03506               }
03507             }
03508 #endif  /* SUPPORT_UCP */
03509 
03510           /* Now record the original range, possibly modified for UCP caseless
03511           overlapping ranges. */
03512 
03513           *class_utf8data++ = XCL_RANGE;
03514           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
03515           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
03516 
03517           /* With UCP support, we are done. Without UCP support, there is no
03518           caseless matching for UTF-8 characters > 127; we can use the bit map
03519           for the smaller ones. */
03520 
03521 #ifdef SUPPORT_UCP
03522           continue;    /* With next character in the class */
03523 #else
03524           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
03525 
03526           /* Adjust upper limit and fall through to set up the map */
03527 
03528           d = 127;
03529 
03530 #endif  /* SUPPORT_UCP */
03531           }
03532 #endif  /* SUPPORT_UTF8 */
03533 
03534         /* We use the bit map for all cases when not in UTF-8 mode; else
03535         ranges that lie entirely within 0-127 when there is UCP support; else
03536         for partial ranges without UCP support. */
03537 
03538         class_charcount += d - c + 1;
03539         class_lastchar = d;
03540 
03541         /* We can save a bit of time by skipping this in the pre-compile. */
03542 
03543         if (lengthptr == NULL) for (; c <= d; c++)
03544           {
03545           classbits[c/8] |= (1 << (c&7));
03546           if ((options & PCRE_CASELESS) != 0)
03547             {
03548             int uc = cd->fcc[c];           /* flip case */
03549             classbits[uc/8] |= (1 << (uc&7));
03550             }
03551           }
03552 
03553         continue;   /* Go get the next char in the class */
03554         }
03555 
03556       /* Handle a lone single character - we can get here for a normal
03557       non-escape char, or after \ that introduces a single character or for an
03558       apparent range that isn't. */
03559 
03560       LONE_SINGLE_CHARACTER:
03561 
03562       /* Handle a character that cannot go in the bit map */
03563 
03564 #ifdef SUPPORT_UTF8
03565       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
03566         {
03567         class_utf8 = TRUE;
03568         *class_utf8data++ = XCL_SINGLE;
03569         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
03570 
03571 #ifdef SUPPORT_UCP
03572         if ((options & PCRE_CASELESS) != 0)
03573           {
03574           unsigned int othercase;
03575           if ((othercase = UCD_OTHERCASE(c)) != c)
03576             {
03577             *class_utf8data++ = XCL_SINGLE;
03578             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
03579             }
03580           }
03581 #endif  /* SUPPORT_UCP */
03582 
03583         }
03584       else
03585 #endif  /* SUPPORT_UTF8 */
03586 
03587       /* Handle a single-byte character */
03588         {
03589         classbits[c/8] |= (1 << (c&7));
03590         if ((options & PCRE_CASELESS) != 0)
03591           {
03592           c = cd->fcc[c];   /* flip case */
03593           classbits[c/8] |= (1 << (c&7));
03594           }
03595         class_charcount++;
03596         class_lastchar = c;
03597         }
03598       }
03599 
03600     /* Loop until ']' reached. This "while" is the end of the "do" above. */
03601 
03602     while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
03603 
03604     if (c == 0)                          /* Missing terminating ']' */
03605       {
03606       *errorcodeptr = ERR6;
03607       goto FAILED;
03608       }
03609 
03610 
03611 /* This code has been disabled because it would mean that \s counts as
03612 an explicit \r or \n reference, and that's not really what is wanted. Now
03613 we set the flag only if there is a literal "\r" or "\n" in the class. */
03614 
03615 #if 0
03616     /* Remember whether \r or \n are in this class */
03617 
03618     if (negate_class)
03619       {
03620       if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
03621       }
03622     else
03623       {
03624       if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
03625       }
03626 #endif
03627 
03628 
03629     /* If class_charcount is 1, we saw precisely one character whose value is
03630     less than 256. As long as there were no characters >= 128 and there was no
03631     use of \p or \P, in other words, no use of any XCLASS features, we can
03632     optimize.
03633 
03634     In UTF-8 mode, we can optimize the negative case only if there were no
03635     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
03636     operate on single-bytes only. This is an historical hangover. Maybe one day
03637     we can tidy these opcodes to handle multi-byte characters.
03638 
03639     The optimization throws away the bit map. We turn the item into a
03640     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
03641     that OP_NOT does not support multibyte characters. In the positive case, it
03642     can cause firstbyte to be set. Otherwise, there can be no first char if
03643     this item is first, whatever repeat count may follow. In the case of
03644     reqbyte, save the previous value for reinstating. */
03645 
03646 #ifdef SUPPORT_UTF8
03647     if (class_charcount == 1 && !class_utf8 &&
03648       (!utf8 || !negate_class || class_lastchar < 128))
03649 #else
03650     if (class_charcount == 1)
03651 #endif
03652       {
03653       zeroreqbyte = reqbyte;
03654 
03655       /* The OP_NOT opcode works on one-byte characters only. */
03656 
03657       if (negate_class)
03658         {
03659         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
03660         zerofirstbyte = firstbyte;
03661         *code++ = OP_NOT;
03662         *code++ = class_lastchar;
03663         break;
03664         }
03665 
03666       /* For a single, positive character, get the value into mcbuffer, and
03667       then we can handle this with the normal one-character code. */
03668 
03669 #ifdef SUPPORT_UTF8
03670       if (utf8 && class_lastchar > 127)
03671         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
03672       else
03673 #endif
03674         {
03675         mcbuffer[0] = class_lastchar;
03676         mclength = 1;
03677         }
03678       goto ONE_CHAR;
03679       }       /* End of 1-char optimization */
03680 
03681     /* The general case - not the one-char optimization. If this is the first
03682     thing in the branch, there can be no first char setting, whatever the
03683     repeat count. Any reqbyte setting must remain unchanged after any kind of
03684     repeat. */
03685 
03686     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
03687     zerofirstbyte = firstbyte;
03688     zeroreqbyte = reqbyte;
03689 
03690     /* If there are characters with values > 255, we have to compile an
03691     extended class, with its own opcode, unless there was a negated special
03692     such as \S in the class, because in that case all characters > 255 are in
03693     the class, so any that were explicitly given as well can be ignored. If
03694     (when there are explicit characters > 255 that must be listed) there are no
03695     characters < 256, we can omit the bitmap in the actual compiled code. */
03696 
03697 #ifdef SUPPORT_UTF8
03698     if (class_utf8 && !should_flip_negation)
03699       {
03700       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
03701       *code++ = OP_XCLASS;
03702       code += LINK_SIZE;
03703       *code = negate_class? XCL_NOT : 0;
03704 
03705       /* If the map is required, move up the extra data to make room for it;
03706       otherwise just move the code pointer to the end of the extra data. */
03707 
03708       if (class_charcount > 0)
03709         {
03710         *code++ |= XCL_MAP;
03711         memmove(code + 32, code, class_utf8data - code);
03712         memcpy(code, classbits, 32);
03713         code = class_utf8data + 32;
03714         }
03715       else code = class_utf8data;
03716 
03717       /* Now fill in the complete length of the item */
03718 
03719       PUT(previous, 1, code - previous);
03720       break;   /* End of class handling */
03721       }
03722 #endif
03723 
03724     /* If there are no characters > 255, set the opcode to OP_CLASS or
03725     OP_NCLASS, depending on whether the whole class was negated and whether
03726     there were negative specials such as \S in the class. Then copy the 32-byte
03727     map into the code vector, negating it if necessary. */
03728 
03729     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
03730     if (negate_class)
03731       {
03732       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
03733         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
03734       }
03735     else
03736       {
03737       memcpy(code, classbits, 32);
03738       }
03739     code += 32;
03740     break;
03741 
03742 
03743     /* ===================================================================*/
03744     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
03745     has been tested above. */
03746 
03747     case CHAR_LEFT_CURLY_BRACKET:
03748     if (!is_quantifier) goto NORMAL_CHAR;
03749     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
03750     if (*errorcodeptr != 0) goto FAILED;
03751     goto REPEAT;
03752 
03753     case CHAR_ASTERISK:
03754     repeat_min = 0;
03755     repeat_max = -1;
03756     goto REPEAT;
03757 
03758     case CHAR_PLUS:
03759     repeat_min = 1;
03760     repeat_max = -1;
03761     goto REPEAT;
03762 
03763     case CHAR_QUESTION_MARK:
03764     repeat_min = 0;
03765     repeat_max = 1;
03766 
03767     REPEAT:
03768     if (previous == NULL)
03769       {
03770       *errorcodeptr = ERR9;
03771       goto FAILED;
03772       }
03773 
03774     if (repeat_min == 0)
03775       {
03776       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
03777       reqbyte = zeroreqbyte;        /* Ditto */
03778       }
03779 
03780     /* Remember whether this is a variable length repeat */
03781 
03782     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
03783 
03784     op_type = 0;                    /* Default single-char op codes */
03785     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
03786 
03787     /* Save start of previous item, in case we have to move it up to make space
03788     for an inserted OP_ONCE for the additional '+' extension. */
03789 
03790     tempcode = previous;
03791 
03792     /* If the next character is '+', we have a possessive quantifier. This
03793     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
03794     If the next character is '?' this is a minimizing repeat, by default,
03795     but if PCRE_UNGREEDY is set, it works the other way round. We change the
03796     repeat type to the non-default. */
03797 
03798     if (ptr[1] == CHAR_PLUS)
03799       {
03800       repeat_type = 0;                  /* Force greedy */
03801       possessive_quantifier = TRUE;
03802       ptr++;
03803       }
03804     else if (ptr[1] == CHAR_QUESTION_MARK)
03805       {
03806       repeat_type = greedy_non_default;
03807       ptr++;
03808       }
03809     else repeat_type = greedy_default;
03810 
03811     /* If previous was a character match, abolish the item and generate a
03812     repeat item instead. If a char item has a minimum of more than one, ensure
03813     that it is set in reqbyte - it might not be if a sequence such as x{3} is
03814     the first thing in a branch because the x will have gone into firstbyte
03815     instead.  */
03816 
03817     if (*previous == OP_CHAR || *previous == OP_CHARNC)
03818       {
03819       /* Deal with UTF-8 characters that take up more than one byte. It's
03820       easier to write this out separately than try to macrify it. Use c to
03821       hold the length of the character in bytes, plus 0x80 to flag that it's a
03822       length rather than a small character. */
03823 
03824 #ifdef SUPPORT_UTF8
03825       if (utf8 && (code[-1] & 0x80) != 0)
03826         {
03827         uschar *lastchar = code - 1;
03828         while((*lastchar & 0xc0) == 0x80) lastchar--;
03829         c = code - lastchar;            /* Length of UTF-8 character */
03830         memcpy(utf8_char, lastchar, c); /* Save the char */
03831         c |= 0x80;                      /* Flag c as a length */
03832         }
03833       else
03834 #endif
03835 
03836       /* Handle the case of a single byte - either with no UTF8 support, or
03837       with UTF-8 disabled, or for a UTF-8 character < 128. */
03838 
03839         {
03840         c = code[-1];
03841         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
03842         }
03843 
03844       /* If the repetition is unlimited, it pays to see if the next thing on
03845       the line is something that cannot possibly match this character. If so,
03846       automatically possessifying this item gains some performance in the case
03847       where the match fails. */
03848 
03849       if (!possessive_quantifier &&
03850           repeat_max < 0 &&
03851           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
03852             options, cd))
03853         {
03854         repeat_type = 0;    /* Force greedy */
03855         possessive_quantifier = TRUE;
03856         }
03857 
03858       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
03859       }
03860 
03861     /* If previous was a single negated character ([^a] or similar), we use
03862     one of the special opcodes, replacing it. The code is shared with single-
03863     character repeats by setting op_type to add a suitable offset into
03864     repeat_type. We can also test for auto-possessification. OP_NOT is
03865     currently used only for single-byte chars. */
03866 
03867     else if (*previous == OP_NOT)
03868       {
03869       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
03870       c = previous[1];
03871       if (!possessive_quantifier &&
03872           repeat_max < 0 &&
03873           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
03874         {
03875         repeat_type = 0;    /* Force greedy */
03876         possessive_quantifier = TRUE;
03877         }
03878       goto OUTPUT_SINGLE_REPEAT;
03879       }
03880 
03881     /* If previous was a character type match (\d or similar), abolish it and
03882     create a suitable repeat item. The code is shared with single-character
03883     repeats by setting op_type to add a suitable offset into repeat_type. Note
03884     that the Unicode property types will be present only when SUPPORT_UCP is
03885     defined, but we don't wrap the little bits of code here because it just
03886     makes it horribly messy. */
03887 
03888     else if (*previous < OP_EODN)
03889       {
03890       uschar *oldcode;
03891       int prop_type, prop_value;
03892       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
03893       c = *previous;
03894 
03895       if (!possessive_quantifier &&
03896           repeat_max < 0 &&
03897           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
03898         {
03899         repeat_type = 0;    /* Force greedy */
03900         possessive_quantifier = TRUE;
03901         }
03902 
03903       OUTPUT_SINGLE_REPEAT:
03904       if (*previous == OP_PROP || *previous == OP_NOTPROP)
03905         {
03906         prop_type = previous[1];
03907         prop_value = previous[2];
03908         }
03909       else prop_type = prop_value = -1;
03910 
03911       oldcode = code;
03912       code = previous;                  /* Usually overwrite previous item */
03913 
03914       /* If the maximum is zero then the minimum must also be zero; Perl allows
03915       this case, so we do too - by simply omitting the item altogether. */
03916 
03917       if (repeat_max == 0) goto END_REPEAT;
03918 
03919       /*--------------------------------------------------------------------*/
03920       /* This code is obsolete from release 8.00; the restriction was finally
03921       removed: */
03922 
03923       /* All real repeats make it impossible to handle partial matching (maybe
03924       one day we will be able to remove this restriction). */
03925 
03926       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
03927       /*--------------------------------------------------------------------*/
03928 
03929       /* Combine the op_type with the repeat_type */
03930 
03931       repeat_type += op_type;
03932 
03933       /* A minimum of zero is handled either as the special case * or ?, or as
03934       an UPTO, with the maximum given. */
03935 
03936       if (repeat_min == 0)
03937         {
03938         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
03939           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
03940         else
03941           {
03942           *code++ = OP_UPTO + repeat_type;
03943           PUT2INC(code, 0, repeat_max);
03944           }
03945         }
03946 
03947       /* A repeat minimum of 1 is optimized into some special cases. If the
03948       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
03949       left in place and, if the maximum is greater than 1, we use OP_UPTO with
03950       one less than the maximum. */
03951 
03952       else if (repeat_min == 1)
03953         {
03954         if (repeat_max == -1)
03955           *code++ = OP_PLUS + repeat_type;
03956         else
03957           {
03958           code = oldcode;                 /* leave previous item in place */
03959           if (repeat_max == 1) goto END_REPEAT;
03960           *code++ = OP_UPTO + repeat_type;
03961           PUT2INC(code, 0, repeat_max - 1);
03962           }
03963         }
03964 
03965       /* The case {n,n} is just an EXACT, while the general case {n,m} is
03966       handled as an EXACT followed by an UPTO. */
03967 
03968       else
03969         {
03970         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
03971         PUT2INC(code, 0, repeat_min);
03972 
03973         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
03974         we have to insert the character for the previous code. For a repeated
03975         Unicode property match, there are two extra bytes that define the
03976         required property. In UTF-8 mode, long characters have their length in
03977         c, with the 0x80 bit as a flag. */
03978 
03979         if (repeat_max < 0)
03980           {
03981 #ifdef SUPPORT_UTF8
03982           if (utf8 && c >= 128)
03983             {
03984             memcpy(code, utf8_char, c & 7);
03985             code += c & 7;
03986             }
03987           else
03988 #endif
03989             {
03990             *code++ = c;
03991             if (prop_type >= 0)
03992               {
03993               *code++ = prop_type;
03994               *code++ = prop_value;
03995               }
03996             }
03997           *code++ = OP_STAR + repeat_type;
03998           }
03999 
04000         /* Else insert an UPTO if the max is greater than the min, again
04001         preceded by the character, for the previously inserted code. If the
04002         UPTO is just for 1 instance, we can use QUERY instead. */
04003 
04004         else if (repeat_max != repeat_min)
04005           {
04006 #ifdef SUPPORT_UTF8
04007           if (utf8 && c >= 128)
04008             {
04009             memcpy(code, utf8_char, c & 7);
04010             code += c & 7;
04011             }
04012           else
04013 #endif
04014           *code++ = c;
04015           if (prop_type >= 0)
04016             {
04017             *code++ = prop_type;
04018             *code++ = prop_value;
04019             }
04020           repeat_max -= repeat_min;
04021 
04022           if (repeat_max == 1)
04023             {
04024             *code++ = OP_QUERY + repeat_type;
04025             }
04026           else
04027             {
04028             *code++ = OP_UPTO + repeat_type;
04029             PUT2INC(code, 0, repeat_max);
04030             }
04031           }
04032         }
04033 
04034       /* The character or character type itself comes last in all cases. */
04035 
04036 #ifdef SUPPORT_UTF8
04037       if (utf8 && c >= 128)
04038         {
04039         memcpy(code, utf8_char, c & 7);
04040         code += c & 7;
04041         }
04042       else
04043 #endif
04044       *code++ = c;
04045 
04046       /* For a repeated Unicode property match, there are two extra bytes that
04047       define the required property. */
04048 
04049 #ifdef SUPPORT_UCP
04050       if (prop_type >= 0)
04051         {
04052         *code++ = prop_type;
04053         *code++ = prop_value;
04054         }
04055 #endif
04056       }
04057 
04058     /* If previous was a character class or a back reference, we put the repeat
04059     stuff after it, but just skip the item if the repeat was {0,0}. */
04060 
04061     else if (*previous == OP_CLASS ||
04062              *previous == OP_NCLASS ||
04063 #ifdef SUPPORT_UTF8
04064              *previous == OP_XCLASS ||
04065 #endif
04066              *previous == OP_REF)
04067       {
04068       if (repeat_max == 0)
04069         {
04070         code = previous;
04071         goto END_REPEAT;
04072         }
04073 
04074       /*--------------------------------------------------------------------*/
04075       /* This code is obsolete from release 8.00; the restriction was finally
04076       removed: */
04077 
04078       /* All real repeats make it impossible to handle partial matching (maybe
04079       one day we will be able to remove this restriction). */
04080 
04081       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
04082       /*--------------------------------------------------------------------*/
04083 
04084       if (repeat_min == 0 && repeat_max == -1)
04085         *code++ = OP_CRSTAR + repeat_type;
04086       else if (repeat_min == 1 && repeat_max == -1)
04087         *code++ = OP_CRPLUS + repeat_type;
04088       else if (repeat_min == 0 && repeat_max == 1)
04089         *code++ = OP_CRQUERY + repeat_type;
04090       else
04091         {
04092         *code++ = OP_CRRANGE + repeat_type;
04093         PUT2INC(code, 0, repeat_min);
04094         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
04095         PUT2INC(code, 0, repeat_max);
04096         }
04097       }
04098 
04099     /* If previous was a bracket group, we may have to replicate it in certain
04100     cases. */
04101 
04102     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
04103              *previous == OP_ONCE || *previous == OP_COND)
04104       {
04105       register int i;
04106       int ketoffset = 0;
04107       int len = code - previous;
04108       uschar *bralink = NULL;
04109 
04110       /* Repeating a DEFINE group is pointless */
04111 
04112       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
04113         {
04114         *errorcodeptr = ERR55;
04115         goto FAILED;
04116         }
04117 
04118       /* If the maximum repeat count is unlimited, find the end of the bracket
04119       by scanning through from the start, and compute the offset back to it
04120       from the current code pointer. There may be an OP_OPT setting following
04121       the final KET, so we can't find the end just by going back from the code
04122       pointer. */
04123 
04124       if (repeat_max == -1)
04125         {
04126         register uschar *ket = previous;
04127         do ket += GET(ket, 1); while (*ket != OP_KET);
04128         ketoffset = code - ket;
04129         }
04130 
04131       /* The case of a zero minimum is special because of the need to stick
04132       OP_BRAZERO in front of it, and because the group appears once in the
04133       data, whereas in other cases it appears the minimum number of times. For
04134       this reason, it is simplest to treat this case separately, as otherwise
04135       the code gets far too messy. There are several special subcases when the
04136       minimum is zero. */
04137 
04138       if (repeat_min == 0)
04139         {
04140         /* If the maximum is also zero, we used to just omit the group from the
04141         output altogether, like this:
04142 
04143         ** if (repeat_max == 0)
04144         **   {
04145         **   code = previous;
04146         **   goto END_REPEAT;
04147         **   }
04148 
04149         However, that fails when a group is referenced as a subroutine from
04150         elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
04151         so that it is skipped on execution. As we don't have a list of which
04152         groups are referenced, we cannot do this selectively.
04153 
04154         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
04155         and do no more at this point. However, we do need to adjust any
04156         OP_RECURSE calls inside the group that refer to the group itself or any
04157         internal or forward referenced group, because the offset is from the
04158         start of the whole regex. Temporarily terminate the pattern while doing
04159         this. */
04160 
04161         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
04162           {
04163           *code = OP_END;
04164           adjust_recurse(previous, 1, utf8, cd, save_hwm);
04165           memmove(previous+1, previous, len);
04166           code++;
04167           if (repeat_max == 0)
04168             {
04169             *previous++ = OP_SKIPZERO;
04170             goto END_REPEAT;
04171             }
04172           *previous++ = OP_BRAZERO + repeat_type;
04173           }
04174 
04175         /* If the maximum is greater than 1 and limited, we have to replicate
04176         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
04177         The first one has to be handled carefully because it's the original
04178         copy, which has to be moved up. The remainder can be handled by code
04179         that is common with the non-zero minimum case below. We have to
04180         adjust the value of repeat_max, since one less copy is required. Once
04181         again, we may have to adjust any OP_RECURSE calls inside the group. */
04182 
04183         else
04184           {
04185           int offset;
04186           *code = OP_END;
04187           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
04188           memmove(previous + 2 + LINK_SIZE, previous, len);
04189           code += 2 + LINK_SIZE;
04190           *previous++ = OP_BRAZERO + repeat_type;
04191           *previous++ = OP_BRA;
04192 
04193           /* We chain together the bracket offset fields that have to be
04194           filled in later when the ends of the brackets are reached. */
04195 
04196           offset = (bralink == NULL)? 0 : previous - bralink;
04197           bralink = previous;
04198           PUTINC(previous, 0, offset);
04199           }
04200 
04201         repeat_max--;
04202         }
04203 
04204       /* If the minimum is greater than zero, replicate the group as many
04205       times as necessary, and adjust the maximum to the number of subsequent
04206       copies that we need. If we set a first char from the group, and didn't
04207       set a required char, copy the latter from the former. If there are any
04208       forward reference subroutine calls in the group, there will be entries on
04209       the workspace list; replicate these with an appropriate increment. */
04210 
04211       else
04212         {
04213         if (repeat_min > 1)
04214           {
04215           /* In the pre-compile phase, we don't actually do the replication. We
04216           just adjust the length as if we had. Do some paranoid checks for
04217           potential integer overflow. */
04218 
04219           if (lengthptr != NULL)
04220             {
04221             int delta = (repeat_min - 1)*length_prevgroup;
04222             if ((double)(repeat_min - 1)*(double)length_prevgroup >
04223                                                             (double)INT_MAX ||
04224                 OFLOW_MAX - *lengthptr < delta)
04225               {
04226               *errorcodeptr = ERR20;
04227               goto FAILED;
04228               }
04229             *lengthptr += delta;
04230             }
04231 
04232           /* This is compiling for real */
04233 
04234           else
04235             {
04236             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
04237             for (i = 1; i < repeat_min; i++)
04238               {
04239               uschar *hc;
04240               uschar *this_hwm = cd->hwm;
04241               memcpy(code, previous, len);
04242               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
04243                 {
04244                 PUT(cd->hwm, 0, GET(hc, 0) + len);
04245                 cd->hwm += LINK_SIZE;
04246                 }
04247               save_hwm = this_hwm;
04248               code += len;
04249               }
04250             }
04251           }
04252 
04253         if (repeat_max > 0) repeat_max -= repeat_min;
04254         }
04255 
04256       /* This code is common to both the zero and non-zero minimum cases. If
04257       the maximum is limited, it replicates the group in a nested fashion,
04258       remembering the bracket starts on a stack. In the case of a zero minimum,
04259       the first one was set up above. In all cases the repeat_max now specifies
04260       the number of additional copies needed. Again, we must remember to
04261       replicate entries on the forward reference list. */
04262 
04263       if (repeat_max >= 0)
04264         {
04265         /* In the pre-compile phase, we don't actually do the replication. We
04266         just adjust the length as if we had. For each repetition we must add 1
04267         to the length for BRAZERO and for all but the last repetition we must
04268         add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
04269         paranoid checks to avoid integer overflow. */
04270 
04271         if (lengthptr != NULL && repeat_max > 0)
04272           {
04273           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
04274                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
04275           if ((double)repeat_max *
04276                 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
04277                   > (double)INT_MAX ||
04278               OFLOW_MAX - *lengthptr < delta)
04279             {
04280             *errorcodeptr = ERR20;
04281             goto FAILED;
04282             }
04283           *lengthptr += delta;
04284           }
04285 
04286         /* This is compiling for real */
04287 
04288         else for (i = repeat_max - 1; i >= 0; i--)
04289           {
04290           uschar *hc;
04291           uschar *this_hwm = cd->hwm;
04292 
04293           *code++ = OP_BRAZERO + repeat_type;
04294 
04295           /* All but the final copy start a new nesting, maintaining the
04296           chain of brackets outstanding. */
04297 
04298           if (i != 0)
04299             {
04300             int offset;
04301             *code++ = OP_BRA;
04302             offset = (bralink == NULL)? 0 : code - bralink;
04303             bralink = code;
04304             PUTINC(code, 0, offset);
04305             }
04306 
04307           memcpy(code, previous, len);
04308           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
04309             {
04310             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
04311             cd->hwm += LINK_SIZE;
04312             }
04313           save_hwm = this_hwm;
04314           code += len;
04315           }
04316 
04317         /* Now chain through the pending brackets, and fill in their length
04318         fields (which are holding the chain links pro tem). */
04319 
04320         while (bralink != NULL)
04321           {
04322           int oldlinkoffset;
04323           int offset = code - bralink + 1;
04324           uschar *bra = code - offset;
04325           oldlinkoffset = GET(bra, 1);
04326           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
04327           *code++ = OP_KET;
04328           PUTINC(code, 0, offset);
04329           PUT(bra, 1, offset);
04330           }
04331         }
04332 
04333       /* If the maximum is unlimited, set a repeater in the final copy. We
04334       can't just offset backwards from the current code point, because we
04335       don't know if there's been an options resetting after the ket. The
04336       correct offset was computed above.
04337 
04338       Then, when we are doing the actual compile phase, check to see whether
04339       this group is a non-atomic one that could match an empty string. If so,
04340       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
04341       that runtime checking can be done. [This check is also applied to
04342       atomic groups at runtime, but in a different way.] */
04343 
04344       else
04345         {
04346         uschar *ketcode = code - ketoffset;
04347         uschar *bracode = ketcode - GET(ketcode, 1);
04348         *ketcode = OP_KETRMAX + repeat_type;
04349         if (lengthptr == NULL && *bracode != OP_ONCE)
04350           {
04351           uschar *scode = bracode;
04352           do
04353             {
04354             if (could_be_empty_branch(scode, ketcode, utf8))
04355               {
04356               *bracode += OP_SBRA - OP_BRA;
04357               break;
04358               }
04359             scode += GET(scode, 1);
04360             }
04361           while (*scode == OP_ALT);
04362           }
04363         }
04364       }
04365 
04366     /* If previous is OP_FAIL, it was generated by an empty class [] in
04367     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
04368     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
04369     error above. We can just ignore the repeat in JS case. */
04370 
04371     else if (*previous == OP_FAIL) goto END_REPEAT;
04372 
04373     /* Else there's some kind of shambles */
04374 
04375     else
04376       {
04377       *errorcodeptr = ERR11;
04378       goto FAILED;
04379       }
04380 
04381     /* If the character following a repeat is '+', or if certain optimization
04382     tests above succeeded, possessive_quantifier is TRUE. For some of the
04383     simpler opcodes, there is a special alternative opcode for this. For
04384     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
04385     The '+' notation is just syntactic sugar, taken from Sun's Java package,
04386     but the special opcodes can optimize it a bit. The repeated item starts at
04387     tempcode, not at previous, which might be the first part of a string whose
04388     (former) last char we repeated.
04389 
04390     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
04391     an 'upto' may follow. We skip over an 'exact' item, and then test the
04392     length of what remains before proceeding. */
04393 
04394     if (possessive_quantifier)
04395       {
04396       int len;
04397 
04398       if (*tempcode == OP_TYPEEXACT)
04399         tempcode += _pcre_OP_lengths[*tempcode] +
04400           ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
04401 
04402       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
04403         {
04404         tempcode += _pcre_OP_lengths[*tempcode];
04405 #ifdef SUPPORT_UTF8
04406         if (utf8 && tempcode[-1] >= 0xc0)
04407           tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
04408 #endif
04409         }
04410 
04411       len = code - tempcode;
04412       if (len > 0) switch (*tempcode)
04413         {
04414         case OP_STAR:  *tempcode = OP_POSSTAR; break;
04415         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
04416         case OP_QUERY: *tempcode = OP_POSQUERY; break;
04417         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
04418 
04419         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
04420         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
04421         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
04422         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
04423 
04424         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
04425         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
04426         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
04427         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
04428 
04429         default:
04430         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
04431         code += 1 + LINK_SIZE;
04432         len += 1 + LINK_SIZE;
04433         tempcode[0] = OP_ONCE;
04434         *code++ = OP_KET;
04435         PUTINC(code, 0, len);
04436         PUT(tempcode, 1, len);
04437         break;
04438         }
04439       }
04440 
04441     /* In all cases we no longer have a previous item. We also set the
04442     "follows varying string" flag for subsequently encountered reqbytes if
04443     it isn't already set and we have just passed a varying length item. */
04444 
04445     END_REPEAT:
04446     previous = NULL;
04447     cd->req_varyopt |= reqvary;
04448     break;
04449 
04450 
04451     /* ===================================================================*/
04452     /* Start of nested parenthesized sub-expression, or comment or lookahead or
04453     lookbehind or option setting or condition or all the other extended
04454     parenthesis forms.  */
04455 
04456     case CHAR_LEFT_PARENTHESIS:
04457     newoptions = options;
04458     skipbytes = 0;
04459     bravalue = OP_CBRA;
04460     save_hwm = cd->hwm;
04461     reset_bracount = FALSE;
04462 
04463     /* First deal with various "verbs" that can be introduced by '*'. */
04464 
04465     if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
04466       {
04467       int i, namelen;
04468       const char *vn = verbnames;
04469       const uschar *name = ++ptr;
04470       previous = NULL;
04471       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
04472       if (*ptr == CHAR_COLON)
04473         {
04474         *errorcodeptr = ERR59;   /* Not supported */
04475         goto FAILED;
04476         }
04477       if (*ptr != CHAR_RIGHT_PARENTHESIS)
04478         {
04479         *errorcodeptr = ERR60;
04480         goto FAILED;
04481         }
04482       namelen = ptr - name;
04483       for (i = 0; i < verbcount; i++)
04484         {
04485         if (namelen == verbs[i].len &&
04486             strncmp((char *)name, vn, namelen) == 0)
04487           {
04488           /* Check for open captures before ACCEPT */
04489 
04490           if (verbs[i].op == OP_ACCEPT)
04491             {
04492             open_capitem *oc;
04493             cd->had_accept = TRUE;
04494             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
04495               {
04496               *code++ = OP_CLOSE;
04497               PUT2INC(code, 0, oc->number);
04498               }
04499             }
04500           *code++ = verbs[i].op;
04501           break;
04502           }
04503         vn += verbs[i].len + 1;
04504         }
04505       if (i < verbcount) continue;
04506       *errorcodeptr = ERR60;
04507       goto FAILED;
04508       }
04509 
04510     /* Deal with the extended parentheses; all are introduced by '?', and the
04511     appearance of any of them means that this is not a capturing group. */
04512 
04513     else if (*ptr == CHAR_QUESTION_MARK)
04514       {
04515       int i, set, unset, namelen;
04516       int *optset;
04517       const uschar *name;
04518       uschar *slot;
04519 
04520       switch (*(++ptr))
04521         {
04522         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
04523         ptr++;
04524         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
04525         if (*ptr == 0)
04526           {
04527           *errorcodeptr = ERR18;
04528           goto FAILED;
04529           }
04530         continue;
04531 
04532 
04533         /* ------------------------------------------------------------ */
04534         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
04535         reset_bracount = TRUE;
04536         /* Fall through */
04537 
04538         /* ------------------------------------------------------------ */
04539         case CHAR_COLON:          /* Non-capturing bracket */
04540         bravalue = OP_BRA;
04541         ptr++;
04542         break;
04543 
04544 
04545         /* ------------------------------------------------------------ */
04546         case CHAR_LEFT_PARENTHESIS:
04547         bravalue = OP_COND;       /* Conditional group */
04548 
04549         /* A condition can be an assertion, a number (referring to a numbered
04550         group), a name (referring to a named group), or 'R', referring to
04551         recursion. R<digits> and R&name are also permitted for recursion tests.
04552 
04553         There are several syntaxes for testing a named group: (?(name)) is used
04554         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
04555 
04556         There are two unfortunate ambiguities, caused by history. (a) 'R' can
04557         be the recursive thing or the name 'R' (and similarly for 'R' followed
04558         by digits), and (b) a number could be a name that consists of digits.
04559         In both cases, we look for a name first; if not found, we try the other
04560         cases. */
04561 
04562         /* For conditions that are assertions, check the syntax, and then exit
04563         the switch. This will take control down to where bracketed groups,
04564         including assertions, are processed. */
04565 
04566         if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
04567             ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
04568           break;
04569 
04570         /* Most other conditions use OP_CREF (a couple change to OP_RREF
04571         below), and all need to skip 3 bytes at the start of the group. */
04572 
04573         code[1+LINK_SIZE] = OP_CREF;
04574         skipbytes = 3;
04575         refsign = -1;
04576 
04577         /* Check for a test for recursion in a named group. */
04578 
04579         if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
04580           {
04581           terminator = -1;
04582           ptr += 2;
04583           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
04584           }
04585 
04586         /* Check for a test for a named group's having been set, using the Perl
04587         syntax (?(<name>) or (?('name') */
04588 
04589         else if (ptr[1] == CHAR_LESS_THAN_SIGN)
04590           {
04591           terminator = CHAR_GREATER_THAN_SIGN;
04592           ptr++;
04593           }
04594         else if (ptr[1] == CHAR_APOSTROPHE)
04595           {
04596           terminator = CHAR_APOSTROPHE;
04597           ptr++;
04598           }
04599         else
04600           {
04601           terminator = 0;
04602           if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
04603           }
04604 
04605         /* We now expect to read a name; anything else is an error */
04606 
04607         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
04608           {
04609           ptr += 1;  /* To get the right offset */
04610           *errorcodeptr = ERR28;
04611           goto FAILED;
04612           }
04613 
04614         /* Read the name, but also get it as a number if it's all digits */
04615 
04616         recno = 0;
04617         name = ++ptr;
04618         while ((cd->ctypes[*ptr] & ctype_word) != 0)
04619           {
04620           if (recno >= 0)
04621             recno = ((digitab[*ptr] & ctype_digit) != 0)?
04622               recno * 10 + *ptr - CHAR_0 : -1;
04623           ptr++;
04624           }
04625         namelen = ptr - name;
04626 
04627         if ((terminator > 0 && *ptr++ != terminator) ||
04628             *ptr++ != CHAR_RIGHT_PARENTHESIS)
04629           {
04630           ptr--;      /* Error offset */
04631           *errorcodeptr = ERR26;
04632           goto FAILED;
04633           }
04634 
04635         /* Do no further checking in the pre-compile phase. */
04636 
04637         if (lengthptr != NULL) break;
04638 
04639         /* In the real compile we do the work of looking for the actual
04640         reference. If the string started with "+" or "-" we require the rest to
04641         be digits, in which case recno will be set. */
04642 
04643         if (refsign > 0)
04644           {
04645           if (recno <= 0)
04646             {
04647             *errorcodeptr = ERR58;
04648             goto FAILED;
04649             }
04650           recno = (refsign == CHAR_MINUS)?
04651             cd->bracount - recno + 1 : recno +cd->bracount;
04652           if (recno <= 0 || recno > cd->final_bracount)
04653             {
04654             *errorcodeptr = ERR15;
04655             goto FAILED;
04656             }
04657           PUT2(code, 2+LINK_SIZE, recno);
04658           break;
04659           }
04660 
04661         /* Otherwise (did not start with "+" or "-"), start by looking for the
04662         name. If we find a name, add one to the opcode to change OP_CREF or
04663         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
04664         except they record that the reference was originally to a name. The
04665         information is used to check duplicate names. */
04666 
04667         slot = cd->name_table;
04668         for (i = 0; i < cd->names_found; i++)
04669           {
04670           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
04671           slot += cd->name_entry_size;
04672           }
04673 
04674         /* Found a previous named subpattern */
04675 
04676         if (i < cd->names_found)
04677           {
04678           recno = GET2(slot, 0);
04679           PUT2(code, 2+LINK_SIZE, recno);
04680           code[1+LINK_SIZE]++;
04681           }
04682 
04683         /* Search the pattern for a forward reference */
04684 
04685         else if ((i = find_parens(cd, name, namelen,
04686                         (options & PCRE_EXTENDED) != 0)) > 0)
04687           {
04688           PUT2(code, 2+LINK_SIZE, i);
04689           code[1+LINK_SIZE]++;
04690           }
04691 
04692         /* If terminator == 0 it means that the name followed directly after
04693         the opening parenthesis [e.g. (?(abc)...] and in this case there are
04694         some further alternatives to try. For the cases where terminator != 0
04695         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
04696         now checked all the possibilities, so give an error. */
04697 
04698         else if (terminator != 0)
04699           {
04700           *errorcodeptr = ERR15;
04701           goto FAILED;
04702           }
04703 
04704         /* Check for (?(R) for recursion. Allow digits after R to specify a
04705         specific group number. */
04706 
04707         else if (*name == CHAR_R)
04708           {
04709           recno = 0;
04710           for (i = 1; i < namelen; i++)
04711             {
04712             if ((digitab[name[i]] & ctype_digit) == 0)
04713               {
04714               *errorcodeptr = ERR15;
04715               goto FAILED;
04716               }
04717             recno = recno * 10 + name[i] - CHAR_0;
04718             }
04719           if (recno == 0) recno = RREF_ANY;
04720           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
04721           PUT2(code, 2+LINK_SIZE, recno);
04722           }
04723 
04724         /* Similarly, check for the (?(DEFINE) "condition", which is always
04725         false. */
04726 
04727         else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
04728           {
04729           code[1+LINK_SIZE] = OP_DEF;
04730           skipbytes = 1;
04731           }
04732 
04733         /* Check for the "name" actually being a subpattern number. We are
04734         in the second pass here, so final_bracount is set. */
04735 
04736         else if (recno > 0 && recno <= cd->final_bracount)
04737           {
04738           PUT2(code, 2+LINK_SIZE, recno);
04739           }
04740 
04741         /* Either an unidentified subpattern, or a reference to (?(0) */
04742 
04743         else
04744           {
04745           *errorcodeptr = (recno == 0)? ERR35: ERR15;
04746           goto FAILED;
04747           }
04748         break;
04749 
04750 
04751         /* ------------------------------------------------------------ */
04752         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
04753         bravalue = OP_ASSERT;
04754         ptr++;
04755         break;
04756 
04757 
04758         /* ------------------------------------------------------------ */
04759         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
04760         ptr++;
04761         if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
04762           {
04763           *code++ = OP_FAIL;
04764           previous = NULL;
04765           continue;
04766           }
04767         bravalue = OP_ASSERT_NOT;
04768         break;
04769 
04770 
04771         /* ------------------------------------------------------------ */
04772         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
04773         switch (ptr[1])
04774           {
04775           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
04776           bravalue = OP_ASSERTBACK;
04777           ptr += 2;
04778           break;
04779 
04780           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
04781           bravalue = OP_ASSERTBACK_NOT;
04782           ptr += 2;
04783           break;
04784 
04785           default:                /* Could be name define, else bad */
04786           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
04787           ptr++;                  /* Correct offset for error */
04788           *errorcodeptr = ERR24;
04789           goto FAILED;
04790           }
04791         break;
04792 
04793 
04794         /* ------------------------------------------------------------ */
04795         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
04796         bravalue = OP_ONCE;
04797         ptr++;
04798         break;
04799 
04800 
04801         /* ------------------------------------------------------------ */
04802         case CHAR_C:                 /* Callout - may be followed by digits; */
04803         previous_callout = code;  /* Save for later completion */
04804         after_manual_callout = 1; /* Skip one item before completing */
04805         *code++ = OP_CALLOUT;
04806           {
04807           int n = 0;
04808           while ((digitab[*(++ptr)] & ctype_digit) != 0)
04809             n = n * 10 + *ptr - CHAR_0;
04810           if (*ptr != CHAR_RIGHT_PARENTHESIS)
04811             {
04812             *errorcodeptr = ERR39;
04813             goto FAILED;
04814             }
04815           if (n > 255)
04816             {
04817             *errorcodeptr = ERR38;
04818             goto FAILED;
04819             }
04820           *code++ = n;
04821           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
04822           PUT(code, LINK_SIZE, 0);                    /* Default length */
04823           code += 2 * LINK_SIZE;
04824           }
04825         previous = NULL;
04826         continue;
04827 
04828 
04829         /* ------------------------------------------------------------ */
04830         case CHAR_P:              /* Python-style named subpattern handling */
04831         if (*(++ptr) == CHAR_EQUALS_SIGN ||
04832             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
04833           {
04834           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
04835           terminator = CHAR_RIGHT_PARENTHESIS;
04836           goto NAMED_REF_OR_RECURSE;
04837           }
04838         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
04839           {
04840           *errorcodeptr = ERR41;
04841           goto FAILED;
04842           }
04843         /* Fall through to handle (?P< as (?< is handled */
04844 
04845 
04846         /* ------------------------------------------------------------ */
04847         DEFINE_NAME:    /* Come here from (?< handling */
04848         case CHAR_APOSTROPHE:
04849           {
04850           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
04851             CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
04852           name = ++ptr;
04853 
04854           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
04855           namelen = ptr - name;
04856 
04857           /* In the pre-compile phase, just do a syntax check. */
04858 
04859           if (lengthptr != NULL)
04860             {
04861             if (*ptr != terminator)
04862               {
04863               *errorcodeptr = ERR42;
04864               goto FAILED;
04865               }
04866             if (cd->names_found >= MAX_NAME_COUNT)
04867               {
04868               *errorcodeptr = ERR49;
04869               goto FAILED;
04870               }
04871             if (namelen + 3 > cd->name_entry_size)
04872               {
04873               cd->name_entry_size = namelen + 3;
04874               if (namelen > MAX_NAME_SIZE)
04875                 {
04876                 *errorcodeptr = ERR48;
04877                 goto FAILED;
04878                 }
04879               }
04880             }
04881 
04882           /* In the real compile, create the entry in the table, maintaining
04883           alphabetical order. Duplicate names for different numbers are
04884           permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
04885           number are always OK. (An existing number can be re-used if (?|
04886           appears in the pattern.) In either event, a duplicate name results in
04887           a duplicate entry in the table, even if the number is the same. This
04888           is because the number of names, and hence the table size, is computed
04889           in the pre-compile, and it affects various numbers and pointers which
04890           would all have to be modified, and the compiled code moved down, if
04891           duplicates with the same number were omitted from the table. This
04892           doesn't seem worth the hassle. However, *different* names for the
04893           same number are not permitted. */
04894 
04895           else
04896             {
04897             BOOL dupname = FALSE;
04898             slot = cd->name_table;
04899 
04900             for (i = 0; i < cd->names_found; i++)
04901               {
04902               int crc = memcmp(name, slot+2, namelen);
04903               if (crc == 0)
04904                 {
04905                 if (slot[2+namelen] == 0)
04906                   {
04907                   if (GET2(slot, 0) != cd->bracount + 1 &&
04908                       (options & PCRE_DUPNAMES) == 0)
04909                     {
04910                     *errorcodeptr = ERR43;
04911                     goto FAILED;
04912                     }
04913                   else dupname = TRUE;
04914                   }
04915                 else crc = -1;      /* Current name is a substring */
04916                 }
04917 
04918               /* Make space in the table and break the loop for an earlier
04919               name. For a duplicate or later name, carry on. We do this for
04920               duplicates so that in the simple case (when (?| is not used) they
04921               are in order of their numbers. */
04922 
04923               if (crc < 0)
04924                 {
04925                 memmove(slot + cd->name_entry_size, slot,
04926                   (cd->names_found - i) * cd->name_entry_size);
04927                 break;
04928                 }
04929 
04930               /* Continue the loop for a later or duplicate name */
04931 
04932               slot += cd->name_entry_size;
04933               }
04934 
04935             /* For non-duplicate names, check for a duplicate number before
04936             adding the new name. */
04937 
04938             if (!dupname)
04939               {
04940               uschar *cslot = cd->name_table;
04941               for (i = 0; i < cd->names_found; i++)
04942                 {
04943                 if (cslot != slot)
04944                   {
04945                   if (GET2(cslot, 0) == cd->bracount + 1)
04946                     {
04947                     *errorcodeptr = ERR65;
04948                     goto FAILED;
04949                     }
04950                   }
04951                 else i--;
04952                 cslot += cd->name_entry_size;
04953                 }
04954               }
04955 
04956             PUT2(slot, 0, cd->bracount + 1);
04957             memcpy(slot + 2, name, namelen);
04958             slot[2+namelen] = 0;
04959             }
04960           }
04961 
04962         /* In both pre-compile and compile, count the number of names we've
04963         encountered. */
04964 
04965         cd->names_found++;
04966         ptr++;                    /* Move past > or ' */
04967         goto NUMBERED_GROUP;
04968 
04969 
04970         /* ------------------------------------------------------------ */
04971         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
04972         terminator = CHAR_RIGHT_PARENTHESIS;
04973         is_recurse = TRUE;
04974         /* Fall through */
04975 
04976         /* We come here from the Python syntax above that handles both
04977         references (?P=name) and recursion (?P>name), as well as falling
04978         through from the Perl recursion syntax (?&name). We also come here from
04979         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
04980         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
04981 
04982         NAMED_REF_OR_RECURSE:
04983         name = ++ptr;
04984         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
04985         namelen = ptr - name;
04986 
04987         /* In the pre-compile phase, do a syntax check and set a dummy
04988         reference number. */
04989 
04990         if (lengthptr != NULL)
04991           {
04992           if (namelen == 0)
04993             {
04994             *errorcodeptr = ERR62;
04995             goto FAILED;
04996             }
04997           if (*ptr != terminator)
04998             {
04999             *errorcodeptr = ERR42;
05000             goto FAILED;
05001             }
05002           if (namelen > MAX_NAME_SIZE)
05003             {
05004             *errorcodeptr = ERR48;
05005             goto FAILED;
05006             }
05007           recno = 0;
05008           }
05009 
05010         /* In the real compile, seek the name in the table. We check the name
05011         first, and then check that we have reached the end of the name in the
05012         table. That way, if the name is longer than any in the table,
05013         the comparison will fail without reading beyond the table entry. */
05014 
05015         else
05016           {
05017           slot = cd->name_table;
05018           for (i = 0; i < cd->names_found; i++)
05019             {
05020             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
05021                 slot[2+namelen] == 0)
05022               break;
05023             slot += cd->name_entry_size;
05024             }
05025 
05026           if (i < cd->names_found)         /* Back reference */
05027             {
05028             recno = GET2(slot, 0);
05029             }
05030           else if ((recno =                /* Forward back reference */
05031                     find_parens(cd, name, namelen,
05032                       (options & PCRE_EXTENDED) != 0)) <= 0)
05033             {
05034             *errorcodeptr = ERR15;
05035             goto FAILED;
05036             }
05037           }
05038 
05039         /* In both phases, we can now go to the code that handles numerical
05040         recursion or backreferences. */
05041 
05042         if (is_recurse) goto HANDLE_RECURSION;
05043           else goto HANDLE_REFERENCE;
05044 
05045 
05046         /* ------------------------------------------------------------ */
05047         case CHAR_R:              /* Recursion */
05048         ptr++;                    /* Same as (?0)      */
05049         /* Fall through */
05050 
05051 
05052         /* ------------------------------------------------------------ */
05053         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
05054         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
05055         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
05056           {
05057           const uschar *called;
05058           terminator = CHAR_RIGHT_PARENTHESIS;
05059 
05060           /* Come here from the \g<...> and \g'...' code (Oniguruma
05061           compatibility). However, the syntax has been checked to ensure that
05062           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
05063           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
05064           ever be taken. */
05065 
05066           HANDLE_NUMERICAL_RECURSION:
05067 
05068           if ((refsign = *ptr) == CHAR_PLUS)
05069             {
05070             ptr++;
05071             if ((digitab[*ptr] & ctype_digit) == 0)
05072               {
05073               *errorcodeptr = ERR63;
05074               goto FAILED;
05075               }
05076             }
05077           else if (refsign == CHAR_MINUS)
05078             {
05079             if ((digitab[ptr[1]] & ctype_digit) == 0)
05080               goto OTHER_CHAR_AFTER_QUERY;
05081             ptr++;
05082             }
05083 
05084           recno = 0;
05085           while((digitab[*ptr] & ctype_digit) != 0)
05086             recno = recno * 10 + *ptr++ - CHAR_0;
05087 
05088           if (*ptr != terminator)
05089             {
05090             *errorcodeptr = ERR29;
05091             goto FAILED;
05092             }
05093 
05094           if (refsign == CHAR_MINUS)
05095             {
05096             if (recno == 0)
05097               {
05098               *errorcodeptr = ERR58;
05099               goto FAILED;
05100               }
05101             recno = cd->bracount - recno + 1;
05102             if (recno <= 0)
05103               {
05104               *errorcodeptr = ERR15;
05105               goto FAILED;
05106               }
05107             }
05108           else if (refsign == CHAR_PLUS)
05109             {
05110             if (recno == 0)
05111               {
05112               *errorcodeptr = ERR58;
05113               goto FAILED;
05114               }
05115             recno += cd->bracount;
05116             }
05117 
05118           /* Come here from code above that handles a named recursion */
05119 
05120           HANDLE_RECURSION:
05121 
05122           previous = code;
05123           called = cd->start_code;
05124 
05125           /* When we are actually compiling, find the bracket that is being
05126           referenced. Temporarily end the regex in case it doesn't exist before
05127           this point. If we end up with a forward reference, first check that
05128           the bracket does occur later so we can give the error (and position)
05129           now. Then remember this forward reference in the workspace so it can
05130           be filled in at the end. */
05131 
05132           if (lengthptr == NULL)
05133             {
05134             *code = OP_END;
05135             if (recno != 0)
05136               called = _pcre_find_bracket(cd->start_code, utf8, recno);
05137 
05138             /* Forward reference */
05139 
05140             if (called == NULL)
05141               {
05142               if (find_parens(cd, NULL, recno,
05143                     (options & PCRE_EXTENDED) != 0) < 0)
05144                 {
05145                 *errorcodeptr = ERR15;
05146                 goto FAILED;
05147                 }
05148               called = cd->start_code + recno;
05149               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
05150               }
05151 
05152             /* If not a forward reference, and the subpattern is still open,
05153             this is a recursive call. We check to see if this is a left
05154             recursion that could loop for ever, and diagnose that case. */
05155 
05156             else if (GET(called, 1) == 0 &&
05157                      could_be_empty(called, code, bcptr, utf8))
05158               {
05159               *errorcodeptr = ERR40;
05160               goto FAILED;
05161               }
05162             }
05163 
05164           /* Insert the recursion/subroutine item, automatically wrapped inside
05165           "once" brackets. Set up a "previous group" length so that a
05166           subsequent quantifier will work. */
05167 
05168           *code = OP_ONCE;
05169           PUT(code, 1, 2 + 2*LINK_SIZE);
05170           code += 1 + LINK_SIZE;
05171 
05172           *code = OP_RECURSE;
05173           PUT(code, 1, called - cd->start_code);
05174           code += 1 + LINK_SIZE;
05175 
05176           *code = OP_KET;
05177           PUT(code, 1, 2 + 2*LINK_SIZE);
05178           code += 1 + LINK_SIZE;
05179 
05180           length_prevgroup = 3 + 3*LINK_SIZE;
05181           }
05182 
05183         /* Can't determine a first byte now */
05184 
05185         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
05186         continue;
05187 
05188 
05189         /* ------------------------------------------------------------ */
05190         default:              /* Other characters: check option setting */
05191         OTHER_CHAR_AFTER_QUERY:
05192         set = unset = 0;
05193         optset = &set;
05194 
05195         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
05196           {
05197           switch (*ptr++)
05198             {
05199             case CHAR_MINUS: optset = &unset; break;
05200 
05201             case CHAR_J:    /* Record that it changed in the external options */
05202             *optset |= PCRE_DUPNAMES;
05203             cd->external_flags |= PCRE_JCHANGED;
05204             break;
05205 
05206             case CHAR_i: *optset |= PCRE_CASELESS; break;
05207             case CHAR_m: *optset |= PCRE_MULTILINE; break;
05208             case CHAR_s: *optset |= PCRE_DOTALL; break;
05209             case CHAR_x: *optset |= PCRE_EXTENDED; break;
05210             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
05211             case CHAR_X: *optset |= PCRE_EXTRA; break;
05212 
05213             default:  *errorcodeptr = ERR12;
05214                       ptr--;    /* Correct the offset */
05215                       goto FAILED;
05216             }
05217           }
05218 
05219         /* Set up the changed option bits, but don't change anything yet. */
05220 
05221         newoptions = (options | set) & (~unset);
05222 
05223         /* If the options ended with ')' this is not the start of a nested
05224         group with option changes, so the options change at this level. If this
05225         item is right at the start of the pattern, the options can be
05226         abstracted and made external in the pre-compile phase, and ignored in
05227         the compile phase. This can be helpful when matching -- for instance in
05228         caseless checking of required bytes.
05229 
05230         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
05231         definitely *not* at the start of the pattern because something has been
05232         compiled. In the pre-compile phase, however, the code pointer can have
05233         that value after the start, because it gets reset as code is discarded
05234         during the pre-compile. However, this can happen only at top level - if
05235         we are within parentheses, the starting BRA will still be present. At
05236         any parenthesis level, the length value can be used to test if anything
05237         has been compiled at that level. Thus, a test for both these conditions
05238         is necessary to ensure we correctly detect the start of the pattern in
05239         both phases.
05240 
05241         If we are not at the pattern start, compile code to change the ims
05242         options if this setting actually changes any of them, and reset the
05243         greedy defaults and the case value for firstbyte and reqbyte. */
05244 
05245         if (*ptr == CHAR_RIGHT_PARENTHESIS)
05246           {
05247           if (code == cd->start_code + 1 + LINK_SIZE &&
05248                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
05249             {
05250             cd->external_options = newoptions;
05251             }
05252          else
05253             {
05254             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
05255               {
05256               *code++ = OP_OPT;
05257               *code++ = newoptions & PCRE_IMS;
05258               }
05259             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
05260             greedy_non_default = greedy_default ^ 1;
05261             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
05262             }
05263 
05264           /* Change options at this level, and pass them back for use
05265           in subsequent branches. When not at the start of the pattern, this
05266           information is also necessary so that a resetting item can be
05267           compiled at the end of a group (if we are in a group). */
05268 
05269           *optionsptr = options = newoptions;
05270           previous = NULL;       /* This item can't be repeated */
05271           continue;              /* It is complete */
05272           }
05273 
05274         /* If the options ended with ':' we are heading into a nested group
05275         with possible change of options. Such groups are non-capturing and are
05276         not assertions of any kind. All we need to do is skip over the ':';
05277         the newoptions value is handled below. */
05278 
05279         bravalue = OP_BRA;
05280         ptr++;
05281         }     /* End of switch for character following (? */
05282       }       /* End of (? handling */
05283 
05284     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
05285     all unadorned brackets become non-capturing and behave like (?:...)
05286     brackets. */
05287 
05288     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
05289       {
05290       bravalue = OP_BRA;
05291       }
05292 
05293     /* Else we have a capturing group. */
05294 
05295     else
05296       {
05297       NUMBERED_GROUP:
05298       cd->bracount += 1;
05299       PUT2(code, 1+LINK_SIZE, cd->bracount);
05300       skipbytes = 2;
05301       }
05302 
05303     /* Process nested bracketed regex. Assertions may not be repeated, but
05304     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
05305     non-register variable in order to be able to pass its address because some
05306     compilers complain otherwise. Pass in a new setting for the ims options if
05307     they have changed. */
05308 
05309     previous = (bravalue >= OP_ONCE)? code : NULL;
05310     *code = bravalue;
05311     tempcode = code;
05312     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
05313     length_prevgroup = 0;              /* Initialize for pre-compile phase */
05314 
05315     if (!compile_regex(
05316          newoptions,                   /* The complete new option state */
05317          options & PCRE_IMS,           /* The previous ims option state */
05318          &tempcode,                    /* Where to put code (updated) */
05319          &ptr,                         /* Input pointer (updated) */
05320          errorcodeptr,                 /* Where to put an error message */
05321          (bravalue == OP_ASSERTBACK ||
05322           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
05323          reset_bracount,               /* True if (?| group */
05324          skipbytes,                    /* Skip over bracket number */
05325          &subfirstbyte,                /* For possible first char */
05326          &subreqbyte,                  /* For possible last char */
05327          bcptr,                        /* Current branch chain */
05328          cd,                           /* Tables block */
05329          (lengthptr == NULL)? NULL :   /* Actual compile phase */
05330            &length_prevgroup           /* Pre-compile phase */
05331          ))
05332       goto FAILED;
05333 
05334     /* At the end of compiling, code is still pointing to the start of the
05335     group, while tempcode has been updated to point past the end of the group
05336     and any option resetting that may follow it. The pattern pointer (ptr)
05337     is on the bracket. */
05338 
05339     /* If this is a conditional bracket, check that there are no more than
05340     two branches in the group, or just one if it's a DEFINE group. We do this
05341     in the real compile phase, not in the pre-pass, where the whole group may
05342     not be available. */
05343 
05344     if (bravalue == OP_COND && lengthptr == NULL)
05345       {
05346       uschar *tc = code;
05347       int condcount = 0;
05348 
05349       do {
05350          condcount++;
05351          tc += GET(tc,1);
05352          }
05353       while (*tc != OP_KET);
05354 
05355       /* A DEFINE group is never obeyed inline (the "condition" is always
05356       false). It must have only one branch. */
05357 
05358       if (code[LINK_SIZE+1] == OP_DEF)
05359         {
05360         if (condcount > 1)
05361           {
05362           *errorcodeptr = ERR54;
05363           goto FAILED;
05364           }
05365         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
05366         }
05367 
05368       /* A "normal" conditional group. If there is just one branch, we must not
05369       make use of its firstbyte or reqbyte, because this is equivalent to an
05370       empty second branch. */
05371 
05372       else
05373         {
05374         if (condcount > 2)
05375           {
05376           *errorcodeptr = ERR27;
05377           goto FAILED;
05378           }
05379         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
05380         }
05381       }
05382 
05383     /* Error if hit end of pattern */
05384 
05385     if (*ptr != CHAR_RIGHT_PARENTHESIS)
05386       {
05387       *errorcodeptr = ERR14;
05388       goto FAILED;
05389       }
05390 
05391     /* In the pre-compile phase, update the length by the length of the group,
05392     less the brackets at either end. Then reduce the compiled code to just a
05393     set of non-capturing brackets so that it doesn't use much memory if it is
05394     duplicated by a quantifier.*/
05395 
05396     if (lengthptr != NULL)
05397       {
05398       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
05399         {
05400         *errorcodeptr = ERR20;
05401         goto FAILED;
05402         }
05403       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
05404       *code++ = OP_BRA;
05405       PUTINC(code, 0, 1 + LINK_SIZE);
05406       *code++ = OP_KET;
05407       PUTINC(code, 0, 1 + LINK_SIZE);
05408       break;    /* No need to waste time with special character handling */
05409       }
05410 
05411     /* Otherwise update the main code pointer to the end of the group. */
05412 
05413     code = tempcode;
05414 
05415     /* For a DEFINE group, required and first character settings are not
05416     relevant. */
05417 
05418     if (bravalue == OP_DEF) break;
05419 
05420     /* Handle updating of the required and first characters for other types of
05421     group. Update for normal brackets of all kinds, and conditions with two
05422     branches (see code above). If the bracket is followed by a quantifier with
05423     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
05424     zerofirstbyte outside the main loop so that they can be accessed for the
05425     back off. */
05426 
05427     zeroreqbyte = reqbyte;
05428     zerofirstbyte = firstbyte;
05429     groupsetfirstbyte = FALSE;
05430 
05431     if (bravalue >= OP_ONCE)
05432       {
05433       /* If we have not yet set a firstbyte in this branch, take it from the
05434       subpattern, remembering that it was set here so that a repeat of more
05435       than one can replicate it as reqbyte if necessary. If the subpattern has
05436       no firstbyte, set "none" for the whole branch. In both cases, a zero
05437       repeat forces firstbyte to "none". */
05438 
05439       if (firstbyte == REQ_UNSET)
05440         {
05441         if (subfirstbyte >= 0)
05442           {
05443           firstbyte = subfirstbyte;
05444           groupsetfirstbyte = TRUE;
05445           }
05446         else firstbyte = REQ_NONE;
05447         zerofirstbyte = REQ_NONE;
05448         }
05449 
05450       /* If firstbyte was previously set, convert the subpattern's firstbyte
05451       into reqbyte if there wasn't one, using the vary flag that was in
05452       existence beforehand. */
05453 
05454       else if (subfirstbyte >= 0 && subreqbyte < 0)
05455         subreqbyte = subfirstbyte | tempreqvary;
05456 
05457       /* If the subpattern set a required byte (or set a first byte that isn't
05458       really the first byte - see above), set it. */
05459 
05460       if (subreqbyte >= 0) reqbyte = subreqbyte;
05461       }
05462 
05463     /* For a forward assertion, we take the reqbyte, if set. This can be
05464     helpful if the pattern that follows the assertion doesn't set a different
05465     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
05466     for an assertion, however, because it leads to an incorrect effect for patterns
05467     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
05468     of a firstbyte. This is overcome by a scan at the end if there's no
05469     firstbyte, looking for an asserted first char. */
05470 
05471     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
05472     break;     /* End of processing '(' */
05473 
05474 
05475     /* ===================================================================*/
05476     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
05477     are arranged to be the negation of the corresponding OP_values. For the
05478     back references, the values are ESC_REF plus the reference number. Only
05479     back references and those types that consume a character may be repeated.
05480     We can test for values between ESC_b and ESC_Z for the latter; this may
05481     have to change if any new ones are ever created. */
05482 
05483     case CHAR_BACKSLASH:
05484     tempptr = ptr;
05485     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
05486     if (*errorcodeptr != 0) goto FAILED;
05487 
05488     if (c < 0)
05489       {
05490       if (-c == ESC_Q)            /* Handle start of quoted string */
05491         {
05492         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
05493           ptr += 2;               /* avoid empty string */
05494             else inescq = TRUE;
05495         continue;
05496         }
05497 
05498       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
05499 
05500       /* For metasequences that actually match a character, we disable the
05501       setting of a first character if it hasn't already been set. */
05502 
05503       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
05504         firstbyte = REQ_NONE;
05505 
05506       /* Set values to reset to if this is followed by a zero repeat. */
05507 
05508       zerofirstbyte = firstbyte;
05509       zeroreqbyte = reqbyte;
05510 
05511       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
05512       is a subroutine call by number (Oniguruma syntax). In fact, the value
05513       -ESC_g is returned only for these cases. So we don't need to check for <
05514       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
05515       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
05516       that is a synonym for a named back reference). */
05517 
05518       if (-c == ESC_g)
05519         {
05520         const uschar *p;
05521         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
05522         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
05523           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
05524 
05525         /* These two statements stop the compiler from warning about possibly
05526         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
05527         fact, because we actually check for a number below, the paths that
05528         would actually be in error are never taken. */
05529 
05530         skipbytes = 0;
05531         reset_bracount = FALSE;
05532 
05533         /* Test for a name */
05534 
05535         if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
05536           {
05537           BOOL isnumber = TRUE;
05538           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
05539             {
05540             if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
05541             if ((cd->ctypes[*p] & ctype_word) == 0) break;
05542             }
05543           if (*p != terminator)
05544             {
05545             *errorcodeptr = ERR57;
05546             break;
05547             }
05548           if (isnumber)
05549             {
05550             ptr++;
05551             goto HANDLE_NUMERICAL_RECURSION;
05552             }
05553           is_recurse = TRUE;
05554           goto NAMED_REF_OR_RECURSE;
05555           }
05556 
05557         /* Test a signed number in angle brackets or quotes. */
05558 
05559         p = ptr + 2;
05560         while ((digitab[*p] & ctype_digit) != 0) p++;
05561         if (*p != terminator)
05562           {
05563           *errorcodeptr = ERR57;
05564           break;
05565           }
05566         ptr++;
05567         goto HANDLE_NUMERICAL_RECURSION;
05568         }
05569 
05570       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
05571       We also support \k{name} (.NET syntax) */
05572 
05573       if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
05574           ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
05575         {
05576         is_recurse = FALSE;
05577         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
05578           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
05579           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
05580         goto NAMED_REF_OR_RECURSE;
05581         }
05582 
05583       /* Back references are handled specially; must disable firstbyte if
05584       not set to cope with cases like (?=(\w+))\1: which would otherwise set
05585       ':' later. */
05586 
05587       if (-c >= ESC_REF)
05588         {
05589         recno = -c - ESC_REF;
05590 
05591         HANDLE_REFERENCE:    /* Come here from named backref handling */
05592         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
05593         previous = code;
05594         *code++ = OP_REF;
05595         PUT2INC(code, 0, recno);
05596         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
05597         if (recno > cd->top_backref) cd->top_backref = recno;
05598         }
05599 
05600       /* So are Unicode property matches, if supported. */
05601 
05602 #ifdef SUPPORT_UCP
05603       else if (-c == ESC_P || -c == ESC_p)
05604         {
05605         BOOL negated;
05606         int pdata;
05607         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
05608         if (ptype < 0) goto FAILED;
05609         previous = code;
05610         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
05611         *code++ = ptype;
05612         *code++ = pdata;
05613         }
05614 #else
05615 
05616       /* If Unicode properties are not supported, \X, \P, and \p are not
05617       allowed. */
05618 
05619       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
05620         {
05621         *errorcodeptr = ERR45;
05622         goto FAILED;
05623         }
05624 #endif
05625 
05626       /* For the rest (including \X when Unicode properties are supported), we
05627       can obtain the OP value by negating the escape value. */
05628 
05629       else
05630         {
05631         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
05632         *code++ = -c;
05633         }
05634       continue;
05635       }
05636 
05637     /* We have a data character whose value is in c. In UTF-8 mode it may have
05638     a value > 127. We set its representation in the length/buffer, and then
05639     handle it as a data character. */
05640 
05641 #ifdef SUPPORT_UTF8
05642     if (utf8 && c > 127)
05643       mclength = _pcre_ord2utf8(c, mcbuffer);
05644     else
05645 #endif
05646 
05647      {
05648      mcbuffer[0] = c;
05649      mclength = 1;
05650      }
05651     goto ONE_CHAR;
05652 
05653 
05654     /* ===================================================================*/
05655     /* Handle a literal character. It is guaranteed not to be whitespace or #
05656     when the extended flag is set. If we are in UTF-8 mode, it may be a
05657     multi-byte literal character. */
05658 
05659     default:
05660     NORMAL_CHAR:
05661     mclength = 1;
05662     mcbuffer[0] = c;
05663 
05664 #ifdef SUPPORT_UTF8
05665     if (utf8 && c >= 0xc0)
05666       {
05667       while ((ptr[1] & 0xc0) == 0x80)
05668         mcbuffer[mclength++] = *(++ptr);
05669       }
05670 #endif
05671 
05672     /* At this point we have the character's bytes in mcbuffer, and the length
05673     in mclength. When not in UTF-8 mode, the length is always 1. */
05674 
05675     ONE_CHAR:
05676     previous = code;
05677     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
05678     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
05679 
05680     /* Remember if \r or \n were seen */
05681 
05682     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
05683       cd->external_flags |= PCRE_HASCRORLF;
05684 
05685     /* Set the first and required bytes appropriately. If no previous first
05686     byte, set it from this character, but revert to none on a zero repeat.
05687     Otherwise, leave the firstbyte value alone, and don't change it on a zero
05688     repeat. */
05689 
05690     if (firstbyte == REQ_UNSET)
05691       {
05692       zerofirstbyte = REQ_NONE;
05693       zeroreqbyte = reqbyte;
05694 
05695       /* If the character is more than one byte long, we can set firstbyte
05696       only if it is not to be matched caselessly. */
05697 
05698       if (mclength == 1 || req_caseopt == 0)
05699         {
05700         firstbyte = mcbuffer[0] | req_caseopt;
05701         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
05702         }
05703       else firstbyte = reqbyte = REQ_NONE;
05704       }
05705 
05706     /* firstbyte was previously set; we can set reqbyte only if the length is
05707     1 or the matching is caseful. */
05708 
05709     else
05710       {
05711       zerofirstbyte = firstbyte;
05712       zeroreqbyte = reqbyte;
05713       if (mclength == 1 || req_caseopt == 0)
05714         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
05715       }
05716 
05717     break;            /* End of literal character handling */
05718     }
05719   }                   /* end of big loop */
05720 
05721 
05722 /* Control never reaches here by falling through, only by a goto for all the
05723 error states. Pass back the position in the pattern so that it can be displayed
05724 to the user for diagnosing the error. */
05725 
05726 FAILED:
05727 *ptrptr = ptr;
05728 return FALSE;
05729 }
05730 
05731 
05732 
05733 
05734 /*************************************************
05735 *     Compile sequence of alternatives           *
05736 *************************************************/
05737 
05738 /* On entry, ptr is pointing past the bracket character, but on return it
05739 points to the closing bracket, or vertical bar, or end of string. The code
05740 variable is pointing at the byte into which the BRA operator has been stored.
05741 If the ims options are changed at the start (for a (?ims: group) or during any
05742 branch, we need to insert an OP_OPT item at the start of every following branch
05743 to ensure they get set correctly at run time, and also pass the new options
05744 into every subsequent branch compile.
05745 
05746 This function is used during the pre-compile phase when we are trying to find
05747 out the amount of memory needed, as well as during the real compile phase. The
05748 value of lengthptr distinguishes the two phases.
05749 
05750 Arguments:
05751   options        option bits, including any changes for this subpattern
05752   oldims         previous settings of ims option bits
05753   codeptr        -> the address of the current code pointer
05754   ptrptr         -> the address of the current pattern pointer
05755   errorcodeptr   -> pointer to error code variable
05756   lookbehind     TRUE if this is a lookbehind assertion
05757   reset_bracount TRUE to reset the count for each branch
05758   skipbytes      skip this many bytes at start (for brackets and OP_COND)
05759   firstbyteptr   place to put the first required character, or a negative number
05760   reqbyteptr     place to put the last required character, or a negative number
05761   bcptr          pointer to the chain of currently open branches
05762   cd             points to the data block with tables pointers etc.
05763   lengthptr      NULL during the real compile phase
05764                  points to length accumulator during pre-compile phase
05765 
05766 Returns:         TRUE on success
05767 */
05768 
05769 static BOOL
05770 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
05771   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
05772   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
05773   int *lengthptr)
05774 {
05775 const uschar *ptr = *ptrptr;
05776 uschar *code = *codeptr;
05777 uschar *last_branch = code;
05778 uschar *start_bracket = code;
05779 uschar *reverse_count = NULL;
05780 open_capitem capitem;
05781 int capnumber = 0;
05782 int firstbyte, reqbyte;
05783 int branchfirstbyte, branchreqbyte;
05784 int length;
05785 int orig_bracount;
05786 int max_bracount;
05787 branch_chain bc;
05788 
05789 bc.outer = bcptr;
05790 bc.current = code;
05791 
05792 firstbyte = reqbyte = REQ_UNSET;
05793 
05794 /* Accumulate the length for use in the pre-compile phase. Start with the
05795 length of the BRA and KET and any extra bytes that are required at the
05796 beginning. We accumulate in a local variable to save frequent testing of
05797 lengthptr for NULL. We cannot do this by looking at the value of code at the
05798 start and end of each alternative, because compiled items are discarded during
05799 the pre-compile phase so that the work space is not exceeded. */
05800 
05801 length = 2 + 2*LINK_SIZE + skipbytes;
05802 
05803 /* WARNING: If the above line is changed for any reason, you must also change
05804 the code that abstracts option settings at the start of the pattern and makes
05805 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
05806 pre-compile phase to find out whether anything has yet been compiled or not. */
05807 
05808 /* If this is a capturing subpattern, add to the chain of open capturing items
05809 so that we can detect them if (*ACCEPT) is encountered. */
05810 
05811 if (*code == OP_CBRA)
05812   {
05813   capnumber = GET2(code, 1 + LINK_SIZE);
05814   capitem.number = capnumber;
05815   capitem.next = cd->open_caps;
05816   cd->open_caps = &capitem;
05817   }
05818 
05819 /* Offset is set zero to mark that this bracket is still open */
05820 
05821 PUT(code, 1, 0);
05822 code += 1 + LINK_SIZE + skipbytes;
05823 
05824 /* Loop for each alternative branch */
05825 
05826 orig_bracount = max_bracount = cd->bracount;
05827 for (;;)
05828   {
05829   /* For a (?| group, reset the capturing bracket count so that each branch
05830   uses the same numbers. */
05831 
05832   if (reset_bracount) cd->bracount = orig_bracount;
05833 
05834   /* Handle a change of ims options at the start of the branch */
05835 
05836   if ((options & PCRE_IMS) != oldims)
05837     {
05838     *code++ = OP_OPT;
05839     *code++ = options & PCRE_IMS;
05840     length += 2;
05841     }
05842 
05843   /* Set up dummy OP_REVERSE if lookbehind assertion */
05844 
05845   if (lookbehind)
05846     {
05847     *code++ = OP_REVERSE;
05848     reverse_count = code;
05849     PUTINC(code, 0, 0);
05850     length += 1 + LINK_SIZE;
05851     }
05852 
05853   /* Now compile the branch; in the pre-compile phase its length gets added
05854   into the length. */
05855 
05856   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
05857         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
05858     {
05859     *ptrptr = ptr;
05860     return FALSE;
05861     }
05862 
05863   /* Keep the highest bracket count in case (?| was used and some branch
05864   has fewer than the rest. */
05865 
05866   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
05867 
05868   /* In the real compile phase, there is some post-processing to be done. */
05869 
05870   if (lengthptr == NULL)
05871     {
05872     /* If this is the first branch, the firstbyte and reqbyte values for the
05873     branch become the values for the regex. */
05874 
05875     if (*last_branch != OP_ALT)
05876       {
05877       firstbyte = branchfirstbyte;
05878       reqbyte = branchreqbyte;
05879       }
05880 
05881     /* If this is not the first branch, the first char and reqbyte have to
05882     match the values from all the previous branches, except that if the
05883     previous value for reqbyte didn't have REQ_VARY set, it can still match,
05884     and we set REQ_VARY for the regex. */
05885 
05886     else
05887       {
05888       /* If we previously had a firstbyte, but it doesn't match the new branch,
05889       we have to abandon the firstbyte for the regex, but if there was
05890       previously no reqbyte, it takes on the value of the old firstbyte. */
05891 
05892       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
05893         {
05894         if (reqbyte < 0) reqbyte = firstbyte;
05895         firstbyte = REQ_NONE;
05896         }
05897 
05898       /* If we (now or from before) have no firstbyte, a firstbyte from the
05899       branch becomes a reqbyte if there isn't a branch reqbyte. */
05900 
05901       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
05902           branchreqbyte = branchfirstbyte;
05903 
05904       /* Now ensure that the reqbytes match */
05905 
05906       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
05907         reqbyte = REQ_NONE;
05908       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
05909       }
05910 
05911     /* If lookbehind, check that this branch matches a fixed-length string, and
05912     put the length into the OP_REVERSE item. Temporarily mark the end of the
05913     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
05914     because there may be forward references that we can't check here. Set a
05915     flag to cause another lookbehind check at the end. Why not do it all at the
05916     end? Because common, erroneous checks are picked up here and the offset of
05917     the problem can be shown. */
05918 
05919     if (lookbehind)
05920       {
05921       int fixed_length;
05922       *code = OP_END;
05923       fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
05924       DPRINTF(("fixed length = %d\n", fixed_length));
05925       if (fixed_length == -3)
05926         {
05927         cd->check_lookbehind = TRUE;
05928         }
05929       else if (fixed_length < 0)
05930         {
05931         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
05932         *ptrptr = ptr;
05933         return FALSE;
05934         }
05935       else { PUT(reverse_count, 0, fixed_length); }
05936       }
05937     }
05938 
05939   /* Reached end of expression, either ')' or end of pattern. In the real
05940   compile phase, go back through the alternative branches and reverse the chain
05941   of offsets, with the field in the BRA item now becoming an offset to the
05942   first alternative. If there are no alternatives, it points to the end of the
05943   group. The length in the terminating ket is always the length of the whole
05944   bracketed item. If any of the ims options were changed inside the group,
05945   compile a resetting op-code following, except at the very end of the pattern.
05946   Return leaving the pointer at the terminating char. */
05947 
05948   if (*ptr != CHAR_VERTICAL_LINE)
05949     {
05950     if (lengthptr == NULL)
05951       {
05952       int branch_length = code - last_branch;
05953       do
05954         {
05955         int prev_length = GET(last_branch, 1);
05956         PUT(last_branch, 1, branch_length);
05957         branch_length = prev_length;
05958         last_branch -= branch_length;
05959         }
05960       while (branch_length > 0);
05961       }
05962 
05963     /* If it was a capturing subpattern, remove it from the chain. */
05964 
05965     if (capnumber > 0) cd->open_caps = cd->open_caps->next;
05966 
05967     /* Fill in the ket */
05968 
05969     *code = OP_KET;
05970     PUT(code, 1, code - start_bracket);
05971     code += 1 + LINK_SIZE;
05972 
05973     /* Resetting option if needed */
05974 
05975     if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
05976       {
05977       *code++ = OP_OPT;
05978       *code++ = oldims;
05979       length += 2;
05980       }
05981 
05982     /* Retain the highest bracket number, in case resetting was used. */
05983 
05984     cd->bracount = max_bracount;
05985 
05986     /* Set values to pass back */
05987 
05988     *codeptr = code;
05989     *ptrptr = ptr;
05990     *firstbyteptr = firstbyte;
05991     *reqbyteptr = reqbyte;
05992     if (lengthptr != NULL)
05993       {
05994       if (OFLOW_MAX - *lengthptr < length)
05995         {
05996         *errorcodeptr = ERR20;
05997         return FALSE;
05998         }
05999       *lengthptr += length;
06000       }
06001     return TRUE;
06002     }
06003 
06004   /* Another branch follows. In the pre-compile phase, we can move the code
06005   pointer back to where it was for the start of the first branch. (That is,
06006   pretend that each branch is the only one.)
06007 
06008   In the real compile phase, insert an ALT node. Its length field points back
06009   to the previous branch while the bracket remains open. At the end the chain
06010   is reversed. It's done like this so that the start of the bracket has a
06011   zero offset until it is closed, making it possible to detect recursion. */
06012 
06013   if (lengthptr != NULL)
06014     {
06015     code = *codeptr + 1 + LINK_SIZE + skipbytes;
06016     length += 1 + LINK_SIZE;
06017     }
06018   else
06019     {
06020     *code = OP_ALT;
06021     PUT(code, 1, code - last_branch);
06022     bc.current = last_branch = code;
06023     code += 1 + LINK_SIZE;
06024     }
06025 
06026   ptr++;
06027   }
06028 /* Control never reaches here */
06029 }
06030 
06031 
06032 
06033 
06034 /*************************************************
06035 *          Check for anchored expression         *
06036 *************************************************/
06037 
06038 /* Try to find out if this is an anchored regular expression. Consider each
06039 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
06040 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
06041 it's anchored. However, if this is a multiline pattern, then only OP_SOD
06042 counts, since OP_CIRC can match in the middle.
06043 
06044 We can also consider a regex to be anchored if OP_SOM starts all its branches.
06045 This is the code for \G, which means "match at start of match position, taking
06046 into account the match offset".
06047 
06048 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
06049 because that will try the rest of the pattern at all possible matching points,
06050 so there is no point trying again.... er ....
06051 
06052 .... except when the .* appears inside capturing parentheses, and there is a
06053 subsequent back reference to those parentheses. We haven't enough information
06054 to catch that case precisely.
06055 
06056 At first, the best we could do was to detect when .* was in capturing brackets
06057 and the highest back reference was greater than or equal to that level.
06058 However, by keeping a bitmap of the first 31 back references, we can catch some
06059 of the more common cases more precisely.
06060 
06061 Arguments:
06062   code           points to start of expression (the bracket)
06063   options        points to the options setting
06064   bracket_map    a bitmap of which brackets we are inside while testing; this
06065                   handles up to substring 31; after that we just have to take
06066                   the less precise approach
06067   backref_map    the back reference bitmap
06068 
06069 Returns:     TRUE or FALSE
06070 */
06071 
06072 static BOOL
06073 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
06074   unsigned int backref_map)
06075 {
06076 do {   /* One pass per alternative; any unanchored branch gives FALSE */
06077    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
06078      options, PCRE_MULTILINE, FALSE);
06079    register int op = *scode;
06080
06081    /* Non-capturing brackets: every alternative inside must be anchored */
06082
06083    if (op == OP_BRA)
06084      {
06085      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
06086      }
06087
06088    /* Capturing brackets. The map bit is built with unsigned arithmetic so
06089    that group 31 does not shift into the sign bit; groups >= 32 share bit 0. */
06090    else if (op == OP_CBRA)
06091      {
06092      int n = GET2(scode, 1+LINK_SIZE);
06093      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
06094      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
06095      }
06096
06097    /* Other brackets: checked recursively with the bracket map unchanged */
06098
06099    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
06100      {
06101      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
06102      }
06103
06104    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
06105    it isn't in brackets that are or may be referenced. */
06106
06107    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
06108              op == OP_TYPEPOSSTAR))
06109      {
06110      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
06111        return FALSE;
06112      }
06113
06114    /* Check for explicit anchoring: OP_SOD and OP_SOM always count; OP_CIRC
06115    counts only when PCRE_MULTILINE is not set in *options. */
06116    else if (op != OP_SOD && op != OP_SOM &&
06117            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
06118      return FALSE;
06119    code += GET(code, 1);   /* Follow the link field to the next item */
06120    }
06121 while (*code == OP_ALT);   /* Loop for each alternative */
06122 return TRUE;
06123 }
06124 
06125 
06126 
06127 /*************************************************
06128 *         Check for starting with ^ or .*        *
06129 *************************************************/
06130 
06131 /* This is called to find out if every branch starts with ^ or .* so that
06132 "first char" processing can be done to speed things up in multiline
06133 matching and for non-DOTALL patterns that start with .* (which must start at
06134 the beginning or after \n). As in the case of is_anchored() (see above), we
06135 have to take account of back references to capturing brackets that contain .*
06136 because in that case we can't make the assumption.
06137 
06138 Arguments:
06139   code           points to start of expression (the bracket)
06140   bracket_map    a bitmap of which brackets we are inside while testing; this
06141                   handles up to substring 31; after that we just have to take
06142                   the less precise approach
06143   backref_map    the back reference bitmap
06144 
06145 Returns:         TRUE or FALSE
06146 */
06147 
06148 static BOOL
06149 is_startline(const uschar *code, unsigned int bracket_map,
06150   unsigned int backref_map)
06151 {
06152 do {   /* One pass per alternative; any failing branch gives FALSE */
06153    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
06154      NULL, 0, FALSE);
06155    register int op = *scode;
06156
06157    /* If we are at the start of a conditional assertion group, *both* the
06158    conditional assertion *and* what follows the condition must satisfy the test
06159    for start of line. Other kinds of condition fail. Note that there may be an
06160    auto-callout at the start of a condition. */
06161
06162    if (op == OP_COND)
06163      {
06164      scode += 1 + LINK_SIZE;
06165      if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
06166      switch (*scode)
06167        {
06168        case OP_CREF:
06169        case OP_NCREF:
06170        case OP_RREF:
06171        case OP_NRREF:
06172        case OP_DEF:
06173        return FALSE;
06174
06175        default:     /* Assertion */
06176        if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
06177        do scode += GET(scode, 1); while (*scode == OP_ALT);  /* Skip branches */
06178        scode += 1 + LINK_SIZE;               /* ... and the item that ends them */
06179        break;
06180        }
06181      scode = first_significant_code(scode, NULL, 0, FALSE);
06182      op = *scode;   /* Continue the tests below with what follows the condition */
06183      }
06184
06185    /* Non-capturing brackets: every alternative inside must pass the test */
06186
06187    if (op == OP_BRA)
06188      {
06189      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
06190      }
06191
06192    /* Capturing brackets. The map bit is built with unsigned arithmetic so
06193    that group 31 does not shift into the sign bit; groups >= 32 share bit 0. */
06194    else if (op == OP_CBRA)
06195      {
06196      int n = GET2(scode, 1+LINK_SIZE);
06197      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
06198      if (!is_startline(scode, new_map, backref_map)) return FALSE;
06199      }
06200
06201    /* Other brackets: checked recursively with the bracket map unchanged */
06202
06203    else if (op == OP_ASSERT || op == OP_ONCE)
06204      {
06205      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
06206      }
06207
06208    /* .* means "start at start or after \n" if it isn't in brackets that
06209    may be referenced. */
06210
06211    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
06212      {
06213      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
06214      }
06215
06216    /* Check for explicit circumflex */
06217
06218    else if (op != OP_CIRC) return FALSE;
06219
06220    /* Move on to the next alternative */
06221
06222    code += GET(code, 1);
06223    }
06224 while (*code == OP_ALT);  /* Loop for each alternative */
06225 return TRUE;
06226 }
06227 
06228 
06229 
06230 /*************************************************
06231 *       Check for asserted fixed first char      *
06232 *************************************************/
06233 
06234 /* During compilation, the "first char" settings from forward assertions are
06235 discarded, because they can cause conflicts with actual literals that follow.
06236 However, if we end up without a first char setting for an unanchored pattern,
06237 it is worth scanning the regex to see if there is an initial asserted first
06238 char. If all branches start with the same asserted char, or with a bracket all
06239 of whose alternatives start with the same asserted char (recurse ad lib), then
06240 we return that char, otherwise -1.
06241 
06242 Arguments:
06243   code       points to start of expression (the bracket)
06244   options    pointer to the options (used to check casing changes)
06245   inassert   TRUE if in an assertion
06246 
06247 Returns:     -1 or the fixed first char
06248 */
06249 
06250 static int
06251 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
06252 {
06253 register int c = -1;   /* Char agreed by all branches so far; -1 = none yet */
06254 do {
06255    int d;
06256    const uschar *scode =
06257      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
06258    register int op = *scode;
06259
06260    switch(op)
06261      {
06262      default:             /* Any other leading opcode: no asserted first char */
06263      return -1;
06264      /* Groups: recurse; inassert is TRUE for the inner scan only for OP_ASSERT */
06265      case OP_BRA:
06266      case OP_CBRA:
06267      case OP_ASSERT:
06268      case OP_ONCE:
06269      case OP_COND:
06270      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
06271        return -1;
06272      if (c < 0) c = d; else if (c != d) return -1;   /* Branches must agree */
06273      break;
06274      /* OP_EXACT: advance by 2 (presumably over the repeat count) so scode[1] works */
06275      case OP_EXACT:       /* Fall through */
06276      scode += 2;
06277      /* Character opcodes: these only count when scanning inside an assertion */
06278      case OP_CHAR:
06279      case OP_CHARNC:
06280      case OP_PLUS:
06281      case OP_MINPLUS:
06282      case OP_POSPLUS:
06283      if (!inassert) return -1;
06284      if (c < 0)
06285        {
06286        c = scode[1];
06287        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
06288        }
06289      else if (c != scode[1]) return -1;  /* NOTE(review): c may hold REQ_CASELESS - confirm */
06290      break;
06291      }
06292
06293    code += GET(code, 1);   /* Follow the link field to the next item */
06294    }
06295 while (*code == OP_ALT);   /* Loop for each alternative */
06296 return c;
06297 }
06298 
06299 
06300 
06301 /*************************************************
06302 *        Compile a Regular Expression            *
06303 *************************************************/
06304 
06305 /* This function takes a string and returns a pointer to a block of store
06306 holding a compiled version of the expression. The original API for this
06307 function had no error code return variable; it is retained for backwards
06308 compatibility. The new function is given a new name.
06309 
06310 Arguments:
06311   pattern       the regular expression
06312   options       various option bits
06313   errorcodeptr  pointer to error code variable (pcre_compile2() only)
06314                   can be NULL if you don't want a code value
06315   errorptr      pointer to pointer to error text
06316   erroroffset   ptr offset in pattern where error was detected
06317   tables        pointer to character tables or NULL
06318 
06319 Returns:        pointer to compiled data block, or NULL on error,
06320                 with errorptr and erroroffset set
06321 */
06322 
06323 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
06324 pcre_compile(const char *pattern, int options, const char **errorptr,
06325   int *erroroffset, const unsigned char *tables)
06326 {   /* Forwards every argument to pcre_compile2(), passing NULL as its third */
06327 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
06328 }
06329 
06330 
06331 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
06332 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
06333   const char **errorptr, int *erroroffset, const unsigned char *tables)
06334 {
06335 real_pcre *re;
06336 int length = 1;  /* For final END opcode */
06337 int firstbyte, reqbyte, newline;
06338 int errorcode = 0;
06339 int skipatstart = 0;
06340 BOOL utf8 = (options & PCRE_UTF8) != 0;
06341 size_t size;
06342 uschar *code;
06343 const uschar *codestart;
06344 const uschar *ptr;
06345 compile_data compile_block;
06346 compile_data *cd = &compile_block;
06347 
06348 /* This space is used for "compiling" into during the first phase, when we are
06349 computing the amount of memory that is needed. Compiled items are thrown away
06350 as soon as possible, so that a fairly large buffer should be sufficient for
06351 this purpose. The same space is used in the second phase for remembering where
06352 to fill in forward references to subpatterns. */
06353 
06354 uschar cworkspace[COMPILE_WORK_SIZE];
06355 
06356 /* Set this early so that early errors get offset 0. */
06357 
06358 ptr = (const uschar *)pattern;
06359 
06360 /* We can't pass back an error message if errorptr is NULL; I guess the best we
06361 can do is just return NULL, but we can set a code value if there is a code
06362 pointer. */
06363 
06364 if (errorptr == NULL)
06365   {
06366   if (errorcodeptr != NULL) *errorcodeptr = 99;
06367   return NULL;
06368   }
06369 
06370 *errorptr = NULL;
06371 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
06372 
06373 /* However, we can give a message for this error */
06374 
06375 if (erroroffset == NULL)
06376   {
06377   errorcode = ERR16;
06378   goto PCRE_EARLY_ERROR_RETURN2;
06379   }
06380 
06381 *erroroffset = 0;
06382 
06383 /* Set up pointers to the individual character tables */
06384 
06385 if (tables == NULL) tables = _pcre_default_tables;
06386 cd->lcc = tables + lcc_offset;
06387 cd->fcc = tables + fcc_offset;
06388 cd->cbits = tables + cbits_offset;
06389 cd->ctypes = tables + ctypes_offset;
06390 
06391 /* Check that all undefined public option bits are zero */
06392 
06393 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
06394   {
06395   errorcode = ERR17;
06396   goto PCRE_EARLY_ERROR_RETURN;
06397   }
06398 
06399 /* Check for global one-time settings at the start of the pattern, and remember
06400 the offset for later. */
06401 
06402 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
06403        ptr[skipatstart+1] == CHAR_ASTERISK)
06404   {
06405   int newnl = 0;
06406   int newbsr = 0;
06407 
06408   if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
06409     { skipatstart += 7; options |= PCRE_UTF8; continue; }
06410 
06411   if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
06412     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
06413   else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
06414     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
06415   else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
06416     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
06417   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
06418     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
06419   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
06420     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
06421 
06422   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
06423     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
06424   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
06425     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
06426 
06427   if (newnl != 0)
06428     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
06429   else if (newbsr != 0)
06430     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
06431   else break;
06432   }
06433 
06434 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
06435 
06436 #ifdef SUPPORT_UTF8
06437 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
06438      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
06439   {
06440   errorcode = ERR44;
06441   goto PCRE_EARLY_ERROR_RETURN2;
06442   }
06443 #else
06444 if (utf8)
06445   {
06446   errorcode = ERR32;
06447   goto PCRE_EARLY_ERROR_RETURN;
06448   }
06449 #endif
06450 
06451 /* Check validity of \R options. */
06452 
06453 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
06454   {
06455   case 0:
06456   case PCRE_BSR_ANYCRLF:
06457   case PCRE_BSR_UNICODE:
06458   break;
06459   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
06460   }
06461 
06462 /* Handle different types of newline. The three bits give seven cases. The
06463 current code allows for fixed one- or two-byte sequences, plus "any" and
06464 "anycrlf". */
06465 
06466 switch (options & PCRE_NEWLINE_BITS)
06467   {
06468   case 0: newline = NEWLINE; break;   /* Build-time default */
06469   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
06470   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
06471   case PCRE_NEWLINE_CR+
06472        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
06473   case PCRE_NEWLINE_ANY: newline = -1; break;
06474   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
06475   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
06476   }
06477 
06478 if (newline == -2)
06479   {
06480   cd->nltype = NLTYPE_ANYCRLF;
06481   }
06482 else if (newline < 0)
06483   {
06484   cd->nltype = NLTYPE_ANY;
06485   }
06486 else
06487   {
06488   cd->nltype = NLTYPE_FIXED;
06489   if (newline > 255)
06490     {
06491     cd->nllen = 2;
06492     cd->nl[0] = (newline >> 8) & 255;
06493     cd->nl[1] = newline & 255;
06494     }
06495   else
06496     {
06497     cd->nllen = 1;
06498     cd->nl[0] = newline;
06499     }
06500   }
06501 
06502 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
06503 references to help in deciding whether (.*) can be treated as anchored or not.
06504 */
06505 
06506 cd->top_backref = 0;
06507 cd->backref_map = 0;
06508 
06509 /* Reflect pattern for debugging output */
06510 
06511 DPRINTF(("------------------------------------------------------------------\n"));
06512 DPRINTF(("%s\n", pattern));
06513 
06514 /* Pretend to compile the pattern while actually just accumulating the length
06515 of memory required. This behaviour is triggered by passing a non-NULL final
06516 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
06517 to compile parts of the pattern into; the compiled code is discarded when it is
06518 no longer needed, so hopefully this workspace will never overflow, though there
06519 is a test for its doing so. */
06520 
06521 cd->bracount = cd->final_bracount = 0;
06522 cd->names_found = 0;
06523 cd->name_entry_size = 0;
06524 cd->name_table = NULL;
06525 cd->start_workspace = cworkspace;
06526 cd->start_code = cworkspace;
06527 cd->hwm = cworkspace;
06528 cd->start_pattern = (const uschar *)pattern;
06529 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
06530 cd->req_varyopt = 0;
06531 cd->external_options = options;
06532 cd->external_flags = 0;
06533 cd->open_caps = NULL;
06534 
06535 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
06536 don't need to look at the result of the function here. The initial options have
06537 been put into the cd block so that they can be changed if an option setting is
06538 found within the regex right at the beginning. Bringing initial option settings
06539 outside can help speed up starting point checks. */
06540 
06541 ptr += skipatstart;
06542 code = cworkspace;
06543 *code = OP_BRA;
06544 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
06545   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
06546   &length);
06547 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
06548 
06549 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
06550   cd->hwm - cworkspace));
06551 
06552 if (length > MAX_PATTERN_SIZE)
06553   {
06554   errorcode = ERR20;
06555   goto PCRE_EARLY_ERROR_RETURN;
06556   }
06557 
06558 /* Compute the size of data block needed and get it, either from malloc or
06559 externally provided function. Integer overflow should no longer be possible
06560 because nowadays we limit the maximum value of cd->names_found and
06561 cd->name_entry_size. */
06562 
06563 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
06564 re = (real_pcre *)(pcre_malloc)(size);
06565 
06566 if (re == NULL)
06567   {
06568   errorcode = ERR21;
06569   goto PCRE_EARLY_ERROR_RETURN;
06570   }
06571 
06572 /* Put in the magic number, and save the sizes, initial options, internal
06573 flags, and character table pointer. NULL is used for the default character
06574 tables. The nullpad field is at the end; it's there to help in the case when a
06575 regex compiled on a system with 4-byte pointers is run on another with 8-byte
06576 pointers. */
06577 
06578 re->magic_number = MAGIC_NUMBER;
06579 re->size = size;
06580 re->options = cd->external_options;
06581 re->flags = cd->external_flags;
06582 re->dummy1 = 0;
06583 re->first_byte = 0;
06584 re->req_byte = 0;
06585 re->name_table_offset = sizeof(real_pcre);
06586 re->name_entry_size = cd->name_entry_size;
06587 re->name_count = cd->names_found;
06588 re->ref_count = 0;
06589 re->tables = (tables == _pcre_default_tables)? NULL : tables;
06590 re->nullpad = NULL;
06591 
06592 /* The starting points of the name/number translation table and of the code are
06593 passed around in the compile data block. The start/end pattern and initial
06594 options are already set from the pre-compile phase, as is the name_entry_size
06595 field. Reset the bracket count and the names_found field. Also reset the hwm
06596 field; this time it's used for remembering forward references to subpatterns.
06597 */
06598 
06599 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
06600 cd->bracount = 0;
06601 cd->names_found = 0;
06602 cd->name_table = (uschar *)re + re->name_table_offset;
06603 codestart = cd->name_table + re->name_entry_size * re->name_count;
06604 cd->start_code = codestart;
06605 cd->hwm = cworkspace;
06606 cd->req_varyopt = 0;
06607 cd->had_accept = FALSE;
06608 cd->check_lookbehind = FALSE;
06609 cd->open_caps = NULL;
06610 
06611 /* Set up a starting, non-extracting bracket, then compile the expression. On
06612 error, errorcode will be set non-zero, so we don't need to look at the result
06613 of the function here. */
06614 
06615 ptr = (const uschar *)pattern + skipatstart;
06616 code = (uschar *)codestart;
06617 *code = OP_BRA;
06618 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
06619   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
06620 re->top_bracket = cd->bracount;
06621 re->top_backref = cd->top_backref;
06622 re->flags = cd->external_flags;
06623 
06624 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
06625 
06626 /* If not reached end of pattern on success, there's an excess bracket. */
06627 
06628 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
06629 
06630 /* Fill in the terminating state and check for disastrous overflow, but
06631 if debugging, leave the test till after things are printed out. */
06632 
06633 *code++ = OP_END;
06634 
06635 #ifndef DEBUG
06636 if (code - codestart > length) errorcode = ERR23;
06637 #endif
06638 
06639 /* Fill in any forward references that are required. */
06640 
06641 while (errorcode == 0 && cd->hwm > cworkspace)
06642   {
06643   int offset, recno;
06644   const uschar *groupptr;
06645   cd->hwm -= LINK_SIZE;
06646   offset = GET(cd->hwm, 0);
06647   recno = GET(codestart, offset);
06648   groupptr = _pcre_find_bracket(codestart, utf8, recno);
06649   if (groupptr == NULL) errorcode = ERR53;
06650     else PUT(((uschar *)codestart), offset, groupptr - codestart);
06651   }
06652 
06653 /* Give an error if there's back reference to a non-existent capturing
06654 subpattern. */
06655 
06656 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
06657 
06658 /* If there were any lookbehind assertions that contained OP_RECURSE
06659 (recursions or subroutine calls), a flag is set for them to be checked here,
06660 because they may contain forward references. Actual recursions can't be fixed
06661 length, but subroutine calls can. It is done like this so that those without
06662 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
06663 exceptional ones forgo this. We scan the pattern to check that they are fixed
06664 length, and set their lengths. */
06665 
06666 if (cd->check_lookbehind)
06667   {
06668   uschar *cc = (uschar *)codestart;
06669 
06670   /* Loop, searching for OP_REVERSE items, and process those that do not have
06671   their length set. (Actually, it will also re-process any that have a length
06672   of zero, but that is a pathological case, and it does no harm.) When we find
06673   one, we temporarily terminate the branch it is in while we scan it. */
06674 
06675   for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
06676        cc != NULL;
06677        cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
06678     {
06679     if (GET(cc, 1) == 0)
06680       {
06681       int fixed_length;
06682       uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
06683       int end_op = *be;
06684       *be = OP_END;
06685       fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
06686       *be = end_op;
06687       DPRINTF(("fixed length = %d\n", fixed_length));
06688       if (fixed_length < 0)
06689         {
06690         errorcode = (fixed_length == -2)? ERR36 : ERR25;
06691         break;
06692         }
06693       PUT(cc, 1, fixed_length);
06694       }
06695     cc += 1 + LINK_SIZE;
06696     }
06697   }
06698 
06699 /* Failed to compile, or error while post-processing */
06700 
06701 if (errorcode != 0)
06702   {
06703   (pcre_free)(re);
06704   PCRE_EARLY_ERROR_RETURN:
06705   *erroroffset = ptr - (const uschar *)pattern;
06706   PCRE_EARLY_ERROR_RETURN2:
06707   *errorptr = find_error_text(errorcode);
06708   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
06709   return NULL;
06710   }
06711 
06712 /* If the anchored option was not passed, set the flag if we can determine that
06713 the pattern is anchored by virtue of ^ characters or \A or anything else (such
06714 as starting with .* when DOTALL is set).
06715 
06716 Otherwise, if we know what the first byte has to be, save it, because that
06717 speeds up unanchored matches no end. If not, see if we can set the
06718 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
06719 start with ^, and also when all branches start with .* for non-DOTALL matches.
06720 */
06721 
06722 if ((re->options & PCRE_ANCHORED) == 0)
06723   {
06724   int temp_options = re->options;   /* May get changed during these scans */
06725   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
06726     re->options |= PCRE_ANCHORED;
06727   else
06728     {
06729     if (firstbyte < 0)
06730       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
06731     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
06732       {
06733       int ch = firstbyte & 255;
06734       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
06735          cd->fcc[ch] == ch)? ch : firstbyte;
06736       re->flags |= PCRE_FIRSTSET;
06737       }
06738     else if (is_startline(codestart, 0, cd->backref_map))
06739       re->flags |= PCRE_STARTLINE;
06740     }
06741   }
06742 
06743 /* For an anchored pattern, we use the "required byte" only if it follows a
06744 variable length item in the regex. Remove the caseless flag for non-caseable
06745 bytes. */
06746 
06747 if (reqbyte >= 0 &&
06748      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
06749   {
06750   int ch = reqbyte & 255;
06751   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
06752     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
06753   re->flags |= PCRE_REQCHSET;
06754   }
06755 
06756 /* Print out the compiled data if debugging is enabled. This is never the
06757 case when building a production library. */
06758 
06759 #ifdef DEBUG_PCRE
06760 
06761 printf("Length = %d top_bracket = %d top_backref = %d\n",
06762   length, re->top_bracket, re->top_backref);
06763 
06764 printf("Options=%08x\n", re->options);
06765 
06766 if ((re->flags & PCRE_FIRSTSET) != 0)
06767   {
06768   int ch = re->first_byte & 255;
06769   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
06770     "" : " (caseless)";
06771   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
06772     else printf("First char = \\x%02x%s\n", ch, caseless);
06773   }
06774 
06775 if ((re->flags & PCRE_REQCHSET) != 0)
06776   {
06777   int ch = re->req_byte & 255;
06778   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
06779     "" : " (caseless)";
06780   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
06781     else printf("Req char = \\x%02x%s\n", ch, caseless);
06782   }
06783 
06784 pcre_printint(re, stdout, TRUE);
06785 
06786 /* This check is done here in the debugging case so that the code that
06787 was compiled can be seen. */
06788 
06789 if (code - codestart > length)
06790   {
06791   (pcre_free)(re);
06792   *errorptr = find_error_text(ERR23);
06793   *erroroffset = ptr - (uschar *)pattern;
06794   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
06795   return NULL;
06796   }
06797 #endif   /* DEBUG */
06798 
06799 return (pcre *)re;
06800 }
06801 
06802 /* End of pcre_compile.c */
src/pcre/pcre_compile.c