00001 /************************************************* 00002 * Perl-Compatible Regular Expressions * 00003 *************************************************/ 00004 00005 /* PCRE is a library of functions to support regular expressions whose syntax 00006 and semantics are as close as possible to those of the Perl 5 language. 00007 00008 Written by Philip Hazel 00009 Copyright (c) 1997-2009 University of Cambridge 00010 00011 ----------------------------------------------------------------------------- 00012 Redistribution and use in source and binary forms, with or without 00013 modification, are permitted provided that the following conditions are met: 00014 00015 * Redistributions of source code must retain the above copyright notice, 00016 this list of conditions and the following disclaimer. 00017 00018 * Redistributions in binary form must reproduce the above copyright 00019 notice, this list of conditions and the following disclaimer in the 00020 documentation and/or other materials provided with the distribution. 00021 00022 * Neither the name of the University of Cambridge nor the names of its 00023 contributors may be used to endorse or promote products derived from 00024 this software without specific prior written permission. 00025 00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00036 POSSIBILITY OF SUCH DAMAGE. 00037 ----------------------------------------------------------------------------- 00038 */ 00039 00040 00041 /* This module is a wrapper that provides a POSIX API to the underlying PCRE 00042 functions. */ 00043 00044 00045 #ifdef HAVE_CONFIG_H 00046 #include "config.h" 00047 #else if defined(_WINDOWS) 00048 #include <spl/configwin32.h> 00049 #endif 00050 00051 00052 00053 /* Ensure that the PCREPOSIX_EXP_xxx macros are set appropriately for 00054 compiling these functions. This must come before including pcreposix.h, where 00055 they are set for an application (using these functions) if they have not 00056 previously been set. */ 00057 00058 //#if defined(_WIN32) && !defined(PCRE_STATIC) 00059 //# define PCREPOSIX_EXP_DECL extern __declspec(dllexport) 00060 //# define PCREPOSIX_EXP_DEFN __declspec(dllexport) 00061 //#endif 00062 00063 #include "pcre.h" 00064 #include "pcre_internal.h" 00065 #include <spl/text/pcreposix.h> 00066 00067 00068 /* Table to translate PCRE compile time error codes into POSIX error codes. */ 00069 00070 static const int eint[] = { 00071 0, /* no error */ 00072 REG_EESCAPE, /* \ at end of pattern */ 00073 REG_EESCAPE, /* \c at end of pattern */ 00074 REG_EESCAPE, /* unrecognized character follows \ */ 00075 REG_BADBR, /* numbers out of order in {} quantifier */ 00076 /* 5 */ 00077 REG_BADBR, /* number too big in {} quantifier */ 00078 REG_EBRACK, /* missing terminating ] for character class */ 00079 REG_ECTYPE, /* invalid escape sequence in character class */ 00080 REG_ERANGE, /* range out of order in character class */ 00081 REG_BADRPT, /* nothing to repeat */ 00082 /* 10 */ 00083 REG_BADRPT, /* operand of unlimited repeat could match the empty string */ 00084 REG_ASSERT, /* internal error: unexpected repeat */ 00085 REG_BADPAT, /* unrecognized character after (? */ 00086 REG_BADPAT, /* POSIX named classes are supported only within a class */ 00087 REG_EPAREN, /* missing ) */ 00088 /* 15 */ 00089 REG_ESUBREG, /* reference to non-existent subpattern */ 00090 REG_INVARG, /* erroffset passed as NULL */ 00091 REG_INVARG, /* unknown option bit(s) set */ 00092 REG_EPAREN, /* missing ) after comment */ 00093 REG_ESIZE, /* parentheses nested too deeply */ 00094 /* 20 */ 00095 REG_ESIZE, /* regular expression too large */ 00096 REG_ESPACE, /* failed to get memory */ 00097 REG_EPAREN, /* unmatched parentheses */ 00098 REG_ASSERT, /* internal error: code overflow */ 00099 REG_BADPAT, /* unrecognized character after (?< */ 00100 /* 25 */ 00101 REG_BADPAT, /* lookbehind assertion is not fixed length */ 00102 REG_BADPAT, /* malformed number or name after (?( */ 00103 REG_BADPAT, /* conditional group contains more than two branches */ 00104 REG_BADPAT, /* assertion expected after (?( */ 00105 REG_BADPAT, /* (?R or (?[+-]digits must be followed by ) */ 00106 /* 30 */ 00107 REG_ECTYPE, /* unknown POSIX class name */ 00108 REG_BADPAT, /* POSIX collating elements are not supported */ 00109 REG_INVARG, /* this version of PCRE is not compiled with PCRE_UTF8 support */ 00110 REG_BADPAT, /* spare error */ 00111 REG_BADPAT, /* character value in \x{...} sequence is too large */ 00112 /* 35 */ 00113 REG_BADPAT, /* invalid condition (?(0) */ 00114 REG_BADPAT, /* \C not allowed in lookbehind assertion */ 00115 REG_EESCAPE, /* PCRE does not support \L, \l, \N, \U, or \u */ 00116 REG_BADPAT, /* number after (?C is > 255 */ 00117 REG_BADPAT, /* closing ) for (?C expected */ 00118 /* 40 */ 00119 REG_BADPAT, /* recursive call could loop indefinitely */ 00120 REG_BADPAT, /* unrecognized character after (?P */ 00121 REG_BADPAT, /* syntax error in subpattern name (missing terminator) */ 00122 REG_BADPAT, /* two named subpatterns have the same name */ 00123 REG_BADPAT, /* invalid UTF-8 string */ 00124 /* 45 */ 00125 REG_BADPAT, /* support for \P, \p, and \X has not been compiled */ 00126 REG_BADPAT, /* malformed \P or \p sequence */ 00127 REG_BADPAT, /* unknown property name after \P or \p */ 00128 REG_BADPAT, /* subpattern name is too long (maximum 32 characters) */ 00129 REG_BADPAT, /* too many named subpatterns (maximum 10,000) */ 00130 /* 50 */ 00131 REG_BADPAT, /* repeated subpattern is too long */ 00132 REG_BADPAT, /* octal value is greater than \377 (not in UTF-8 mode) */ 00133 REG_BADPAT, /* internal error: overran compiling workspace */ 00134 REG_BADPAT, /* internal error: previously-checked referenced subpattern not found */ 00135 REG_BADPAT, /* DEFINE group contains more than one branch */ 00136 /* 55 */ 00137 REG_BADPAT, /* repeating a DEFINE group is not allowed */ 00138 REG_INVARG, /* inconsistent NEWLINE options */ 00139 REG_BADPAT, /* \g is not followed followed by an (optionally braced) non-zero number */ 00140 REG_BADPAT, /* a numbered reference must not be zero */ 00141 REG_BADPAT, /* (*VERB) with an argument is not supported */ 00142 /* 60 */ 00143 REG_BADPAT, /* (*VERB) not recognized */ 00144 REG_BADPAT, /* number is too big */ 00145 REG_BADPAT, /* subpattern name expected */ 00146 REG_BADPAT, /* digit expected after (?+ */ 00147 REG_BADPAT, /* ] is an invalid data character in JavaScript compatibility mode */ 00148 /* 65 */ 00149 REG_BADPAT /* different names for subpatterns of the same number are not allowed */ 00150 }; 00151 00152 /* Table of texts corresponding to POSIX error codes */ 00153 00154 static const char *const pstring[] = { 00155 "", /* Dummy for value 0 */ 00156 "internal error", /* REG_ASSERT */ 00157 "invalid repeat counts in {}", /* BADBR */ 00158 "pattern error", /* BADPAT */ 00159 "? * + invalid", /* BADRPT */ 00160 "unbalanced {}", /* EBRACE */ 00161 "unbalanced []", /* EBRACK */ 00162 "collation error - not relevant", /* ECOLLATE */ 00163 "bad class", /* ECTYPE */ 00164 "bad escape sequence", /* EESCAPE */ 00165 "empty expression", /* EMPTY */ 00166 "unbalanced ()", /* EPAREN */ 00167 "bad range inside []", /* ERANGE */ 00168 "expression too big", /* ESIZE */ 00169 "failed to get memory", /* ESPACE */ 00170 "bad back reference", /* ESUBREG */ 00171 "bad argument", /* INVARG */ 00172 "match failed" /* NOMATCH */ 00173 }; 00174 00175 00176 00177 00178 /************************************************* 00179 * Translate error code to string * 00180 *************************************************/ 00181 00182 PCREPOSIX_EXP_DEFN size_t PCRE_CALL_CONVENTION 00183 regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) 00184 { 00185 const char *message, *addmessage; 00186 size_t length, addlength; 00187 00188 message = (errcode >= (int)(sizeof(pstring)/sizeof(char *)))? 00189 "unknown error code" : pstring[errcode]; 00190 length = strlen(message) + 1; 00191 00192 addmessage = " at offset "; 00193 addlength = (preg != NULL && (int)preg->re_erroffset != -1)? 00194 strlen(addmessage) + 6 : 0; 00195 00196 if (errbuf_size > 0) 00197 { 00198 if (addlength > 0 && errbuf_size >= length + addlength) 00199 sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset); 00200 else 00201 { 00202 strncpy(errbuf, message, errbuf_size - 1); 00203 errbuf[errbuf_size-1] = 0; 00204 } 00205 } 00206 00207 return length + addlength; 00208 } 00209 00210 00211 00212 00213 /************************************************* 00214 * Free store held by a regex * 00215 *************************************************/ 00216 00217 PCREPOSIX_EXP_DEFN void PCRE_CALL_CONVENTION 00218 regfree(regex_t *preg) 00219 { 00220 (pcre_free)(preg->re_pcre); 00221 } 00222 00223 00224 00225 00226 /************************************************* 00227 * Compile a regular expression * 00228 *************************************************/ 00229 00230 /* 00231 Arguments: 00232 preg points to a structure for recording the compiled expression 00233 pattern the pattern to compile 00234 cflags compilation flags 00235 00236 Returns: 0 on success 00237 various non-zero codes on failure 00238 */ 00239 00240 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION 00241 regcomp(regex_t *preg, const char *pattern, int cflags) 00242 { 00243 const char *errorptr; 00244 int erroffset; 00245 int errorcode; 00246 int options = 0; 00247 00248 if ((cflags & REG_ICASE) != 0) options |= PCRE_CASELESS; 00249 if ((cflags & REG_NEWLINE) != 0) options |= PCRE_MULTILINE; 00250 if ((cflags & REG_DOTALL) != 0) options |= PCRE_DOTALL; 00251 if ((cflags & REG_NOSUB) != 0) options |= PCRE_NO_AUTO_CAPTURE; 00252 if ((cflags & REG_UTF8) != 0) options |= PCRE_UTF8; 00253 if ((cflags & REG_UNGREEDY) != 0) options |= PCRE_UNGREEDY; 00254 00255 preg->re_pcre = pcre_compile2(pattern, options, &errorcode, &errorptr, 00256 &erroffset, NULL); 00257 preg->re_erroffset = erroffset; 00258 00259 /* Safety: if the error code is too big for the translation vector (which 00260 should not happen, but we all make mistakes), return REG_BADPAT. */ 00261 00262 if (preg->re_pcre == NULL) 00263 { 00264 return (errorcode < sizeof(eint)/sizeof(const int))? 00265 eint[errorcode] : REG_BADPAT; 00266 } 00267 00268 preg->re_nsub = pcre_info((const pcre *)preg->re_pcre, NULL, NULL); 00269 return 0; 00270 } 00271 00272 00273 00274 00275 /************************************************* 00276 * Match a regular expression * 00277 *************************************************/ 00278 00279 /* Unfortunately, PCRE requires 3 ints of working space for each captured 00280 substring, so we have to get and release working store instead of just using 00281 the POSIX structures as was done in earlier releases when PCRE needed only 2 00282 ints. However, if the number of possible capturing brackets is small, use a 00283 block of store on the stack, to reduce the use of malloc/free. The threshold is 00284 in a macro that can be changed at configure time. 00285 00286 If REG_NOSUB was specified at compile time, the PCRE_NO_AUTO_CAPTURE flag will 00287 be set. When this is the case, the nmatch and pmatch arguments are ignored, and 00288 the only result is yes/no/error. */ 00289 00290 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION 00291 regexec(const regex_t *preg, const char *string, size_t nmatch, 00292 regmatch_t pmatch[], int eflags) 00293 { 00294 int rc, so, eo; 00295 int options = 0; 00296 int *ovector = NULL; 00297 int small_ovector[POSIX_MALLOC_THRESHOLD * 3]; 00298 BOOL allocated_ovector = FALSE; 00299 BOOL nosub = 00300 (((const pcre *)preg->re_pcre)->options & PCRE_NO_AUTO_CAPTURE) != 0; 00301 00302 if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL; 00303 if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL; 00304 if ((eflags & REG_NOTEMPTY) != 0) options |= PCRE_NOTEMPTY; 00305 00306 ((regex_t *)preg)->re_erroffset = (size_t)(-1); /* Only has meaning after compile */ 00307 00308 /* When no string data is being returned, or no vector has been passed in which 00309 to put it, ensure that nmatch is zero. Otherwise, ensure the vector for holding 00310 the return data is large enough. */ 00311 00312 if (nosub || pmatch == NULL) nmatch = 0; 00313 00314 else if (nmatch > 0) 00315 { 00316 if (nmatch <= POSIX_MALLOC_THRESHOLD) 00317 { 00318 ovector = &(small_ovector[0]); 00319 } 00320 else 00321 { 00322 if (nmatch > INT_MAX/(sizeof(int) * 3)) return REG_ESPACE; 00323 ovector = (int *)malloc(sizeof(int) * nmatch * 3); 00324 if (ovector == NULL) return REG_ESPACE; 00325 allocated_ovector = TRUE; 00326 } 00327 } 00328 00329 /* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings. 00330 The man page from OS X says "REG_STARTEND affects only the location of the 00331 string, not how it is matched". That is why the "so" value is used to bump the 00332 start location rather than being passed as a PCRE "starting offset". */ 00333 00334 if ((eflags & REG_STARTEND) != 0) 00335 { 00336 so = pmatch[0].rm_so; 00337 eo = pmatch[0].rm_eo; 00338 } 00339 else 00340 { 00341 so = 0; 00342 eo = strlen(string); 00343 } 00344 00345 rc = pcre_exec((const pcre *)preg->re_pcre, NULL, string + so, (eo - so), 00346 0, options, ovector, nmatch * 3); 00347 00348 if (rc == 0) rc = nmatch; /* All captured slots were filled in */ 00349 00350 if (rc >= 0) 00351 { 00352 size_t i; 00353 if (!nosub) 00354 { 00355 for (i = 0; i < (size_t)rc; i++) 00356 { 00357 pmatch[i].rm_so = ovector[i*2]; 00358 pmatch[i].rm_eo = ovector[i*2+1]; 00359 } 00360 if (allocated_ovector) free(ovector); 00361 for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1; 00362 } 00363 return 0; 00364 } 00365 00366 else 00367 { 00368 if (allocated_ovector) free(ovector); 00369 switch(rc) 00370 { 00371 case PCRE_ERROR_NOMATCH: return REG_NOMATCH; 00372 case PCRE_ERROR_NULL: return REG_INVARG; 00373 case PCRE_ERROR_BADOPTION: return REG_INVARG; 00374 case PCRE_ERROR_BADMAGIC: return REG_INVARG; 00375 case PCRE_ERROR_UNKNOWN_NODE: return REG_ASSERT; 00376 case PCRE_ERROR_NOMEMORY: return REG_ESPACE; 00377 case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE; 00378 case PCRE_ERROR_BADUTF8: return REG_INVARG; 00379 case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG; 00380 default: return REG_ASSERT; 00381 } 00382 } 00383 } 00384 00385 /* End of pcreposix.c */