• Main Page
  • Related Pages
  • Modules
  • Namespaces
  • Classes
  • Files
  • File List
  • File Members

src/pcre/pcreposix.c

00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language.
00007 
00008                        Written by Philip Hazel
00009            Copyright (c) 1997-2009 University of Cambridge
00010 
00011 -----------------------------------------------------------------------------
00012 Redistribution and use in source and binary forms, with or without
00013 modification, are permitted provided that the following conditions are met:
00014 
00015     * Redistributions of source code must retain the above copyright notice,
00016       this list of conditions and the following disclaimer.
00017 
00018     * Redistributions in binary form must reproduce the above copyright
00019       notice, this list of conditions and the following disclaimer in the
00020       documentation and/or other materials provided with the distribution.
00021 
00022     * Neither the name of the University of Cambridge nor the names of its
00023       contributors may be used to endorse or promote products derived from
00024       this software without specific prior written permission.
00025 
00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00036 POSSIBILITY OF SUCH DAMAGE.
00037 -----------------------------------------------------------------------------
00038 */
00039 
00040 
00041 /* This module is a wrapper that provides a POSIX API to the underlying PCRE
00042 functions. */
00043 
00044 
/* Select the build configuration header: "config.h" when HAVE_CONFIG_H is
defined, otherwise <spl/configwin32.h> when _WINDOWS is defined, otherwise
nothing. This must be "#elif": a "#else if ..." line is treated as a plain
"#else" (the trailing tokens are not evaluated), which would pull in the
Windows header on every build that lacks HAVE_CONFIG_H. */

#ifdef HAVE_CONFIG_H
#include "config.h"
#elif defined(_WINDOWS)
#include <spl/configwin32.h>
#endif
00050 
00051 
00052 
00053 /* Ensure that the PCREPOSIX_EXP_xxx macros are set appropriately for
00054 compiling these functions. This must come before including pcreposix.h, where
00055 they are set for an application (using these functions) if they have not
00056 previously been set. */
00057 
00058 //#if defined(_WIN32) && !defined(PCRE_STATIC)
00059 //#  define PCREPOSIX_EXP_DECL extern __declspec(dllexport)
00060 //#  define PCREPOSIX_EXP_DEFN __declspec(dllexport)
00061 //#endif
00062 
00063 #include "pcre.h"
00064 #include "pcre_internal.h"
00065 #include <spl/text/pcreposix.h>
00066 
00067 
00068 /* Table to translate PCRE compile time error codes into POSIX error codes. */
00069 
00070 static const int eint[] = {
00071   0,           /* no error */
00072   REG_EESCAPE, /* \ at end of pattern */
00073   REG_EESCAPE, /* \c at end of pattern */
00074   REG_EESCAPE, /* unrecognized character follows \ */
00075   REG_BADBR,   /* numbers out of order in {} quantifier */
00076   /* 5 */
00077   REG_BADBR,   /* number too big in {} quantifier */
00078   REG_EBRACK,  /* missing terminating ] for character class */
00079   REG_ECTYPE,  /* invalid escape sequence in character class */
00080   REG_ERANGE,  /* range out of order in character class */
00081   REG_BADRPT,  /* nothing to repeat */
00082   /* 10 */
00083   REG_BADRPT,  /* operand of unlimited repeat could match the empty string */
00084   REG_ASSERT,  /* internal error: unexpected repeat */
00085   REG_BADPAT,  /* unrecognized character after (? */
00086   REG_BADPAT,  /* POSIX named classes are supported only within a class */
00087   REG_EPAREN,  /* missing ) */
00088   /* 15 */
00089   REG_ESUBREG, /* reference to non-existent subpattern */
00090   REG_INVARG,  /* erroffset passed as NULL */
00091   REG_INVARG,  /* unknown option bit(s) set */
00092   REG_EPAREN,  /* missing ) after comment */
00093   REG_ESIZE,   /* parentheses nested too deeply */
00094   /* 20 */
00095   REG_ESIZE,   /* regular expression too large */
00096   REG_ESPACE,  /* failed to get memory */
00097   REG_EPAREN,  /* unmatched parentheses */
00098   REG_ASSERT,  /* internal error: code overflow */
00099   REG_BADPAT,  /* unrecognized character after (?< */
00100   /* 25 */
00101   REG_BADPAT,  /* lookbehind assertion is not fixed length */
00102   REG_BADPAT,  /* malformed number or name after (?( */
00103   REG_BADPAT,  /* conditional group contains more than two branches */
00104   REG_BADPAT,  /* assertion expected after (?( */
00105   REG_BADPAT,  /* (?R or (?[+-]digits must be followed by ) */
00106   /* 30 */
00107   REG_ECTYPE,  /* unknown POSIX class name */
00108   REG_BADPAT,  /* POSIX collating elements are not supported */
00109   REG_INVARG,  /* this version of PCRE is not compiled with PCRE_UTF8 support */
00110   REG_BADPAT,  /* spare error */
00111   REG_BADPAT,  /* character value in \x{...} sequence is too large */
00112   /* 35 */
00113   REG_BADPAT,  /* invalid condition (?(0) */
00114   REG_BADPAT,  /* \C not allowed in lookbehind assertion */
00115   REG_EESCAPE, /* PCRE does not support \L, \l, \N, \U, or \u */
00116   REG_BADPAT,  /* number after (?C is > 255 */
00117   REG_BADPAT,  /* closing ) for (?C expected */
00118   /* 40 */
00119   REG_BADPAT,  /* recursive call could loop indefinitely */
00120   REG_BADPAT,  /* unrecognized character after (?P */
00121   REG_BADPAT,  /* syntax error in subpattern name (missing terminator) */
00122   REG_BADPAT,  /* two named subpatterns have the same name */
00123   REG_BADPAT,  /* invalid UTF-8 string */
00124   /* 45 */
00125   REG_BADPAT,  /* support for \P, \p, and \X has not been compiled */
00126   REG_BADPAT,  /* malformed \P or \p sequence */
00127   REG_BADPAT,  /* unknown property name after \P or \p */
00128   REG_BADPAT,  /* subpattern name is too long (maximum 32 characters) */
00129   REG_BADPAT,  /* too many named subpatterns (maximum 10,000) */
00130   /* 50 */
00131   REG_BADPAT,  /* repeated subpattern is too long */
00132   REG_BADPAT,  /* octal value is greater than \377 (not in UTF-8 mode) */
00133   REG_BADPAT,  /* internal error: overran compiling workspace */
00134   REG_BADPAT,  /* internal error: previously-checked referenced subpattern not found */
00135   REG_BADPAT,  /* DEFINE group contains more than one branch */
00136   /* 55 */
00137   REG_BADPAT,  /* repeating a DEFINE group is not allowed */
00138   REG_INVARG,  /* inconsistent NEWLINE options */
00139   REG_BADPAT,  /* \g is not followed followed by an (optionally braced) non-zero number */
00140   REG_BADPAT,  /* a numbered reference must not be zero */
00141   REG_BADPAT,  /* (*VERB) with an argument is not supported */
00142   /* 60 */
00143   REG_BADPAT,  /* (*VERB) not recognized */
00144   REG_BADPAT,  /* number is too big */
00145   REG_BADPAT,  /* subpattern name expected */
00146   REG_BADPAT,  /* digit expected after (?+ */
00147   REG_BADPAT,  /* ] is an invalid data character in JavaScript compatibility mode */
00148   /* 65 */
00149   REG_BADPAT   /* different names for subpatterns of the same number are not allowed */
00150 };
00151 
00152 /* Table of texts corresponding to POSIX error codes */
00153 
/* Message text for each POSIX error code. regerror() uses the error code
directly as the index, so the order of the rows is significant; the label in
front of each string names the code that selects it. */

static const char *const pstring[] = {
  /* 0 (unused)   */  "",
  /* REG_ASSERT   */  "internal error",
  /* REG_BADBR    */  "invalid repeat counts in {}",
  /* REG_BADPAT   */  "pattern error",
  /* REG_BADRPT   */  "? * + invalid",
  /* REG_EBRACE   */  "unbalanced {}",
  /* REG_EBRACK   */  "unbalanced []",
  /* REG_ECOLLATE */  "collation error - not relevant",
  /* REG_ECTYPE   */  "bad class",
  /* REG_EESCAPE  */  "bad escape sequence",
  /* REG_EMPTY    */  "empty expression",
  /* REG_EPAREN   */  "unbalanced ()",
  /* REG_ERANGE   */  "bad range inside []",
  /* REG_ESIZE    */  "expression too big",
  /* REG_ESPACE   */  "failed to get memory",
  /* REG_ESUBREG  */  "bad back reference",
  /* REG_INVARG   */  "bad argument",
  /* REG_NOMATCH  */  "match failed"
};
00174 
00175 
00176 
00177 
00178 /*************************************************
00179 *          Translate error code to string        *
00180 *************************************************/
00181 
00182 PCREPOSIX_EXP_DEFN size_t PCRE_CALL_CONVENTION
00183 regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
00184 {
00185 const char *message, *addmessage;
00186 size_t length, addlength;
00187 
00188 message = (errcode >= (int)(sizeof(pstring)/sizeof(char *)))?
00189   "unknown error code" : pstring[errcode];
00190 length = strlen(message) + 1;
00191 
00192 addmessage = " at offset ";
00193 addlength = (preg != NULL && (int)preg->re_erroffset != -1)?
00194   strlen(addmessage) + 6 : 0;
00195 
00196 if (errbuf_size > 0)
00197   {
00198   if (addlength > 0 && errbuf_size >= length + addlength)
00199     sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset);
00200   else
00201     {
00202     strncpy(errbuf, message, errbuf_size - 1);
00203     errbuf[errbuf_size-1] = 0;
00204     }
00205   }
00206 
00207 return length + addlength;
00208 }
00209 
00210 
00211 
00212 
00213 /*************************************************
00214 *           Free store held by a regex           *
00215 *************************************************/
00216 
00217 PCREPOSIX_EXP_DEFN void PCRE_CALL_CONVENTION
00218 regfree(regex_t *preg)
00219 {
00220 (pcre_free)(preg->re_pcre);
00221 }
00222 
00223 
00224 
00225 
00226 /*************************************************
00227 *            Compile a regular expression        *
00228 *************************************************/
00229 
00230 /*
00231 Arguments:
00232   preg        points to a structure for recording the compiled expression
00233   pattern     the pattern to compile
00234   cflags      compilation flags
00235 
00236 Returns:      0 on success
00237               various non-zero codes on failure
00238 */
00239 
00240 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION
00241 regcomp(regex_t *preg, const char *pattern, int cflags)
00242 {
00243 const char *errorptr;
00244 int erroffset;
00245 int errorcode;
00246 int options = 0;
00247 
00248 if ((cflags & REG_ICASE) != 0)    options |= PCRE_CASELESS;
00249 if ((cflags & REG_NEWLINE) != 0)  options |= PCRE_MULTILINE;
00250 if ((cflags & REG_DOTALL) != 0)   options |= PCRE_DOTALL;
00251 if ((cflags & REG_NOSUB) != 0)    options |= PCRE_NO_AUTO_CAPTURE;
00252 if ((cflags & REG_UTF8) != 0)     options |= PCRE_UTF8;
00253 if ((cflags & REG_UNGREEDY) != 0) options |= PCRE_UNGREEDY;
00254 
00255 preg->re_pcre = pcre_compile2(pattern, options, &errorcode, &errorptr,
00256   &erroffset, NULL);
00257 preg->re_erroffset = erroffset;
00258 
00259 /* Safety: if the error code is too big for the translation vector (which
00260 should not happen, but we all make mistakes), return REG_BADPAT. */
00261 
00262 if (preg->re_pcre == NULL)
00263   {
00264   return (errorcode < sizeof(eint)/sizeof(const int))?
00265     eint[errorcode] : REG_BADPAT;
00266   }
00267 
00268 preg->re_nsub = pcre_info((const pcre *)preg->re_pcre, NULL, NULL);
00269 return 0;
00270 }
00271 
00272 
00273 
00274 
00275 /*************************************************
00276 *              Match a regular expression        *
00277 *************************************************/
00278 
00279 /* Unfortunately, PCRE requires 3 ints of working space for each captured
00280 substring, so we have to get and release working store instead of just using
00281 the POSIX structures as was done in earlier releases when PCRE needed only 2
00282 ints. However, if the number of possible capturing brackets is small, use a
00283 block of store on the stack, to reduce the use of malloc/free. The threshold is
00284 in a macro that can be changed at configure time.
00285 
00286 If REG_NOSUB was specified at compile time, the PCRE_NO_AUTO_CAPTURE flag will
00287 be set. When this is the case, the nmatch and pmatch arguments are ignored, and
00288 the only result is yes/no/error. */
00289 
00290 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION
00291 regexec(const regex_t *preg, const char *string, size_t nmatch,
00292   regmatch_t pmatch[], int eflags)
00293 {
00294 int rc, so, eo;
00295 int options = 0;
00296 int *ovector = NULL;
00297 int small_ovector[POSIX_MALLOC_THRESHOLD * 3];
00298 BOOL allocated_ovector = FALSE;
00299 BOOL nosub =
00300   (((const pcre *)preg->re_pcre)->options & PCRE_NO_AUTO_CAPTURE) != 0;
00301 
00302 if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL;
00303 if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL;
00304 if ((eflags & REG_NOTEMPTY) != 0) options |= PCRE_NOTEMPTY;
00305 
00306 ((regex_t *)preg)->re_erroffset = (size_t)(-1);  /* Only has meaning after compile */
00307 
00308 /* When no string data is being returned, or no vector has been passed in which
00309 to put it, ensure that nmatch is zero. Otherwise, ensure the vector for holding
00310 the return data is large enough. */
00311 
00312 if (nosub || pmatch == NULL) nmatch = 0;
00313 
00314 else if (nmatch > 0)
00315   {
00316   if (nmatch <= POSIX_MALLOC_THRESHOLD)
00317     {
00318     ovector = &(small_ovector[0]);
00319     }
00320   else
00321     {
00322     if (nmatch > INT_MAX/(sizeof(int) * 3)) return REG_ESPACE;
00323     ovector = (int *)malloc(sizeof(int) * nmatch * 3);
00324     if (ovector == NULL) return REG_ESPACE;
00325     allocated_ovector = TRUE;
00326     }
00327   }
00328 
00329 /* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings.
00330 The man page from OS X says "REG_STARTEND affects only the location of the
00331 string, not how it is matched". That is why the "so" value is used to bump the
00332 start location rather than being passed as a PCRE "starting offset". */
00333 
00334 if ((eflags & REG_STARTEND) != 0)
00335   {
00336   so = pmatch[0].rm_so;
00337   eo = pmatch[0].rm_eo;
00338   }
00339 else
00340   {
00341   so = 0;
00342   eo = strlen(string);
00343   }
00344 
00345 rc = pcre_exec((const pcre *)preg->re_pcre, NULL, string + so, (eo - so),
00346   0, options, ovector, nmatch * 3);
00347 
00348 if (rc == 0) rc = nmatch;    /* All captured slots were filled in */
00349 
00350 if (rc >= 0)
00351   {
00352   size_t i;
00353   if (!nosub)
00354     {
00355     for (i = 0; i < (size_t)rc; i++)
00356       {
00357       pmatch[i].rm_so = ovector[i*2];
00358       pmatch[i].rm_eo = ovector[i*2+1];
00359       }
00360     if (allocated_ovector) free(ovector);
00361     for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1;
00362     }
00363   return 0;
00364   }
00365 
00366 else
00367   {
00368   if (allocated_ovector) free(ovector);
00369   switch(rc)
00370     {
00371     case PCRE_ERROR_NOMATCH: return REG_NOMATCH;
00372     case PCRE_ERROR_NULL: return REG_INVARG;
00373     case PCRE_ERROR_BADOPTION: return REG_INVARG;
00374     case PCRE_ERROR_BADMAGIC: return REG_INVARG;
00375     case PCRE_ERROR_UNKNOWN_NODE: return REG_ASSERT;
00376     case PCRE_ERROR_NOMEMORY: return REG_ESPACE;
00377     case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE;
00378     case PCRE_ERROR_BADUTF8: return REG_INVARG;
00379     case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG;
00380     default: return REG_ASSERT;
00381     }
00382   }
00383 }
00384 
00385 /* End of pcreposix.c */