• Main Page
  • Related Pages
  • Modules
  • Namespaces
  • Classes
  • Files
  • File List
  • File Members

src/pcre/pcre_valid_utf8.c

00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language.
00007 
00008                        Written by Philip Hazel
00009            Copyright (c) 1997-2009 University of Cambridge
00010 
00011 -----------------------------------------------------------------------------
00012 Redistribution and use in source and binary forms, with or without
00013 modification, are permitted provided that the following conditions are met:
00014 
00015     * Redistributions of source code must retain the above copyright notice,
00016       this list of conditions and the following disclaimer.
00017 
00018     * Redistributions in binary form must reproduce the above copyright
00019       notice, this list of conditions and the following disclaimer in the
00020       documentation and/or other materials provided with the distribution.
00021 
00022     * Neither the name of the University of Cambridge nor the names of its
00023       contributors may be used to endorse or promote products derived from
00024       this software without specific prior written permission.
00025 
00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00036 POSSIBILITY OF SUCH DAMAGE.
00037 -----------------------------------------------------------------------------
00038 */
00039 
00040 
00041 /* This module contains an internal function for validating UTF-8 character
00042 strings. */
00043 
00044 
/* Pull in build configuration: the autoconf-generated header when the build
system defines HAVE_CONFIG_H, otherwise the Windows configuration header when
_WINDOWS is defined. This has to be "#elif": "#else if ..." is just a plain
#else followed by ignored tokens, which would include the Windows header on
every platform that lacks HAVE_CONFIG_H. */

#ifdef HAVE_CONFIG_H
#include "config.h"
#elif defined(_WINDOWS)
#include <spl/configwin32.h>
#endif
00050 
00051 
00052 #include "pcre_internal.h"
00053 
00054 
00055 /*************************************************
00056 *         Validate a UTF-8 string                *
00057 *************************************************/
00058 
00059 /* This function is called (optionally) at the start of compile or match, to
00060 validate that a supposed UTF-8 string is actually valid. The early check means
00061 that subsequent code can assume it is dealing with a valid string. The check
00062 can be turned off for maximum performance, but the consequences of supplying
00063 an invalid string are then undefined.
00064 
00065 Originally, this function checked according to RFC 2279, allowing for values in
00066 the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
00067 the canonical format. Once somebody had pointed out RFC 3629 to me (it
00068 obsoletes 2279), additional restrictions were applied. The values are now
00069 limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
00070 subrange 0xd000 to 0xdfff is excluded.
00071 
00072 Arguments:
00073   string       points to the string
00074   length       length of string, or -1 if the string is zero-terminated
00075 
00076 Returns:       < 0    if the string is a valid UTF-8 string
00077                >= 0   otherwise; the value is the offset of the bad byte
00078 */
00079 
00080 int
00081 _pcre_valid_utf8(USPTR string, int length)
00082 {
00083 #ifdef SUPPORT_UTF8
00084 register USPTR p;
00085 
00086 if (length < 0)
00087   {
00088   for (p = string; *p != 0; p++);
00089   length = p - string;
00090   }
00091 
00092 for (p = string; length-- > 0; p++)
00093   {
00094   register int ab;
00095   register int c = *p;
00096   if (c < 128) continue;
00097   if (c < 0xc0) return p - string;
00098   ab = _pcre_utf8_table4[c & 0x3f];     /* Number of additional bytes */
00099   if (length < ab || ab > 3) return p - string;
00100   length -= ab;
00101 
00102   /* Check top bits in the second byte */
00103   if ((*(++p) & 0xc0) != 0x80) return p - string;
00104 
00105   /* Check for overlong sequences for each different length, and for the
00106   excluded range 0xd000 to 0xdfff.  */
00107 
00108   switch (ab)
00109     {
00110     /* Check for xx00 000x (overlong sequence) */
00111 
00112     case 1:
00113     if ((c & 0x3e) == 0) return p - string;
00114     continue;   /* We know there aren't any more bytes to check */
00115 
00116     /* Check for 1110 0000, xx0x xxxx (overlong sequence) or
00117                  1110 1101, 1010 xxxx (0xd000 - 0xdfff) */
00118 
00119     case 2:
00120     if ((c == 0xe0 && (*p & 0x20) == 0) ||
00121         (c == 0xed && *p >= 0xa0))
00122       return p - string;
00123     break;
00124 
00125     /* Check for 1111 0000, xx00 xxxx (overlong sequence) or
00126        greater than 0x0010ffff (f4 8f bf bf) */
00127 
00128     case 3:
00129     if ((c == 0xf0 && (*p & 0x30) == 0) ||
00130         (c > 0xf4 ) ||
00131         (c == 0xf4 && *p > 0x8f))
00132       return p - string;
00133     break;
00134 
00135 #if 0
00136     /* These cases can no longer occur, as we restrict to a maximum of four
00137     bytes nowadays. Leave the code here in case we ever want to add an option
00138     for longer sequences. */
00139 
00140     /* Check for 1111 1000, xx00 0xxx */
00141     case 4:
00142     if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
00143     break;
00144 
00145     /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
00146     case 5:
00147     if (c == 0xfe || c == 0xff ||
00148        (c == 0xfc && (*p & 0x3c) == 0)) return p - string;
00149     break;
00150 #endif
00151 
00152     }
00153 
00154   /* Check for valid bytes after the 2nd, if any; all must start 10 */
00155   while (--ab > 0)
00156     {
00157     if ((*(++p) & 0xc0) != 0x80) return p - string;
00158     }
00159   }
00160 #else
00161 (void)(string);  /* Keep picky compilers happy */
00162 (void)(length);
00163 #endif
00164 
00165 return -1;
00166 }
00167 
00168 /* End of pcre_valid_utf8.c */