00001 /************************************************* 00002 * Perl-Compatible Regular Expressions * 00003 *************************************************/ 00004 00005 /* PCRE is a library of functions to support regular expressions whose syntax 00006 and semantics are as close as possible to those of the Perl 5 language. 00007 00008 Written by Philip Hazel 00009 Copyright (c) 1997-2009 University of Cambridge 00010 00011 ----------------------------------------------------------------------------- 00012 Redistribution and use in source and binary forms, with or without 00013 modification, are permitted provided that the following conditions are met: 00014 00015 * Redistributions of source code must retain the above copyright notice, 00016 this list of conditions and the following disclaimer. 00017 00018 * Redistributions in binary form must reproduce the above copyright 00019 notice, this list of conditions and the following disclaimer in the 00020 documentation and/or other materials provided with the distribution. 00021 00022 * Neither the name of the University of Cambridge nor the names of its 00023 contributors may be used to endorse or promote products derived from 00024 this software without specific prior written permission. 00025 00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00036 POSSIBILITY OF SUCH DAMAGE. 00037 ----------------------------------------------------------------------------- 00038 */ 00039 00040 00041 /* This module contains an internal function for validating UTF-8 character 00042 strings. */ 00043 00044 00045 #ifdef HAVE_CONFIG_H 00046 #include "config.h" 00047 #else if defined(_WINDOWS) 00048 #include <spl/configwin32.h> 00049 #endif 00050 00051 00052 #include "pcre_internal.h" 00053 00054 00055 /************************************************* 00056 * Validate a UTF-8 string * 00057 *************************************************/ 00058 00059 /* This function is called (optionally) at the start of compile or match, to 00060 validate that a supposed UTF-8 string is actually valid. The early check means 00061 that subsequent code can assume it is dealing with a valid string. The check 00062 can be turned off for maximum performance, but the consequences of supplying 00063 an invalid string are then undefined. 00064 00065 Originally, this function checked according to RFC 2279, allowing for values in 00066 the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in 00067 the canonical format. Once somebody had pointed out RFC 3629 to me (it 00068 obsoletes 2279), additional restrictions were applied. The values are now 00069 limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the 00070 subrange 0xd000 to 0xdfff is excluded. 00071 00072 Arguments: 00073 string points to the string 00074 length length of string, or -1 if the string is zero-terminated 00075 00076 Returns: < 0 if the string is a valid UTF-8 string 00077 >= 0 otherwise; the value is the offset of the bad byte 00078 */ 00079 00080 int 00081 _pcre_valid_utf8(USPTR string, int length) 00082 { 00083 #ifdef SUPPORT_UTF8 00084 register USPTR p; 00085 00086 if (length < 0) 00087 { 00088 for (p = string; *p != 0; p++); 00089 length = p - string; 00090 } 00091 00092 for (p = string; length-- > 0; p++) 00093 { 00094 register int ab; 00095 register int c = *p; 00096 if (c < 128) continue; 00097 if (c < 0xc0) return p - string; 00098 ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ 00099 if (length < ab || ab > 3) return p - string; 00100 length -= ab; 00101 00102 /* Check top bits in the second byte */ 00103 if ((*(++p) & 0xc0) != 0x80) return p - string; 00104 00105 /* Check for overlong sequences for each different length, and for the 00106 excluded range 0xd000 to 0xdfff. */ 00107 00108 switch (ab) 00109 { 00110 /* Check for xx00 000x (overlong sequence) */ 00111 00112 case 1: 00113 if ((c & 0x3e) == 0) return p - string; 00114 continue; /* We know there aren't any more bytes to check */ 00115 00116 /* Check for 1110 0000, xx0x xxxx (overlong sequence) or 00117 1110 1101, 1010 xxxx (0xd000 - 0xdfff) */ 00118 00119 case 2: 00120 if ((c == 0xe0 && (*p & 0x20) == 0) || 00121 (c == 0xed && *p >= 0xa0)) 00122 return p - string; 00123 break; 00124 00125 /* Check for 1111 0000, xx00 xxxx (overlong sequence) or 00126 greater than 0x0010ffff (f4 8f bf bf) */ 00127 00128 case 3: 00129 if ((c == 0xf0 && (*p & 0x30) == 0) || 00130 (c > 0xf4 ) || 00131 (c == 0xf4 && *p > 0x8f)) 00132 return p - string; 00133 break; 00134 00135 #if 0 00136 /* These cases can no longer occur, as we restrict to a maximum of four 00137 bytes nowadays. Leave the code here in case we ever want to add an option 00138 for longer sequences. */ 00139 00140 /* Check for 1111 1000, xx00 0xxx */ 00141 case 4: 00142 if (c == 0xf8 && (*p & 0x38) == 0) return p - string; 00143 break; 00144 00145 /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */ 00146 case 5: 00147 if (c == 0xfe || c == 0xff || 00148 (c == 0xfc && (*p & 0x3c) == 0)) return p - string; 00149 break; 00150 #endif 00151 00152 } 00153 00154 /* Check for valid bytes after the 2nd, if any; all must start 10 */ 00155 while (--ab > 0) 00156 { 00157 if ((*(++p) & 0xc0) != 0x80) return p - string; 00158 } 00159 } 00160 #else 00161 (void)(string); /* Keep picky compilers happy */ 00162 (void)(length); 00163 #endif 00164 00165 return -1; 00166 } 00167 00168 /* End of pcre_valid_utf8.c */