Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Changes In Branch invalid_utf8_table Excluding Merge-Ins
This is equivalent to a diff from 69328517f5 to 8a877a7b18
2016-06-17
| ||
07:24 | Remove a function which isn't used anywhere check-in: e2a280fc89 user: jan.nijtmans tags: trunk | |
00:04 | merged from trunk Closed-Leaf check-in: 8a877a7b18 user: sdr tags: invalid_utf8_table | |
2016-06-16
| ||
22:14 | shrunk size of lead byte table for invalid_utf8, and took a shortcut to invalidate lead bytes between 0x80 & 0xBF inclusive check-in: 69328517f5 user: sdr tags: trunk | |
17:01 | more optimizations (all lead bytes between 0x80 & 0xBF are invalid, so use simple check for those, and also can shrink the invalid_utf8 lead byte table even more) check-in: 6eb9a30c08 user: sdr tags: invalid_utf8_table | |
11:39 | Minor further speed-up: Only increment pointer if really needed. check-in: 5be2e9cf7d user: jan.nijtmans tags: trunk | |
Changes to src/lookslike.c.
︙ | ︙ | |||
133 134 135 136 137 138 139 | } return flags; } /* ** Checks for proper UTF-8. It uses the method described in: ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences | | > | | | < < < < < < | | | | | | | | > | < | < | > | | > | > > > > | > > > > > > > > | > > | | > | < < > | > > | | | | > > > > > > | | 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 | } return flags; } /* ** Checks for proper UTF-8. It uses the method described in: ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences ** except for the "overlong form" of \u0000 (Modified UTF-8) ** which is not considered invalid here: Some languages like ** Java and Tcl use it. This function also considers valid ** the derivatives CESU-8 & WTF-8 (as described in the same ** wikipedia article referenced previously). 
*/ /* definitions for various UTF-8 sequence lengths */ #define US2A 2, 0x80, 0x80 /* for lead byte 0xC0 */ #define US2B 2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ #define US3A 3, 0xA0, 0xBF /* for lead byte 0xE0 */ #define US3B 3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ #define US4A 4, 0x90, 0xBF /* for lead byte 0xF0 */ #define US4B 4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ #define US4C 4, 0x80, 0x8F /* for lead byte 0xF4 */ #define US0A 0xFF, 0xFF, 0x00 /* for any other lead byte */ /* a table used for quick lookup of the definition that goes with a * particular lead byte */ static const unsigned char lb_tab[] = { US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A }; int invalid_utf8( const Blob *pContent ){ /* buffer pointer and size */ const unsigned char *z = (unsigned char *)blob_buffer(pContent); unsigned int n = blob_size(pContent); /* while we haven't checked all the bytes in the buffer */ while( n>0 ){ /* ascii is trivial */ if( *z<0x80 ){ ++z; --n; }else if( *z<0xC0 ){ return LOOK_INVALID; }else{ /* get the definition for this lead byte */ const unsigned char* def = &lb_tab[(3 * *z++)-0x240]; unsigned char len; /* get the expected sequence length */ len = *def; /* if there aren't enough bytes left, return invalid */ if( n<len ) { return LOOK_INVALID; } /* we already know byte #0 is good, so check the remaining bytes */ if( (*z<*++def) || (*z++>*++def) ){ /* if the byte is outside the allowed range for this definition, * return invalid */ return LOOK_INVALID; } if( len > 2 ){ /* if the next byte is not between 0x80 and 0xBF, return invalid */ if( (*z++&0xC0)!=0x80 ){ return LOOK_INVALID; } if( len > 3 ){ /* if the next 
byte is not between 0x80 and 0xBF, return invalid */ if( (*z++&0xC0)!=0x80 ){ return LOOK_INVALID; } } } /* advance to the next sequence */ n -= len; } } /* we made it all the way through the buffer so it's not invalid */ return LOOK_NONE; } /* ** Define the type needed to represent a Unicode (UTF-16) character. */ #ifndef WCHAR_T # ifdef _WIN32 |
︙ | ︙ |