Main Page | Class Hierarchy | Alphabetical List | Data Structures | File List | Data Fields | Globals | Related Pages

utf8.h

Go to the documentation of this file.
00001 /* 00002 ******************************************************************************* 00003 * 00004 * Copyright (C) 1999-2003, International Business Machines 00005 * Corporation and others. All Rights Reserved. 00006 * 00007 ******************************************************************************* 00008 * file name: utf8.h 00009 * encoding: US-ASCII 00010 * tab size: 8 (not used) 00011 * indentation:4 00012 * 00013 * created on: 1999sep13 00014 * created by: Markus W. Scherer 00015 */ 00016 00034 /* utf.h must be included first. */ 00035 #ifndef __UTF_H__ 00036 # include "unicode/utf.h" 00037 #endif 00038 00039 #ifndef __UTF8_H__ 00040 #define __UTF8_H__ 00041 00042 /* internal definitions ----------------------------------------------------- */ 00043 00050 #ifdef U_UTF8_IMPL 00051 U_CAPI const uint8_t 00052 utf8_countTrailBytes[256]; 00053 #else 00054 U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/ 00055 utf8_countTrailBytes[256]; 00056 #endif 00057 00062 #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) 00063 00068 #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) 00069 00074 U_CAPI UChar32 U_EXPORT2 00075 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); 00076 00081 U_CAPI int32_t U_EXPORT2 00082 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); 00083 00088 U_CAPI UChar32 U_EXPORT2 00089 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); 00090 00095 U_CAPI int32_t U_EXPORT2 00096 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); 00097 00098 /* single-code point definitions -------------------------------------------- */ 00099 00106 #define U8_IS_SINGLE(c) (((c)&0x80)==0) 00107 00114 #define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e) 00115 00122 #define U8_IS_TRAIL(c) (((c)&0xc0)==0x80) 00123 00131 #define U8_LENGTH(c) \ 00132 ((uint32_t)(c)<=0x7f ? 1 : \ 00133 ((uint32_t)(c)<=0x7ff ? 2 : \ 00134 ((uint32_t)(c)<=0xd7ff ? 3 : \ 00135 ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ 00136 ((uint32_t)(c)<=0xffff ? 3 : 4)\ 00137 ) \ 00138 ) \ 00139 ) \ 00140 ) 00141 00147 #define U8_MAX_LENGTH 4 00148 00165 #define U8_GET_UNSAFE(s, i, c) { \ 00166 int32_t __I=(int32_t)(i); \ 00167 U8_SET_CP_START_UNSAFE(s, __I); \ 00168 U8_NEXT_UNSAFE(s, __I, c); \ 00169 } 00170 00189 #define U8_GET(s, start, i, length, c) { \ 00190 int32_t __I=(int32_t)(i); \ 00191 U8_SET_CP_START(s, start, __I); \ 00192 U8_NEXT(s, __I, length, c); \ 00193 } 00194 00195 /* definitions with forward iteration --------------------------------------- */ 00196 00214 #define U8_NEXT_UNSAFE(s, i, c) { \ 00215 (c)=(s)[(i)++]; \ 00216 if((uint8_t)((c)-0xc0)<0x35) { \ 00217 uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \ 00218 U8_MASK_LEAD_BYTE(c, __count); \ 00219 switch(__count) { \ 00220 /* each following branch falls through to the next one */ \ 00221 case 3: \ 00222 (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 00223 case 2: \ 00224 (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 00225 case 1: \ 00226 (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 00227 /* no other branches to optimize switch() */ \ 00228 break; \ 00229 } \ 00230 } \ 00231 } 00232 00251 #define U8_NEXT(s, i, length, c) { \ 00252 (c)=(s)[(i)++]; \ 00253 if(((uint8_t)(c))>=0x80) { \ 00254 if(U8_IS_LEAD(c)) { \ 00255 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -1); \ 00256 } else { \ 00257 (c)=U_SENTINEL; \ 00258 } \ 00259 } \ 00260 } 00261 00275 #define U8_APPEND_UNSAFE(s, i, c) { \ 00276 if((uint32_t)(c)<=0x7f) { \ 00277 (s)[(i)++]=(uint8_t)(c); \ 00278 } else { \ 00279 if((uint32_t)(c)<=0x7ff) { \ 00280 (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ 00281 } else { \ 00282 if((uint32_t)(c)<=0xffff) { \ 00283 (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ 00284 } else { \ 00285 (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ 00286 (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ 00287 } \ 00288 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ 00289 } \ 00290 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ 00291 } \ 00292 } 00293 00311 #define U8_APPEND(s, i, length, c, isError) { \ 00312 if((uint32_t)(c)<=0x7f) { \ 00313 (s)[(i)++]=(uint8_t)(c); \ 00314 } else { \ 00315 (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c, &(isError)); \ 00316 } \ 00317 } 00318 00329 #define U8_FWD_1_UNSAFE(s, i) { \ 00330 (i)+=1+U8_COUNT_TRAIL_BYTES((s)[i]); \ 00331 } 00332 00344 #define U8_FWD_1(s, i, length) { \ 00345 uint8_t __b=(s)[(i)++]; \ 00346 if(U8_IS_LEAD(__b)) { \ 00347 uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \ 00348 if((i)+__count>(length)) { \ 00349 __count=(uint8_t)((length)-(i)); \ 00350 } \ 00351 while(__count>0 && U8_IS_TRAIL((s)[i])) { \ 00352 ++(i); \ 00353 --__count; \ 00354 } \ 00355 } \ 00356 } 00357 00370 #define U8_FWD_N_UNSAFE(s, i, n) { \ 00371 int32_t __N=(n); \ 00372 while(__N>0) { \ 00373 U8_FWD_1_UNSAFE(s, i); \ 00374 --__N; \ 00375 } \ 00376 } 00377 00391 #define U8_FWD_N(s, i, length, n) { \ 00392 int32_t __N=(n); \ 00393 while(__N>0 && (i)<(length)) { \ 00394 U8_FWD_1(s, i, length); \ 00395 --__N; \ 00396 } \ 00397 } 00398 00412 #define U8_SET_CP_START_UNSAFE(s, i) { \ 00413 while(U8_IS_TRAIL((s)[i])) { --(i); } \ 00414 } 00415 00430 #define U8_SET_CP_START(s, start, i) { \ 00431 if(U8_IS_TRAIL((s)[(i)])) { \ 00432 (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \ 00433 } \ 00434 } 00435 00436 /* definitions with backward iteration -------------------------------------- */ 00437 00457 #define U8_PREV_UNSAFE(s, i, c) { \ 00458 (c)=(s)[--(i)]; \ 00459 if(U8_IS_TRAIL(c)) { \ 00460 uint8_t __b, __count=1, __shift=6; \ 00461 \ 00462 /* c is a trail byte */ \ 00463 (c)&=0x3f; \ 00464 for(;;) { \ 00465 __b=(s)[--(i)]; \ 00466 if(__b>=0xc0) { \ 00467 U8_MASK_LEAD_BYTE(__b, __count); \ 00468 (c)|=(UChar32)__b<<__shift; \ 00469 break; \ 00470 } else { \ 00471 (c)|=(UChar32)(__b&0x3f)<<__shift; \ 00472 ++__count; \ 00473 __shift+=6; \ 00474 } \ 00475 } \ 00476 } \ 00477 } 00478 00499 #define U8_PREV(s, start, i, c) { \ 00500 (c)=(s)[--(i)]; \ 00501 if((c)>=0x80) { \ 00502 if((c)<=0xbf) { \ 00503 (c)=utf8_prevCharSafeBody(s, start, &(i), c, -1); \ 00504 } else { \ 00505 (c)=U_SENTINEL; \ 00506 } \ 00507 } \ 00508 } 00509 00521 #define U8_BACK_1_UNSAFE(s, i) { \ 00522 while(U8_IS_TRAIL((s)[--(i)])) {} \ 00523 } 00524 00537 #define U8_BACK_1(s, start, i) { \ 00538 if(U8_IS_TRAIL((s)[--(i)])) { \ 00539 (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \ 00540 } \ 00541 } 00542 00556 #define U8_BACK_N_UNSAFE(s, i, n) { \ 00557 int32_t __N=(n); \ 00558 while(__N>0) { \ 00559 U8_BACK_1_UNSAFE(s, i); \ 00560 --__N; \ 00561 } \ 00562 } 00563 00578 #define U8_BACK_N(s, start, i, n) { \ 00579 int32_t __N=(n); \ 00580 while(__N>0 && (i)>(start)) { \ 00581 U8_BACK_1(s, start, i); \ 00582 --__N; \ 00583 } \ 00584 } 00585 00599 #define U8_SET_CP_LIMIT_UNSAFE(s, i) { \ 00600 U8_BACK_1_UNSAFE(s, i); \ 00601 U8_FWD_1_UNSAFE(s, i); \ 00602 } 00603 00619 #define U8_SET_CP_LIMIT(s, start, i, length) { \ 00620 if((start)<(i) && (i)<(length)) { \ 00621 U8_BACK_1(s, start, i); \ 00622 U8_FWD_1(s, i, length); \ 00623 } \ 00624 } 00625 00626 #endif

Generated on Wed Jul 28 09:15:54 2004 for ICU 2.8 by doxygen 1.3.7