Main Page | Class Hierarchy | Alphabetical List | Data Structures | File List | Data Fields | Globals | Related Pages

unicode.h

00001 /* 00002 ****************************************************************************** 00003 * Copyright (C) 1996-2001, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ****************************************************************************** 00006 */ 00007 // FILE NAME : unicode.h 00008 // 00009 // CREATED 00010 // Wednesday, December 11, 1996 00011 // 00012 // CREATED BY 00013 // Helena Shih 00014 // 00015 // CHANGES 00016 // Thursday, April 15, 1999 00017 // Modified the definitions of all the functions 00018 // C++ Wrappers for Unicode 00019 // CHANGES BY 00020 // Madhu Katragadda 00021 // 5/20/99 Madhu Added the function getVersion() 00022 // 11/22/99 aliu Added MIN_RADIX, MAX_RADIX, digit, forDigit 00023 //***************************************************************************** 00024 00025 00026 00027 #ifndef UNICODE_H 00028 #define UNICODE_H 00029 00030 #include "unicode/utypes.h" 00031 #include "unicode/uchar.h" 00032 00033 U_NAMESPACE_BEGIN 00055 class U_COMMON_API Unicode 00056 { 00057 public: 00058 /* 00059 * In C++, static const members actually take up memory and need to be accessed. 00060 * enum values are more like C #define's. 00061 * The following is a collection of constants, not an enumeration type. 00062 * 00063 * @deprecated See the Unicode class description. 00064 */ 00065 enum { 00067 MIN_VALUE=0, 00068 00074 MAX_VALUE=0x10ffff, 00075 00083 MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH, 00084 00095 MIN_RADIX=2, 00096 00107 MAX_RADIX=36 00108 }; 00109 00116 enum EUnicodeGeneralTypes 00117 { 00118 UNASSIGNED = 0, 00119 UPPERCASE_LETTER = 1, 00120 LOWERCASE_LETTER = 2, 00121 TITLECASE_LETTER = 3, 00122 MODIFIER_LETTER = 4, 00123 OTHER_LETTER = 5, 00124 NON_SPACING_MARK = 6, 00125 ENCLOSING_MARK = 7, 00126 COMBINING_SPACING_MARK = 8, 00127 DECIMAL_DIGIT_NUMBER = 9, 00128 LETTER_NUMBER = 10, 00129 OTHER_NUMBER = 11, 00130 SPACE_SEPARATOR = 12, 00131 LINE_SEPARATOR = 13, 00132 PARAGRAPH_SEPARATOR = 14, 00133 CONTROL = 15, 00134 FORMAT = 16, 00135 PRIVATE_USE = 17, 00136 SURROGATE = 18, 00137 DASH_PUNCTUATION = 19, 00138 START_PUNCTUATION = 20, 00139 END_PUNCTUATION = 21, 00140 CONNECTOR_PUNCTUATION = 22, 00141 OTHER_PUNCTUATION = 23, 00142 MATH_SYMBOL = 24, 00143 CURRENCY_SYMBOL = 25, 00144 MODIFIER_SYMBOL = 26, 00145 OTHER_SYMBOL = 27, 00146 INITIAL_PUNCTUATION = 28, 00147 FINAL_PUNCTUATION = 29, 00148 GENERAL_TYPES_COUNT = 30 00149 }; 00150 00151 /* Please keep these values in sync with UCharScript */ 00157 enum EUnicodeScript 00158 { 00159 kBasicLatin=UBLOCK_BASIC_LATIN, 00160 kLatin1Supplement, 00161 kLatinExtendedA, 00162 kLatinExtendedB, 00163 kIPAExtension, 00164 kSpacingModifier, 00165 kCombiningDiacritical, 00166 kGreek, 00167 kCyrillic, 00168 kArmenian, 00169 kHebrew, 00170 kArabic, 00171 kSyriac, 00172 kThaana, 00173 kDevanagari, 00174 kBengali, 00175 kGurmukhi, 00176 kGujarati, 00177 kOriya, 00178 kTamil, 00179 kTelugu, 00180 kKannada, 00181 kMalayalam, 00182 kSinhala, 00183 kThai, 00184 kLao, 00185 kTibetan, 00186 kMyanmar, 00187 kGeorgian, 00188 kHangulJamo, 00189 kEthiopic, 00190 kCherokee, 00191 kUnifiedCanadianAboriginalSyllabics, 00192 kogham, 00193 kRunic, 00194 kKhmer, 00195 kMongolian, 00196 kLatinExtendedAdditional, 00197 kGreekExtended, 00198 kGeneralPunctuation, 00199 kSuperSubScript, 00200 kCurrencySymbolScript, 00201 kSymbolCombiningMark, 00202 kLetterlikeSymbol, 00203 kNumberForm, 00204 kArrow, 00205 kMathOperator, 00206 kMiscTechnical, 00207 kControlPicture, 00208 kOpticalCharacter, 00209 kEnclosedAlphanumeric, 00210 kBoxDrawing, 00211 kBlockElement, 00212 kGeometricShape, 00213 kMiscSymbol, 00214 kDingbat, 00215 kBraillePatterns, 00216 kCJKRadicalsSupplement, 00217 kKangxiRadicals, 00218 kIdeographicDescriptionCharacters, 00219 kCJKSymbolPunctuation, 00220 kHiragana, 00221 kKatakana, 00222 kBopomofo, 00223 kHangulCompatibilityJamo, 00224 kKanbun, 00225 kBopomofoExtended, 00226 kEnclosedCJKLetterMonth, 00227 kCJKCompatibility, 00228 kCJKUnifiedIdeographExtensionA, 00229 kCJKUnifiedIdeograph, 00230 kYiSyllables, 00231 kYiRadicals, 00232 kHangulSyllable, 00233 kHighSurrogate, 00234 kHighPrivateUseSurrogate, 00235 kLowSurrogate, 00236 kPrivateUse, 00237 kCJKCompatibilityIdeograph, 00238 kAlphabeticPresentation, 00239 kArabicPresentationA, 00240 kCombiningHalfMark, 00241 kCJKCompatibilityForm, 00242 kSmallFormVariant, 00243 kArabicPresentationB, 00244 kNoScript, 00245 kHalfwidthFullwidthForm, 00246 kScriptCount=UBLOCK_COUNT 00247 }; 00248 00254 enum EDirectionProperty { 00255 LEFT_TO_RIGHT = 0, 00256 RIGHT_TO_LEFT = 1, 00257 EUROPEAN_NUMBER = 2, 00258 EUROPEAN_NUMBER_SEPARATOR = 3, 00259 EUROPEAN_NUMBER_TERMINATOR = 4, 00260 ARABIC_NUMBER = 5, 00261 COMMON_NUMBER_SEPARATOR = 6, 00262 BLOCK_SEPARATOR = 7, 00263 SEGMENT_SEPARATOR = 8, 00264 WHITE_SPACE_NEUTRAL = 9, 00265 OTHER_NEUTRAL = 10, 00266 LEFT_TO_RIGHT_EMBEDDING = 11, 00267 LEFT_TO_RIGHT_OVERRIDE = 12, 00268 RIGHT_TO_LEFT_ARABIC = 13, 00269 RIGHT_TO_LEFT_EMBEDDING = 14, 00270 RIGHT_TO_LEFT_OVERRIDE = 15, 00271 POP_DIRECTIONAL_FORMAT = 16, 00272 DIR_NON_SPACING_MARK = 17, 00273 BOUNDARY_NEUTRAL = 18 00274 }; 00275 00282 enum ECellWidths 00283 { 00284 ZERO_WIDTH = 0, 00285 HALF_WIDTH = 1, 00286 FULL_WIDTH = 2, 00287 NEUTRAL = 3 00288 }; 00289 00301 static inline UBool isSingle(UChar c); 00302 00312 static inline UBool isLead(UChar c); 00313 00323 static inline UBool isTrail(UChar c); 00324 00336 static inline UBool isSurrogate(UChar32 c); 00337 00351 static inline UBool isUnicodeChar(UChar32 c); 00352 00365 static inline UBool isError(UChar32 c); 00366 00377 static inline UBool isValid(UChar32 c); 00378 00391 static inline UBool needMultipleUChar(UChar32 c); 00392 00402 static inline int32_t charLength(UChar32 c); 00403 00418 static inline int32_t arraySize(int32_t size); 00419 00433 static inline UBool isLowerCase(UChar32 ch); 00434 00447 static inline UBool isUpperCase(UChar32 ch); 00448 00461 static inline UBool isTitleCase(UChar32 ch); 00462 00475 static inline UBool isDigit(UChar32 ch); 00476 00493 static inline UBool isDefined(UChar32 ch); 00494 00506 static inline UBool isControl(UChar32 ch); 00507 00519 static inline UBool isPrintable(UChar32 ch); 00520 00533 static inline UBool isBaseForm(UChar32 ch); 00534 00551 static inline UBool isLetter(UChar32 ch); 00552 00574 static inline UBool isJavaIdentifierStart(UChar32 ch); 00575 00605 static inline UBool isJavaIdentifierPart(UChar32 ch); 00606 00622 static inline UBool isUnicodeIdentifierStart(UChar32 ch); 00623 00651 static inline UBool isUnicodeIdentifierPart(UChar32 ch); 00652 00679 static inline UBool isIdentifierIgnorable(UChar32 ch); 00680 00706 static inline UChar32 toLowerCase(UChar32 ch); 00707 00730 static inline UChar32 toUpperCase(UChar32 ch); 00731 00750 static inline UChar32 toTitleCase(UChar32 ch); 00751 00766 static inline UChar32 00767 foldCase(UChar32 c, uint32_t options); 00768 00778 static inline UBool isSpaceChar(UChar32 ch); 00779 00809 static inline UBool isWhitespace(UChar32 ch); 00810 00846 static inline int8_t getType(UChar32 ch); 00847 00856 static inline uint8_t getCombiningClass(UChar32 c); 00857 00868 static inline EDirectionProperty characterDirection(UChar32 ch); 00869 00881 static inline UBool isMirrored(UChar32 c); 00882 00900 static inline UChar32 charMirror(UChar32 c); 00901 00907 static inline EUnicodeScript getScript(UChar32 ch); 00908 00961 static inline uint16_t getCellWidth(UChar32 ch); 00962 00991 static inline int32_t 00992 getCharName(uint32_t code, 00993 char *buffer, int32_t bufferLength, 00994 UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME); 00995 01007 static inline int32_t digitValue(UChar32 ch); 01008 01047 static inline int32_t digit(UChar32 ch, int8_t radix); 01048 01077 static inline UChar32 forDigit(int32_t digit, int8_t radix); 01078 01085 static void getUnicodeVersion(UVersionInfo info); 01086 01087 protected: 01088 // These constructors, destructor, and assignment operator must 01089 // be protected (not private, as they semantically are) to make 01090 // various UNIX compilers happy. [LIU] 01091 // They should be private to prevent anyone from instantiating or 01092 // subclassing Unicode. 01093 Unicode(); 01094 Unicode(const Unicode &other); 01095 ~Unicode(); 01096 const Unicode &operator=(const Unicode &other); 01097 }; 01098 01099 /* inline implementations --------------------------------------------------- */ 01100 01101 inline UBool 01102 Unicode::isSingle(UChar c) { 01103 return UTF_IS_SINGLE(c); 01104 } 01105 01106 inline UBool 01107 Unicode::isLead(UChar c) { 01108 return UTF_IS_LEAD(c); 01109 } 01110 01111 inline UBool 01112 Unicode::isTrail(UChar c) { 01113 return UTF_IS_TRAIL(c); 01114 } 01115 01116 inline UBool 01117 Unicode::isSurrogate(UChar32 c) { 01118 return UTF_IS_SURROGATE(c); 01119 } 01120 01121 inline UBool 01122 Unicode::isUnicodeChar(UChar32 c) { 01123 return UTF_IS_UNICODE_CHAR(c); 01124 } 01125 01126 inline UBool 01127 Unicode::isError(UChar32 c) { 01128 return UTF_IS_ERROR(c); 01129 } 01130 01131 inline UBool 01132 Unicode::isValid(UChar32 c) { 01133 return UTF_IS_VALID(c); 01134 } 01135 01136 inline UBool 01137 Unicode::needMultipleUChar(UChar32 c) { 01138 return UTF_NEED_MULTIPLE_UCHAR(c); 01139 } 01140 01141 inline int32_t 01142 Unicode::charLength(UChar32 c) { 01143 return UTF_CHAR_LENGTH(c); 01144 } 01145 01146 inline int32_t 01147 Unicode::arraySize(int32_t size) { 01148 return UTF_ARRAY_SIZE(size); 01149 } 01150 01151 // Checks if ch is a lower case letter. 01152 inline UBool 01153 Unicode::isLowerCase(UChar32 ch) { 01154 return u_islower(ch); 01155 } 01156 01157 // Checks if ch is a upper case letter. 01158 inline UBool 01159 Unicode::isUpperCase(UChar32 ch) { 01160 return u_isupper(ch); 01161 } 01162 01163 // Checks if ch is a title case letter; usually upper case letters. 01164 inline UBool 01165 Unicode::isTitleCase(UChar32 ch) { 01166 return u_istitle(ch); 01167 } 01168 01169 // Checks if ch is a decimal digit. 01170 inline UBool 01171 Unicode::isDigit(UChar32 ch) { 01172 return u_isdigit(ch); 01173 } 01174 01175 // Checks if ch is a unicode character with assigned character type. 01176 inline UBool 01177 Unicode::isDefined(UChar32 ch) { 01178 return u_isdefined(ch); 01179 } 01180 01181 // Checks if the Unicode character is a control character. 01182 inline UBool 01183 Unicode::isControl(UChar32 ch) { 01184 return u_iscntrl(ch); 01185 } 01186 01187 // Checks if the Unicode character is printable. 01188 inline UBool 01189 Unicode::isPrintable(UChar32 ch) { 01190 return u_isprint(ch); 01191 } 01192 01193 // Checks if the Unicode character is a base form character that can take a diacritic. 01194 inline UBool 01195 Unicode::isBaseForm(UChar32 ch) { 01196 return u_isbase(ch); 01197 } 01198 01199 // Checks if the Unicode character is a letter. 01200 inline UBool 01201 Unicode::isLetter(UChar32 ch) { 01202 return u_isalpha(ch); 01203 } 01204 01205 // Checks if the Unicode character can start a Java identifier. 01206 inline UBool 01207 Unicode::isJavaIdentifierStart(UChar32 ch) { 01208 return u_isJavaIDStart(ch); 01209 } 01210 01211 // Checks if the Unicode character can be a Java identifier part other than starting the 01212 // identifier. 01213 inline UBool 01214 Unicode::isJavaIdentifierPart(UChar32 ch) { 01215 return u_isJavaIDPart(ch); 01216 } 01217 01218 // Checks if the Unicode character can start a Unicode identifier. 01219 inline UBool 01220 Unicode::isUnicodeIdentifierStart(UChar32 ch) { 01221 return u_isIDStart(ch); 01222 } 01223 01224 // Checks if the Unicode character can be a Unicode identifier part other than starting the 01225 // identifier. 01226 inline UBool 01227 Unicode::isUnicodeIdentifierPart(UChar32 ch) { 01228 return u_isIDPart(ch); 01229 } 01230 01231 // Checks if the Unicode character can be ignorable in a Java or Unicode identifier. 01232 inline UBool 01233 Unicode::isIdentifierIgnorable(UChar32 ch) { 01234 return u_isIDIgnorable(ch); 01235 } 01236 01237 // Transforms the Unicode character to its lower case equivalent. 01238 inline UChar32 01239 Unicode::toLowerCase(UChar32 ch) { 01240 return u_tolower(ch); 01241 } 01242 01243 // Transforms the Unicode character to its upper case equivalent. 01244 inline UChar32 01245 Unicode::toUpperCase(UChar32 ch) { 01246 return u_toupper(ch); 01247 } 01248 01249 // Transforms the Unicode character to its title case equivalent. 01250 inline UChar32 01251 Unicode::toTitleCase(UChar32 ch) { 01252 return u_totitle(ch); 01253 } 01254 01255 // Transforms the Unicode character to its case folded equivalent. 01256 inline UChar32 01257 Unicode::foldCase(UChar32 ch, uint32_t options) { 01258 return u_foldCase(ch, options); 01259 } 01260 01261 // Checks if the Unicode character is a space character. 01262 inline UBool 01263 Unicode::isSpaceChar(UChar32 ch) { 01264 return u_isspace(ch); 01265 } 01266 01267 // Determines if the specified character is white space according to ICU. 01268 inline UBool 01269 Unicode::isWhitespace(UChar32 ch) { 01270 return u_isWhitespace(ch); 01271 } 01272 01273 // Gets if the Unicode character's character property. 01274 inline int8_t 01275 Unicode::getType(UChar32 ch) { 01276 return u_charType(ch); 01277 } 01278 01279 inline uint8_t 01280 Unicode::getCombiningClass(UChar32 c) { 01281 return u_getCombiningClass(c); 01282 } 01283 01284 // Gets the character's linguistic directionality. 01285 inline Unicode::EDirectionProperty 01286 Unicode::characterDirection(UChar32 ch) { 01287 return (EDirectionProperty)u_charDirection(ch); 01288 } 01289 01290 // Determines if the character has the "mirrored" property. 01291 inline UBool 01292 Unicode::isMirrored(UChar32 ch) { 01293 return u_isMirrored(ch); 01294 } 01295 01296 // Maps the character to a "mirror-image" character, or to itself. 01297 inline UChar32 01298 Unicode::charMirror(UChar32 ch) { 01299 return u_charMirror(ch); 01300 } 01301 01302 // Get the script associated with the character 01303 inline Unicode::EUnicodeScript 01304 Unicode::getScript(UChar32 ch) { 01305 return (EUnicodeScript) u_charScript(ch); 01306 } 01307 01308 // Gets table cell width of the Unicode character. 01309 inline uint16_t 01310 Unicode::getCellWidth(UChar32 ch) { 01311 return u_charCellWidth(ch); 01312 } 01313 01314 inline int32_t 01315 Unicode::getCharName(uint32_t code, 01316 char *buffer, int32_t bufferLength, 01317 UCharNameChoice nameChoice) { 01318 UErrorCode errorCode=U_ZERO_ERROR; 01319 int32_t length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode); 01320 return U_SUCCESS(errorCode) ? length : 0; 01321 } 01322 01323 inline int32_t 01324 Unicode::digitValue(UChar32 ch) { 01325 return u_charDigitValue(ch); 01326 } 01327 01328 inline int32_t 01329 Unicode::digit(UChar32 ch, int8_t radix) { 01330 return u_digit(ch, radix); 01331 } 01332 01333 inline UChar32 01334 Unicode::forDigit(int32_t digit, int8_t radix) { 01335 return u_forDigit(digit, radix); 01336 } 01337 01338 inline void 01339 Unicode::getUnicodeVersion(UVersionInfo versionArray) { 01340 u_getUnicodeVersion(versionArray); 01341 } 01342 U_NAMESPACE_END 01343 01344 #endif

Generated on Wed Aug 18 05:18:14 2004 for ICU 2.1 by doxygen 1.3.7