Main Page | Class Hierarchy | Alphabetical List | Data Structures | File List | Data Fields | Globals | Related Pages

regex.h

Go to the documentation of this file.
00001 /* 00002 ********************************************************************** 00003 * Copyright (C) 2002-2003, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ********************************************************************** 00006 * file name: regex.h 00007 * encoding: US-ASCII 00008 * indentation:4 00009 * 00010 * created on: 2002oct22 00011 * created by: Andy Heninger 00012 * 00013 * ICU Regular Expressions, API for C++ 00014 */ 00015 00016 #ifndef REGEX_H 00017 #define REGEX_H 00018 00019 // #define REGEX_DEBUG 00020 00040 #include "unicode/utypes.h" 00041 00042 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 00043 00044 #include "unicode/uobject.h" 00045 #include "unicode/unistr.h" 00046 #include "unicode/parseerr.h" 00047 00048 U_NAMESPACE_BEGIN 00049 00050 00051 // Forward Declarations... 00052 00053 class RegexMatcher; 00054 class RegexPattern; 00055 class UVector; 00056 class UVector32; 00057 class UnicodeSet; 00058 struct REStackFrame; 00059 struct Regex8BitSet; 00060 class RuleBasedBreakIterator; 00061 00062 00063 00068 enum { 00070 UREGEX_CANON_EQ = 128, 00071 00073 UREGEX_CASE_INSENSITIVE = 2, 00074 00076 UREGEX_COMMENTS = 4, 00077 00080 UREGEX_DOTALL = 32, 00081 00086 UREGEX_MULTILINE = 8, 00087 00095 UREGEX_UWORD = 256 00096 }; 00097 00098 00099 00100 00105 #ifdef REGEX_DEBUG 00106 U_CAPI void U_EXPORT2 00107 RegexPatternDump(const RegexPattern *pat); 00108 #else 00109 #define RegexPatternDump(pat) 00110 #endif 00111 00112 00113 00125 class U_I18N_API RegexPattern: public UObject { 00126 public: 00127 00135 RegexPattern(); 00136 00142 RegexPattern(const RegexPattern &source); 00143 00149 virtual ~RegexPattern(); 00150 00159 UBool operator==(const RegexPattern& that) const; 00160 00169 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}; 00170 00176 RegexPattern &operator =(const RegexPattern &source); 00177 00185 virtual RegexPattern *clone() const; 00186 00187 00208 static RegexPattern *compile( const UnicodeString &regex, 00209 UParseError &pe, 00210 UErrorCode &status); 00211 00232 static RegexPattern *compile( const UnicodeString &regex, 00233 uint32_t flags, 00234 UParseError &pe, 00235 UErrorCode &status); 00236 00237 00256 static RegexPattern *compile( const UnicodeString &regex, 00257 uint32_t flags, 00258 UErrorCode &status); 00259 00260 00266 virtual uint32_t flags() const; 00267 00280 virtual RegexMatcher *matcher(const UnicodeString &input, 00281 UErrorCode &status) const; 00282 00283 00295 virtual RegexMatcher *matcher(UErrorCode &status) const; 00296 00297 00312 static UBool matches(const UnicodeString &regex, 00313 const UnicodeString &input, 00314 UParseError &pe, 00315 UErrorCode &status); 00316 00317 00322 virtual UnicodeString pattern() const; 00323 00324 00350 virtual int32_t split(const UnicodeString &input, 00351 UnicodeString dest[], 00352 int32_t destCapacity, 00353 UErrorCode &status) const; 00354 00355 00361 virtual UClassID getDynamicClassID() const; 00362 00368 static UClassID getStaticClassID(); 00369 00370 private: 00371 // 00372 // Implementation Data 00373 // 00374 UnicodeString fPattern; // The original pattern string. 00375 uint32_t fFlags; // The flags used when compiling the pattern. 00376 // 00377 UVector32 *fCompiledPat; // The compiled pattern p-code. 00378 UnicodeString fLiteralText; // Any literal string data from the pattern, 00379 // after un-escaping, for use during the match. 00380 00381 UVector *fSets; // Any UnicodeSets referenced from the pattern. 00382 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) 00383 00384 00385 UErrorCode fDeferredStatus; // status if some prior error has left this 00386 // RegexPattern in an unusable state. 00387 00388 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length 00389 // >= this value. For some patterns, this calculated 00390 // value may be less than the true shortest 00391 // possible match. 00392 00393 int32_t fFrameSize; // Size of a state stack frame in the 00394 // execution engine. 00395 00396 int32_t fDataSize; // The size of the data needed by the pattern that 00397 // does not go on the state stack, but has just 00398 // a single copy per matcher. 00399 00400 UVector32 *fGroupMap; // Map from capture group number to position of 00401 // the group's variables in the matcher stack frame. 00402 00403 int32_t fMaxCaptureDigits; 00404 00405 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined 00406 // regex character classes, e.g. Word. 00407 00408 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only 00409 // sets for predefined regex classes. 00410 00411 int32_t fStartType; // Info on how a match must start. 00412 int32_t fInitialStringIdx; // 00413 int32_t fInitialStringLen; 00414 UnicodeSet *fInitialChars; 00415 UChar32 fInitialChar; 00416 Regex8BitSet *fInitialChars8; 00417 00418 friend class RegexCompile; 00419 friend class RegexMatcher; 00420 00421 // 00422 // Implementation Methods 00423 // 00424 void init(); // Common initialization, for use by constructors. 00425 void zap(); // Common cleanup 00426 #ifdef REGEX_DEBUG 00427 void dumpOp(int32_t index) const; 00428 friend void RegexPatternDump(const RegexPattern *); 00429 #endif 00430 00431 }; 00432 00433 00434 00444 class U_I18N_API RegexMatcher: public UObject { 00445 public: 00446 00461 RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status); 00462 00478 RegexMatcher(const UnicodeString &regexp, const UnicodeString &input, 00479 uint32_t flags, UErrorCode &status); 00480 00481 00487 virtual ~RegexMatcher(); 00488 00489 00496 virtual UBool matches(UErrorCode &status); 00497 00506 virtual UBool matches(int32_t startIndex, UErrorCode &status); 00507 00508 00509 00510 00523 virtual UBool lookingAt(UErrorCode &status); 00524 00525 00539 virtual UBool lookingAt(int32_t startIndex, UErrorCode &status); 00540 00553 virtual UBool find(); 00554 00555 00565 virtual UBool find(int32_t start, UErrorCode &status); 00566 00567 00577 virtual UnicodeString group(UErrorCode &status) const; 00578 00579 00592 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 00593 00594 00600 virtual int32_t groupCount() const; 00601 00602 00610 virtual int32_t start(UErrorCode &status) const; 00611 00612 00626 virtual int32_t start(int group, UErrorCode &status) const; 00627 00628 00638 virtual int32_t end(UErrorCode &status) const; 00639 00640 00654 virtual int32_t end(int group, UErrorCode &status) const; 00655 00656 00665 virtual RegexMatcher &reset(); 00666 00667 00677 virtual RegexMatcher &reset(int32_t index, UErrorCode &status); 00678 00679 00687 virtual RegexMatcher &reset(const UnicodeString &input); 00688 00689 00696 virtual const UnicodeString &input() const; 00697 00698 00704 virtual const RegexPattern &pattern() const; 00705 00706 00723 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 00724 00725 00746 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 00747 00775 virtual RegexMatcher &appendReplacement(UnicodeString &dest, 00776 const UnicodeString &replacement, UErrorCode &status); 00777 00778 00789 virtual UnicodeString &appendTail(UnicodeString &dest); 00790 00791 00792 00817 virtual int32_t split(const UnicodeString &input, 00818 UnicodeString dest[], 00819 int32_t destCapacity, 00820 UErrorCode &status); 00821 00822 00823 00829 void setTrace(UBool state); 00830 00831 00837 static UClassID getStaticClassID(); 00838 00844 virtual UClassID getDynamicClassID() const; 00845 00846 private: 00847 // Constructors and other object boilerplate are private. 00848 // Instances of RegexMatcher can not be assigned, copied, cloned, etc. 00849 RegexMatcher(); // default constructor not implemented 00850 RegexMatcher(const RegexPattern *pat); 00851 RegexMatcher(const RegexMatcher &other); 00852 RegexMatcher &operator =(const RegexMatcher &rhs); 00853 friend class RegexPattern; 00854 00855 00856 // 00857 // MatchAt This is the internal interface to the match engine itself. 00858 // Match status comes back in matcher member variables. 00859 // 00860 void MatchAt(int32_t startIdx, UErrorCode &status); 00861 inline void backTrack(int32_t &inputIdx, int32_t &patIdx); 00862 UBool isWordBoundary(int32_t pos); // perform Perl-like \b test 00863 UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test 00864 REStackFrame *resetStack(); 00865 inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, 00866 int32_t frameSize, UErrorCode &status); 00867 00868 00869 const RegexPattern *fPattern; 00870 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and 00871 // should delete it when through. 00872 const UnicodeString *fInput; 00873 00874 UBool fMatch; // True if the last match was successful. 00875 int32_t fMatchStart; // Position of the start of the most recent match 00876 int32_t fMatchEnd; // First position after the end of the most recent match 00877 int32_t fLastMatchEnd; // First position after the end of the previous match. 00878 00879 UVector32 *fStack; 00880 REStackFrame *fFrame; // After finding a match, the last active stack 00881 // frame, which will contain the capture group results. 00882 // NOT valid while match engine is running. 00883 00884 int32_t *fData; // Data area for use by the compiled pattern. 00885 int32_t fSmallData[8]; // Use this for data if it's enough. 00886 00887 UBool fTraceDebug; // Set true for debug tracing of match engine. 00888 00889 UErrorCode fDeferredStatus; // Save error state if that cannot be immediately 00890 // reported, or that permanently disables this matcher. 00891 00892 RuleBasedBreakIterator *fWordBreakItr; 00893 00894 }; 00895 00896 U_NAMESPACE_END 00897 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 00898 #endif

Generated on Wed Jul 28 09:15:54 2004 for ICU 2.8 by doxygen 1.3.7