Main Page | Class Hierarchy | Alphabetical List | Data Structures | Directories | File List | Data Fields | Globals | Related Pages

regex.h

Go to the documentation of this file.
00001 /*
00002 **********************************************************************
00003 *   Copyright (C) 2002-2003, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  regex.h
00007 *   encoding:   US-ASCII
00008 *   indentation:4
00009 *
00010 *   created on: 2002oct22
00011 *   created by: Andy Heninger
00012 *
00013 *   ICU Regular Expressions, API for C++
00014 */
00015 
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018 
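00019 // #define REGEX_DEBUG      // Uncomment to enable the REGEX_DEBUG-only declarations below (RegexPatternDump, RegexPattern::dumpOp).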
00020 
00040 #include "unicode/utypes.h"
00041 
00042 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00043 
00044 #include "unicode/uobject.h"
00045 #include "unicode/unistr.h"
00046 #include "unicode/parseerr.h"
00047 
00048 U_NAMESPACE_BEGIN
00049 
00050 
00051 // Forward declarations for types this header names before (or without) seeing their full definitions.
00052 
00053 class RegexMatcher;               // Defined later in this header; returned by RegexPattern::matcher().
00054 class RegexPattern;               // Defined later in this header; needed early by RegexPatternDump().
00055 class UVector;                    // Used only as a pointer: RegexPattern::fSets.
00056 class UVector32;                  // Used only as a pointer: fCompiledPat, fGroupMap, RegexMatcher::fStack.
00057 class UnicodeSet;                 // Used only through pointers: fStaticSets, fInitialChars.
00058 struct REStackFrame;              // Used only as a pointer: RegexMatcher::fFrame, resetStack(), StateSave().
00059 struct Regex8BitSet;              // Used only as a pointer: fSets8, fStaticSets8, fInitialChars8.
00060 class  RuleBasedBreakIterator;    // Used only as a pointer: RegexMatcher::fWordBreakItr.
00061 
00062 
00063 
00068 enum {   // Anonymous enum of UREGEX_* constants.  Every value is a distinct power of two, so they can be OR-ed into one integer.
00070     UREGEX_CANON_EQ         = 128,   // 0x080.  NOTE(review): name suggests canonical-equivalence matching - confirm.
00071 
00073     UREGEX_CASE_INSENSITIVE = 2,     // 0x002.  NOTE(review): name suggests case-insensitive matching - confirm.
00074 
00076     UREGEX_COMMENTS         = 4,     // 0x004.  NOTE(review): name suggests comments/white space allowed in patterns - confirm.
00077 
00080     UREGEX_DOTALL           = 32,    // 0x020.  NOTE(review): name suggests '.' also matches line terminators - confirm.
00081 
00086     UREGEX_MULTILINE        = 8,     // 0x008.  NOTE(review): name suggests '^' and '$' match at line boundaries - confirm.
00087 
00095     UREGEX_UWORD            = 256    // 0x100.  NOTE(review): presumably selects the RBBI-based \b test (see RegexMatcher::isUWordBoundary) - confirm.
00096 };   // Bits 1, 16 and 64 are not assigned in this enum.  NOTE(review): presumably these are the values for the uint32_t 'flags' parameters below - confirm.
00097 
00098 
00099 
00100 
00105 #ifdef REGEX_DEBUG   // Debug builds: RegexPatternDump is a real function taking the pattern to dump.
00106 U_CAPI void U_EXPORT2   // U_CAPI / U_EXPORT2 are linkage/export macros defined outside this header.
00107     RegexPatternDump(const RegexPattern *pat);
00108 #else                // Otherwise: a function-like macro that expands to nothing, so calls vanish
00109     #define RegexPatternDump(pat)   // and the argument expression is never evaluated.
00110 #endif
00111 
00112 
00113 
00125 class U_I18N_API RegexPattern: public UObject {   // A regular expression in compiled form; also creates RegexMatcher objects for it.
00126 public:
00127     // Default constructor.  init() (private, below) is the common initialization used by constructors.
00135     RegexPattern();
00136     // Copy constructor.
00142     RegexPattern(const RegexPattern &source);
00143     // Virtual destructor.
00149     virtual ~RegexPattern();
00150     // Equality test.  The comparison criteria live in the implementation, not in this header.
00159     UBool           operator==(const RegexPattern& that) const;
00160     // Inequality: defined inline as the exact negation of operator==.
00169     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);};
00170     // Copy assignment; returns a reference to a RegexPattern.
00176     RegexPattern  &operator =(const RegexPattern &source);
00177     // Virtual copy.  NOTE(review): the result is presumably heap-allocated and owned by the caller - confirm.
00185     virtual RegexPattern  *clone() const;
00186     // Static factory, three overloads.  'regex' is the pattern text; 'pe' and 'status' are non-const references (outputs).
00187     // NOTE(review): the returned pointer is presumably caller-owned, and NULL on failure - confirm in the implementation.
00208     static RegexPattern *compile( const UnicodeString &regex,
00209         UParseError          &pe,
00210         UErrorCode           &status);
00211     // Overload with 'flags'.  NOTE(review): presumably an OR of the UREGEX_* values - confirm.
00232     static RegexPattern *compile( const UnicodeString &regex,
00233         uint32_t             flags,
00234         UParseError          &pe,
00235         UErrorCode           &status);
00236 
00237     // Overload with 'flags' but without a UParseError output.
00256     static RegexPattern *compile( const UnicodeString &regex,
00257         uint32_t             flags,
00258         UErrorCode           &status);
00259 
00260     // Returns a uint32_t.  NOTE(review): presumably the fFlags value recorded when the pattern was compiled - confirm.
00266     virtual uint32_t flags() const;
00267     // Returns a RegexMatcher for this pattern and 'input'.  RegexMatcher stores a const UnicodeString* (fInput); NOTE(review): 'input' presumably must outlive the matcher - confirm.
00280     virtual RegexMatcher *matcher(const UnicodeString &input,
00281         UErrorCode          &status) const;
00282 
00283     // Overload without an input string.  NOTE(review): input is presumably supplied later via RegexMatcher::reset(const UnicodeString &) - confirm.
00295     virtual RegexMatcher *matcher(UErrorCode  &status) const;
00296 
00297     // Static one-shot form taking both pattern text and input.  NOTE(review): presumably compiles 'regex' and reports whether it matches 'input' - confirm.
00312     static UBool matches(const UnicodeString   &regex,
00313         const UnicodeString   &input,
00314         UParseError     &pe,
00315         UErrorCode      &status);
00316 
00317     // Returns a UnicodeString by value.  NOTE(review): presumably a copy of fPattern, the original pattern string - confirm.
00322     virtual UnicodeString pattern() const;
00323 
00324     // 'dest' is a caller-supplied array of 'destCapacity' UnicodeStrings.  NOTE(review): presumably filled with the pieces of 'input', returning how many were filled - confirm.
00350     virtual int32_t  split(const UnicodeString &input,
00351         UnicodeString    dest[],
00352         int32_t          destCapacity,
00353         UErrorCode       &status) const;
00354 
00355     // Virtual accessor returning a UClassID for this object.
00361     virtual UClassID getDynamicClassID() const; 
00362     // Static accessor returning a UClassID; needs no instance.
00368     static UClassID getStaticClassID(); 
00369 
00370 private:
00371     //
00372     //  Implementation Data
00373     //
00374     UnicodeString   fPattern;      // The original pattern string.
00375     uint32_t        fFlags;        // The flags used when compiling the pattern.
00376                                    //
00377     UVector32       *fCompiledPat; // The compiled pattern p-code.
00378     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
00379                                    //   after un-escaping, for use during the match.
00380 
00381     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
00382     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
00383 
00384 
00385     UErrorCode      fDeferredStatus; // status if some prior error has left this
00386                                    //  RegexPattern in an unusable state.
00387 
00388     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
00389                                    //   >= this value.  For some patterns, this calculated
00390                                    //   value may be less than the true shortest
00391                                    //   possible match.
00392 
00393     int32_t         fFrameSize;    // Size of a state stack frame in the
00394                                    //   execution engine.
00395 
00396     int32_t         fDataSize;     // The size of the data needed by the pattern that
00397                                    //   does not go on the state stack, but has just
00398                                    //   a single copy per matcher.
00399 
00400     UVector32       *fGroupMap;    // Map from capture group number to position of
00401                                    //   the group's variables in the matcher stack frame.
00402 
00403     int32_t         fMaxCaptureDigits;   // NOTE(review): undocumented here; name suggests the digit count of the largest capture group number - confirm.
00404 
00405     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
00406                                    //   regex character classes, e.g. Word.
00407 
00408     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
00409                                    //  sets for predefined regex classes.
00410 
00411     int32_t         fStartType;    // Info on how a match must start.
00412     int32_t         fInitialStringIdx;     //  NOTE(review): the fInitial* fields are undocumented here; presumably
00413     int32_t         fInitialStringLen;     //    data describing how a match may begin, selected by fStartType -
00414     UnicodeSet     *fInitialChars;         //    confirm against the pattern compiler before relying on them.
00415     UChar32         fInitialChar;
00416     Regex8BitSet   *fInitialChars8;
00417     // Friends get direct access to the private members of this class.  RegexCompile is not declared in this header.
00418     friend class RegexCompile;
00419     friend class RegexMatcher;
00420 
00421     //
00422     //  Implementation Methods
00423     //
00424     void        init();            // Common initialization, for use by constructors.
00425     void        zap();             // Common cleanup
00426 #ifdef REGEX_DEBUG                 // Debug-only members: declared only when REGEX_DEBUG is defined.
00427     void        dumpOp(int32_t index) const;
00428     friend     void RegexPatternDump(const RegexPattern *);
00429 #endif
00430 
00431 };
00432 
00433 
00434 
00444 class U_I18N_API RegexMatcher: public UObject {   // Match state for one RegexPattern applied to one input string (see fPattern, fInput below).
00445 public:
00446     // Builds a matcher from pattern text and flags, with no input string.  NOTE(review): presumably compiles the pattern and owns it via fPatternOwned - confirm.
00461     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
00462     // As above, plus an input string.  Only a const UnicodeString* is stored (fInput); NOTE(review): 'input' presumably must outlive the matcher - confirm.
00478     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
00479         uint32_t flags, UErrorCode &status);
00480 
00481     // Virtual destructor.
00487     virtual ~RegexMatcher();
00488 
00489     // Match attempt returning UBool.  NOTE(review): presumably requires the whole input to match - confirm.
00496     virtual UBool matches(UErrorCode &status);
00497     // Overload taking a starting index into the input.
00506     virtual UBool matches(int32_t startIndex, UErrorCode &status);
00507 
00508 
00509 
00510     // Match attempt returning UBool.  NOTE(review): presumably anchored at the start but not required to reach the end of input - confirm.
00523     virtual UBool lookingAt(UErrorCode &status);
00524 
00525     // Overload taking a starting index into the input.
00539     virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
00540     // Match attempt with no UErrorCode parameter, so it cannot report an error to the caller directly.  NOTE(review): presumably finds the next match - confirm.
00553     virtual UBool find();
00554 
00555     // Overload taking a start position and a UErrorCode.
00565     virtual UBool find(int32_t start, UErrorCode &status);
00566 
00567     // Returns a UnicodeString by value.  NOTE(review): presumably the text of the most recent match - confirm.
00577     virtual UnicodeString group(UErrorCode &status) const;
00578 
00579     // Overload selecting a group by number (int32_t).
00592     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00593 
00594     // Returns an int32_t.  NOTE(review): presumably the number of capture groups in the pattern - confirm.
00600     virtual int32_t groupCount() const;
00601 
00602     // Returns an int32_t position.  NOTE(review): presumably fMatchStart, the start of the most recent match - confirm.
00610     virtual int32_t start(UErrorCode &status) const;
00611 
00612     // Overload selecting a group.  Note the parameter is plain 'int', whereas group() above takes int32_t.
00626     virtual int32_t start(int group, UErrorCode &status) const;
00627 
00628     // Returns an int32_t position.  NOTE(review): presumably fMatchEnd, the first position after the most recent match - confirm.
00638     virtual int32_t end(UErrorCode &status) const;
00639 
00640     // Overload selecting a group.  Parameter is plain 'int', as for start(int, ...).
00654     virtual int32_t end(int group, UErrorCode &status) const;
00655 
00656     // The three reset() overloads each return a reference to a RegexMatcher.  This one takes no arguments.
00665     virtual RegexMatcher &reset();
00666 
00667     // Overload taking an index into the input and a UErrorCode.
00677     virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
00678 
00679     // Overload taking a new input string.  NOTE(review): presumably stores its address in fInput, so it must outlive the matcher - confirm.
00687     virtual RegexMatcher &reset(const UnicodeString &input);
00688 
00689     // Returns a const reference to a UnicodeString.  NOTE(review): presumably the string fInput points to - confirm.
00696     virtual const UnicodeString &input() const;
00697 
00698     // Returns a const reference to a RegexPattern.  NOTE(review): presumably *fPattern - confirm.
00704     virtual const RegexPattern &pattern() const;
00705 
00706     // Returns a new UnicodeString by value.  NOTE(review): presumably the input with every match replaced - confirm replacement-string syntax.
00723     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
00724 
00725     // Returns a new UnicodeString by value.  NOTE(review): presumably the input with only the first match replaced - confirm.
00746     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
00747     // 'dest' is a non-const reference, i.e. written to.  NOTE(review): presumably one step of an incremental replace loop, finished by appendTail() - confirm.
00775     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
00776         const UnicodeString &replacement, UErrorCode &status);
00777 
00778     // Takes and returns a non-const UnicodeString reference.  NOTE(review): presumably appends the input after the last match (see fLastMatchEnd) to 'dest' - confirm.
00789     virtual UnicodeString &appendTail(UnicodeString &dest);
00790 
00791 
00792     // Non-const counterpart of RegexPattern::split(); 'dest' is a caller-supplied array of 'destCapacity' UnicodeStrings.
00817     virtual int32_t  split(const UnicodeString &input,
00818         UnicodeString    dest[],
00819         int32_t          destCapacity,
00820         UErrorCode       &status);
00821 
00822 
00823     // Non-virtual.  NOTE(review): presumably sets fTraceDebug, the match-engine debug tracing flag - confirm.
00829     void setTrace(UBool state);
00830 
00831     // Static accessor returning a UClassID; needs no instance.
00837     static UClassID getStaticClassID();
00838     // Virtual accessor returning a UClassID for this object.
00844     virtual UClassID getDynamicClassID() const;
00845 
00846 private:
00847     // Constructors and other object boilerplate are private.
00848     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
00849     RegexMatcher(); // default constructor not implemented
00850     RegexMatcher(const RegexPattern *pat);   // Reachable only by this class and its friend RegexPattern.
00851     RegexMatcher(const RegexMatcher &other);
00852     RegexMatcher &operator =(const RegexMatcher &rhs);
00853     friend class RegexPattern;
00854 
00855 
00856     //
00857     //  MatchAt   This is the internal interface to the match engine itself.
00858     //            Match status comes back in matcher member variables.
00859     //
00860     void                 MatchAt(int32_t startIdx, UErrorCode &status);
00861     inline void          backTrack(int32_t &inputIdx, int32_t &patIdx);   // Both indices are non-const references, i.e. updated in place.
00862     UBool                isWordBoundary(int32_t pos);         // perform Perl-like  \b test
00863     UBool                isUWordBoundary(int32_t pos);        // perform RBBI based \b test
00864     REStackFrame        *resetStack();
00865     inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,
00866                                    int32_t frameSize, UErrorCode &status);
00867 
00868 
00869     const RegexPattern  *fPattern;         // The pattern being matched; never modified through this pointer.
00870     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
00871                                            //   should delete it when through.
00872     const UnicodeString *fInput;           // The input string; held by pointer, not copied.
00873 
00874     UBool                fMatch;           // True if the last match was successful.
00875     int32_t              fMatchStart;      // Position of the start of the most recent match
00876     int32_t              fMatchEnd;        // First position after the end of the most recent match
00877     int32_t              fLastMatchEnd;    // First position after the end of the previous match.
00878 
00879     UVector32           *fStack;           // NOTE(review): undocumented here; presumably the backing store for the REStackFrame state stack - confirm.
00880     REStackFrame        *fFrame;           // After finding a match, the last active stack
00881                                            //   frame, which will contain the capture group results.
00882                                            //   NOT valid while match engine is running.
00883 
00884     int32_t             *fData;            // Data area for use by the compiled pattern.
00885     int32_t             fSmallData[8];     //   Use this for data if it's enough.
00886 
00887     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
00888 
00889     UErrorCode          fDeferredStatus;   // Save error state if that cannot be immediately
00890                                            //   reported, or that permanently disables this matcher.
00891 
00892     RuleBasedBreakIterator  *fWordBreakItr;   // NOTE(review): undocumented here; presumably the break iterator behind isUWordBoundary() - confirm.
00893 
00894 };
00895 
00896 U_NAMESPACE_END
00897 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
00898 #endif  // REGEX_H

Generated on Wed May 18 17:29:14 2005 for ICU 2.8 by  doxygen 1.4.2