mbuiter.h

00001 /* Iterating through multibyte strings: macros for multi-byte encodings.
00002    Copyright (C) 2001, 2005 Free Software Foundation, Inc.
00003 
00004    This program is free software; you can redistribute it and/or modify
00005    it under the terms of the GNU Lesser General Public License as published by
00006    the Free Software Foundation; either version 2.1, or (at your option)
00007    any later version.
00008 
00009    This program is distributed in the hope that it will be useful,
00010    but WITHOUT ANY WARRANTY; without even the implied warranty of
00011    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012    GNU Lesser General Public License for more details.
00013 
00014    You should have received a copy of the GNU Lesser General Public License
00015    along with this program; if not, write to the Free Software Foundation,
00016    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
00017 
00018 /* Written by Bruno Haible <bruno@clisp.org>.  */
00019 
00020 /* The macros in this file implement forward iteration through a
00021    multi-byte string, without knowing its length a-priori.
00022 
00023    With these macros, an iteration loop that looks like
00024 
00025       char *iter;
00026       for (iter = buf; *iter != '\0'; iter++)
00027         {
00028           do_something (*iter);
00029         }
00030 
00031    becomes
00032 
00033       mbui_iterator_t iter;
00034       for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter))
00035         {
00036           do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
00037         }
00038 
00039    The benefit of these macros over plain use of mbrtowc is:
00040    - Handling of invalid multibyte sequences is possible without
00041      making the code more complicated, while still preserving the
00042      invalid multibyte sequences.
00043 
00044    Compared to mbiter.h, the macros here don't need to know the string's
00045    length a-priori.  The downside is that at each step, the look-ahead
00046    that guards against overrunning the terminating '\0' is more expensive.
00047    The mbui_* macros are therefore suitable when there is a high probability
00048    that only the first few multibyte characters need to be inspected,
00049    whereas the mbi_* macros are better if the iteration usually runs
00050    through the entire string.
00051 
00052    mbui_iterator_t
00053      is a type usable for variable declarations.
00054 
00055    mbui_init (iter, startptr)
00056      initializes the iterator, starting at startptr.
00057 
00058    mbui_avail (iter)
00059      returns true if there are more multibyte characters available before
00060      the end of string is reached. In this case, mbui_cur (iter) is
00061      initialized to the next multibyte character.
00062 
00063    mbui_advance (iter)
00064      advances the iterator by one multibyte character.
00065 
00066    mbui_cur (iter)
00067      returns the current multibyte character, of type mbchar_t.  All the
00068      macros defined in mbchar.h can be used on it.
00069 
00070    mbui_cur_ptr (iter)
00071      returns a pointer to the beginning of the current multibyte character.
00072 
00073    mbui_reloc (iter, ptrdiff)
00074      relocates iterator when the string is moved by ptrdiff bytes.
00075 
00076    Here are the function prototypes of the macros.
00077 
00078    extern void          mbui_init (mbui_iterator_t iter, const char *startptr);
00079    extern bool          mbui_avail (mbui_iterator_t iter);
00080    extern void          mbui_advance (mbui_iterator_t iter);
00081    extern mbchar_t      mbui_cur (mbui_iterator_t iter);
00082    extern const char *  mbui_cur_ptr (mbui_iterator_t iter);
00083    extern void          mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff);
00084  */
00085 
00086 #ifndef _MBUITER_H
00087 #define _MBUITER_H 1
00088 
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
   <wchar.h>.
   BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
   <wchar.h>.  */
#include <stdio.h>
#include <time.h>
#include <wchar.h>

#include "mbchar.h"
#include "strnlen1.h"
00103 
00104 struct mbuiter_multi
00105 {
00106   bool in_shift;        /* true if next byte may not be interpreted as ASCII */
00107   mbstate_t state;      /* if in_shift: current shift state */
00108   bool next_done;       /* true if mbui_avail has already filled the following */
00109   struct mbchar cur;    /* the current character:
00110         const char *cur.ptr             pointer to current character
00111         The following are only valid after mbui_avail.
00112         size_t cur.bytes                number of bytes of current character
00113         bool cur.wc_valid               true if wc is a valid wide character
00114         wchar_t cur.wc                  if wc_valid: the current character
00115         */
00116 };
00117 
00118 static inline void
00119 mbuiter_multi_next (struct mbuiter_multi *iter)
00120 {
00121   if (iter->next_done)
00122     return;
00123   if (iter->in_shift)
00124     goto with_shift;
00125   /* Handle most ASCII characters quickly, without calling mbrtowc().  */
00126   if (is_basic (*iter->cur.ptr))
00127     {
00128       /* These characters are part of the basic character set.  ISO C 99
00129          guarantees that their wide character code is identical to their
00130          char code.  */
00131       iter->cur.bytes = 1;
00132       iter->cur.wc = *iter->cur.ptr;
00133       iter->cur.wc_valid = true;
00134     }
00135   else
00136     {
00137       assert (mbsinit (&iter->state));
00138       iter->in_shift = true;
00139     with_shift:
00140       iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
00141                                  strnlen1 (iter->cur.ptr, MB_CUR_MAX),
00142                                  &iter->state);
00143       if (iter->cur.bytes == (size_t) -1)
00144         {
00145           /* An invalid multibyte sequence was encountered.  */
00146           iter->cur.bytes = 1;
00147           iter->cur.wc_valid = false;
00148           /* Whether to set iter->in_shift = false and reset iter->state
00149              or not is not very important; the string is bogus anyway.  */
00150         }
00151       else if (iter->cur.bytes == (size_t) -2)
00152         {
00153           /* An incomplete multibyte character at the end.  */
00154           iter->cur.bytes = strlen (iter->cur.ptr);
00155           iter->cur.wc_valid = false;
00156           /* Whether to set iter->in_shift = false and reset iter->state
00157              or not is not important; the string end is reached anyway.  */
00158         }
00159       else
00160         {
00161           if (iter->cur.bytes == 0)
00162             {
00163               /* A null wide character was encountered.  */
00164               iter->cur.bytes = 1;
00165               assert (*iter->cur.ptr == '\0');
00166               assert (iter->cur.wc == 0);
00167             }
00168           iter->cur.wc_valid = true;
00169 
00170           /* When in the initial state, we can go back treating ASCII
00171              characters more quickly.  */
00172           if (mbsinit (&iter->state))
00173             iter->in_shift = false;
00174         }
00175     }
00176   iter->next_done = true;
00177 }
00178 
00179 static inline void
00180 mbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff)
00181 {
00182   iter->cur.ptr += ptrdiff;
00183 }
00184 
/* Iteration macros.
   All of them take the iterator object itself (not a pointer to it) and
   may evaluate ITER more than once, so ITER must have no side effects.  */
typedef struct mbuiter_multi mbui_iterator_t;

/* Start iterating at STARTPTR, in the initial shift state and with no
   character decoded yet.  */
#define mbui_init(iter, startptr) \
  ((iter).cur.ptr = (startptr), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)

/* Decode the character at the current position (if not already done) and
   yield the negation of mb_isnul on it.  */
#define mbui_avail(iter) \
  (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur))

/* Step past the current character.  Uses cur.bytes, which is only valid
   after mbui_avail, and marks the new position as not yet decoded.  */
#define mbui_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)

/* Access to the current character.  */
#define mbui_cur(iter) (iter).cur
#define mbui_cur_ptr(iter) (iter).cur.ptr

/* Relocation.  The arguments are parenthesized so that '&' applies to the
   whole ITER expression, like in the other macros above.  */
#define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&(iter), (ptrdiff))
00202 
00203 #endif /* _MBUITER_H */

Generated on Fri Oct 5 18:20:26 2007 for WvStreams by  doxygen 1.5.3