Drizzled Public API Documentation

core.h

00001 // Copyright 2006 Nemanja Trifunovic
00002 
00003 /*
00004 Permission is hereby granted, free of charge, to any person or organization
00005 obtaining a copy of the software and accompanying documentation covered by
00006 this license (the "Software") to use, reproduce, display, distribute,
00007 execute, and transmit the Software, and to prepare derivative works of the
00008 Software, and to permit third-parties to whom the Software is furnished to
00009 do so, all subject to the following:
00010 
00011 The copyright notices in the Software and this entire statement, including
00012 the above license grant, this restriction and the following disclaimer,
00013 must be included in all copies of the Software, in whole or in part, and
00014 all derivative works of the Software, unless such copies or derivative
00015 works are solely in the form of machine-executable object code generated by
00016 a source language processor.
00017 
00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
00024 DEALINGS IN THE SOFTWARE.
00025 */
00026 
00027 #pragma once
00028  
00029  #include <iterator>
00030  
00031 namespace drizzled
00032 {
00033 namespace utf8
00034 {
00035 
// Helper code - implementation details of the UTF-8 routines.
// Not intended to be called directly by library users; may change at any time.
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // Precomputed offsets for surrogate-pair arithmetic. They are not used in
    // this header; presumably the UTF-16 conversion code elsewhere relies on
    // them. SURROGATE_OFFSET deliberately wraps around modulo 2^32.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as an unsigned octet. This makes the
    // decoding arithmetic independent of the signedness of the element type.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // True when oc is a continuation (trail) octet, i.e. has the form 10xxxxxx.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    // True when cp lies in the leading (high) surrogate range.
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    // True when cp lies in the trailing (low) surrogate range.
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp is any surrogate, leading or trailing.
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp is accepted as a code point by this library: it does not
    // exceed CODE_POINT_MAX, is not a surrogate, and is neither 0xfffe nor
    // 0xffff.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    // Returns the number of octets (1-4) announced by the lead octet at
    // lead_it, or 0 when that octet cannot start a sequence.
    // The iterator is dereferenced unconditionally, so it must be
    // dereferenceable.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        const uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)            // 0xxxxxxx
            return 1;
        if ((lead >> 5) == 0x6)     // 110xxxxx
            return 2;
        if ((lead >> 4) == 0xe)     // 1110xxxx
            return 3;
        if ((lead >> 3) == 0x1e)    // 11110xxx
            return 4;
        return 0;
    }

    // True when cp was decoded from a sequence of `length` octets although a
    // different (shorter) length is the one that matches its magnitude.
    // Code points of 0x10000 and above are never reported as overlong.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80)
            return length != 1;
        if (cp < 0x800)
            return length != 2;
        if (cp < 0x10000)
            return length != 3;
        return false;
    }

    // Result codes of the decoding helpers below.
    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    // Steps `it` to the next octet and checks that it is a trail octet.
    // Returns NOT_ENOUGH_ROOM when the step reaches `end`,
    // INCOMPLETE_SEQUENCE when the octet there is not a trail octet, and
    // UTF8_OK otherwise. In every case `it` has been advanced exactly once.
    template <typename octet_iterator>
    inline utf_error advance_to_trail(octet_iterator& it, octet_iterator end)
    {
        if (++it == end)
            return NOT_ENOUGH_ROOM;
        if (!is_trail(*it))
            return INCOMPLETE_SEQUENCE;
        return UTF8_OK;
    }

    // The get_sequence_N functions decode an N-octet sequence starting at
    // `it`. They do not inspect the lead octet's prefix bits; the caller is
    // expected to have chosen N (see validate_next).
    //   it         - in/out; on success it is left on the LAST octet of the
    //                sequence, on failure on the octet where decoding stopped.
    //   end        - one past the last readable octet.
    //   code_point - optional output; written only on success, may be null.
    // Returns UTF8_OK, NOT_ENOUGH_ROOM (range ended too early) or
    // INCOMPLETE_SEQUENCE (an expected trail octet was not one).

    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        if (code_point)
            *code_point = mask8(*it);
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        uint32_t cp = mask8(*it);

        const utf_error step = advance_to_trail(it, end);
        if (step != UTF8_OK)
            return step;
        cp = ((cp << 6) & 0x7ff) + (mask8(*it) & 0x3f);

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        uint32_t cp = mask8(*it);

        utf_error step = advance_to_trail(it, end);
        if (step != UTF8_OK)
            return step;
        cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);

        step = advance_to_trail(it, end);
        if (step != UTF8_OK)
            return step;
        cp += mask8(*it) & 0x3f;

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        uint32_t cp = mask8(*it);

        utf_error step = advance_to_trail(it, end);
        if (step != UTF8_OK)
            return step;
        cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);

        step = advance_to_trail(it, end);
        if (step != UTF8_OK)
            return step;
        cp += (mask8(*it) << 6) & 0xfff;

        step = advance_to_trail(it, end);
        if (step != UTF8_OK)
            return step;
        cp += mask8(*it) & 0x3f;

        if (code_point)
            *code_point = cp;
        return UTF8_OK;
    }

    // Decodes and validates the sequence that starts at `it`.
    //   it         - in/out; on success advanced past the sequence, on any
    //                failure reset to the value it had on entry.
    //   end        - one past the last readable octet.
    //   code_point - optional output; receives the decoded value on success
    //                only, may be null.
    // Returns UTF8_OK, or the reason the sequence was rejected:
    //   NOT_ENOUGH_ROOM     - the range is empty or ends inside the sequence
    //   INVALID_LEAD        - the first octet cannot start a sequence
    //   INCOMPLETE_SEQUENCE - an expected trail octet is missing
    //   INVALID_CODE_POINT  - decoded value fails is_code_point_valid
    //   OVERLONG_SEQUENCE   - value was encoded with more octets than needed
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        // Nothing to decode. This must be checked before sequence_length(),
        // which dereferences the iterator.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure.
        // Of course, it does not make much sense with e.g. stream iterators.
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        const octet_difference_type length = sequence_length(it);

        // Get the trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 1:
                err = get_sequence_1(it, end, &cp);
                break;
            case 2:
                err = get_sequence_2(it, end, &cp);
                break;
            case 3:
                err = get_sequence_3(it, end, &cp);
                break;
            case 4:
                err = get_sequence_4(it, end, &cp);
                break;
            default:
                // length == 0: it has not been moved, nothing to restore
                return INVALID_LEAD;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (!is_code_point_valid(cp))
                err = INVALID_CODE_POINT;
            else if (is_overlong_sequence(cp, length))
                err = OVERLONG_SEQUENCE;
            else {
                // Passed! Return here.
                if (code_point)
                    *code_point = cp;
                ++it;   // get_sequence_N stops on the last octet; step past it
                return UTF8_OK;
            }
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    // Convenience overload for callers that only need the verdict and not the
    // decoded value.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal
00304 
00306 
00307     // Byte order mark: the three octets EF BB BF, i.e. U+FEFF encoded as UTF-8
00308     const uint8_t bom[] = {0xef, 0xbb, 0xbf};
00309 
00310     template <typename octet_iterator>
00311     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
00312     {
00313         octet_iterator result = start;
00314         while (result != end) {
00315             internal::utf_error err_code = internal::validate_next(result, end);
00316             if (err_code != internal::UTF8_OK)
00317                 return result;
00318         }
00319         return result;
00320     }
00321 
00322     template <typename octet_iterator>
00323     inline bool is_valid(octet_iterator start, octet_iterator end)
00324     {
00325         return (find_invalid(start, end) == end);
00326     }
00327 
00328     template <typename octet_iterator>
00329     inline bool starts_with_bom (octet_iterator it, octet_iterator end)
00330     {
00331         return (
00332             ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
00333             ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
00334             ((it != end) && (internal::mask8(*it))   == bom[2])
00335            );
00336     }
00337   
00338   //Deprecated in release 2.3 
00339     template <typename octet_iterator>
00340     inline bool is_bom (octet_iterator it)
00341     {
00342         return (
00343             (internal::mask8(*it++)) == bom[0] &&
00344             (internal::mask8(*it++)) == bom[1] &&
00345             (internal::mask8(*it))   == bom[2]
00346            );
00347     }
00348 } // namespace utf8
00349 } // namespace drizzled
00350 
00351 
00352