Drizzled Public API Documentation

CSUTF8.cc

00001 /* Copyright (C) 2008 PrimeBase Technologies GmbH, Germany
00002  *
00003  * PrimeBase Media Stream for MySQL
00004  *
00005  * This program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 2 of the License, or
00008  * (at your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
00018  *
00019  * Original author: Paul McCullagh (H&G2JCtL)
00020  * Continued development: Barry Leslie
00021  *
00022  * 2007-06-14
00023  *
00024  * CORE SYSTEM:
00025  * Unicode / UTF-8 conversion
00026  *
00027  */
00028 
00029 #include "CSConfig.h"
00030 
00031 #include <assert.h>
00032 #include <string.h>
00033 
00034 #include "CSUTF8.h"
00035 #include "CSMemory.h"
00036 #include "CSGlobal.h"
00037 
/*
 * Decode the single UTF-8 sequence that starts at in_string.
 *
 * in_string  zero-terminated byte string; only the bytes belonging to the
 *            sequence being decoded (at most 6) are examined.
 * ret_value  receives the decoded value, or '?' when the sequence is
 *            truncated by the terminator, is an overlong encoding, or
 *            starts with a byte that cannot begin a sequence.
 *
 * Returns the number of input bytes consumed:
 *   - the full sequence length (1..6) when all of its bytes are present,
 *   - the number of bytes left before the terminator when the sequence is
 *     cut short,
 *   - 1 for a byte that is not a valid lead byte.
 * Note that 1 is also returned when in_string points at the terminator
 * itself, so callers must test for end of string before calling.
 *
 * Trailing bytes are not checked for the 10xxxxxx continuation pattern;
 * their low six bits are used as they are.
 */
size_t cs_utf_to_uni_char(const u_char *in_string, uint32_t *ret_value)
{
  // Smallest value that genuinely needs a sequence of (index + 1) bytes;
  // anything below it is an overlong encoding.
  static const uint32_t min_value[6] = {
    0x00000000, 0x00000080, 0x00000800, 0x00010000, 0x00200000, 0x04000000
  };
  const u_char  lead = in_string[0];
  uint32_t      val;
  size_t        need;   // bytes the lead byte announces
  size_t        have;   // bytes actually present before the terminator
  size_t        i;

  if ((lead & 0x80) == 0x00) {
    // 0xxxxxxx: plain 7-bit character
    *ret_value = (uint32_t) lead;
    return 1;
  }
  else if ((lead & 0xE0) == 0xC0) {
    need = 2;
    val = lead & 0x1F;
  }
  else if ((lead & 0xF0) == 0xE0) {
    need = 3;
    val = lead & 0x0F;
  }
  else if ((lead & 0xF8) == 0xF0) {
    need = 4;
    val = lead & 0x07;
  }
  else if ((lead & 0xFC) == 0xF8) {
    need = 5;
    val = lead & 0x03;
  }
  else if ((lead & 0xFE) == 0xFC) {
    need = 6;
    val = lead & 0x01;
  }
  else {
    // Stray continuation byte (10xxxxxx) or 0xFE/0xFF: should not happen!
    *ret_value = '?';
    return 1;
  }

  // Look only as far as this sequence reaches. Measuring the whole
  // remaining string here would make every string-level conversion
  // quadratic in the input length.
  for (have = 1; have < need && in_string[have] != 0; have++)
    ;

  if (have < need) {
    // Sequence runs into the terminator: consume what is left.
    *ret_value = '?';
    return have;
  }

  for (i = 1; i < need; i++)
    val = (val << 6) | (uint32_t) (in_string[i] & 0x3F);

  if (val < min_value[need - 1])
    val = '?';

  *ret_value = val;
  return need;
}
00134 
00135 void cs_utf8_to_uni(size_t out_len, unichar *out_string, const u_char *in_string)
00136 {
00137   uint32_t  utf_value;
00138 
00139   out_len--;  // Space for zero terminator
00140   while (*in_string) {
00141     in_string += cs_utf_to_uni_char(in_string, &utf_value);
00142     if (out_len == 0)
00143       break;
00144     if (utf_value > 0x0000FFFF)
00145       *out_string = (unichar) '?';
00146     else
00147       *out_string = (unichar) utf_value;
00148     out_string++;
00149     out_len--;
00150   }
00151   *out_string = 0;
00152 }
00153 
00154 void cs_utf8_to_uni_no_term(size_t out_len, unichar *out_string, const u_char *in_string)
00155 {
00156   uint32_t  utf_value;
00157 
00158   while (*in_string) {
00159     in_string += cs_utf_to_uni_char(in_string, &utf_value);
00160     if (out_len == 0)
00161       break;
00162     if (utf_value > 0x0000FFFF)
00163       *out_string = (unichar) '?';
00164     else
00165       *out_string = (unichar) utf_value;
00166     out_string++;
00167     out_len--;
00168   }
00169 }
00170 
00171 void cs_uni_to_utf8(size_t out_len, char *out_string, const unichar *in_string)
00172 {
00173   out_len--;  // Space for zero terminator
00174   while (*in_string) {
00175     if (*in_string <= 0x007F) {
00176       if (out_len < 1)
00177         break;
00178       *out_string++ = (char) (u_char) *in_string;
00179       out_len--;
00180     }
00181     else if (*in_string <= 0x07FF) {
00182       if (out_len < 3)
00183         break;
00184       *out_string++ = (char) (u_char) ((0x00C0) | ((*in_string >> 6) & 0x001F));
00185       *out_string++ = (char) (u_char) ((0x0080) | (*in_string & 0x003F));
00186       out_len -= 2;
00187     }
00188     else /* <= 0xFFFF */ {
00189       if (out_len < 3)
00190         break;
00191       *out_string++ = (char) (u_char) ((0x00E0) | ((*in_string >> 12) & 0x000F));
00192       *out_string++ = (char) (u_char) ((0x0080) | ((*in_string >> 6) & 0x003F));
00193       *out_string++ = (char) (u_char) ((0x0080) | (*in_string & 0x003F));
00194       out_len -= 3;
00195     }
00196     in_string++;
00197   }
00198   *out_string = 0;
00199 }
00200 
00201 void cs_uni_to_utf8(size_t out_len, char *out_string, const unichar *in_string, s_int in_len)
00202 {
00203   out_len--;  // Space for zero terminator
00204   while (in_len--) {
00205     if (*in_string <= 0x007F) {
00206       if (out_len < 1)
00207         break;
00208       *out_string++ = (char) (u_char) *in_string;
00209       out_len--;
00210     }
00211     else if (*in_string <= 0x07FF) {
00212       if (out_len < 3)
00213         break;
00214       *out_string++ = (char) (u_char) ((0x00C0) | ((*in_string >> 6) & 0x001F));
00215       *out_string++ = (char) (u_char) ((0x0080) | (*in_string & 0x003F));
00216       out_len -= 2;
00217     }
00218     else /* <= 0xFFFF */ {
00219       if (out_len < 3)
00220         break;
00221       *out_string++ = (char) (u_char) ((0x00E0) | ((*in_string >> 12) & 0x000F));
00222       *out_string++ = (char) (u_char) ((0x0080) | ((*in_string >> 6) & 0x003F));
00223       *out_string++ = (char) (u_char) ((0x0080) | (*in_string & 0x003F));
00224       out_len -= 3;
00225     }
00226     in_string++;
00227   }
00228   *out_string = 0;
00229 }
00230 
/*
 * Count the characters in a zero-terminated UTF-8 string.
 *
 * Every byte that is not a continuation byte (10xxxxxx) is counted once, so
 * single-byte characters and the lead byte of each multi-byte sequence both
 * contribute one to the result.
 *
 * Returns the character count, terminator excluded.
 */
size_t cs_utf8_to_uni_len(const char *in_string)
{
  const unsigned char *cursor;
  size_t count = 0;

  for (cursor = (const unsigned char *) in_string; *cursor != 0; cursor++) {
    if ((*cursor & 0xC0) != 0x80)
      count++;
  }
  return count;
}
00246 
00247 size_t cs_uni_to_utf8_len(const unichar *in_string, s_int in_len)
00248 {
00249   size_t slen = 0;
00250 
00251   while (in_len--) {
00252     if (*in_string <= 0x000007F) {
00253       slen++;
00254     }
00255     else if (*in_string <= 0x00007FF)
00256       slen += 2;
00257     else /* <= 0xFFFF */
00258       slen += 3;
00259     in_string++;
00260   }
00261   return slen;
00262 }
00263 
00264 /*
00265 size_t cs_uni_len(const unichar *in_string)
00266 {
00267   size_t len = 0;
00268   
00269   while (*in_string++) len++;
00270   return len;
00271 }
00272 */
00273