1: # 4 "./lpsrc/flx_string.ipk" 2: val bin_char2int : char -> int 3: val oct_char2int : char -> int 4: val dec_char2int : char -> int 5: val hex_char2int : char -> int 6: 7: val binint_of_string : string -> int 8: val octint_of_string : string -> int 9: val decint_of_string : string -> int 10: val hexint_of_string : string -> int 11: 12: val binbig_int_of_string : string -> Big_int.big_int 13: val octbig_int_of_string : string -> Big_int.big_int 14: val decbig_int_of_string : string -> Big_int.big_int 15: val hexbig_int_of_string : string -> Big_int.big_int 16: 17: val floating_of_string : string -> float 18: 19: val unescape : string -> string 20: 21: val escape_of_string : char -> string -> string 22: val py_dquote_of_string : string -> string 23: val py_quote_of_string : string -> string 24: val c_quote_of_string : string -> string 25: val utf8_of_int : int -> string 26: val parse_utf8 : string -> int -> int * int 27: val hex2 : int -> string 28: val hex4 : int -> string 29: val hex8 : int -> string 30:
1: # 35 "./lpsrc/flx_string.ipk" 2: let hexchar_of_int i = 3: if i < 10 4: then char_of_int (i + (int_of_char '0')) 5: else char_of_int (i- 10 + (int_of_char 'A')) 6: 7: let hex8 i = 8: let j = ref i in 9: let s = String.create 8 in 10: for k = 0 to 7 do 11: s.[7-k] <- hexchar_of_int (!j mod 16); 12: j := !j / 16 13: done; 14: s 15: 16: let hex4 i = 17: let j = ref i in 18: let s = String.create 4 in 19: for k = 0 to 3 do 20: s.[3-k] <- hexchar_of_int (!j mod 16); 21: j := !j / 16 22: done; 23: s 24: 25: let hex2 i = 26: let j = ref i in 27: let s = String.create 2 in 28: for k = 0 to 1 do 29: s.[1-k] <- hexchar_of_int (!j mod 16); 30: j := !j / 16 31: done; 32: s 33: 34: let escape_of_char quote ch = 35: if ch = '\\' then "\\\\" 36: else if ch = quote then "\\" ^ (String.make 1 quote) 37: else if ch = '\n' then "\\n" 38: else if ch < ' ' 39: or ch > char_of_int 126 40: then "\\x" ^ (hex2 (Char.code ch)) 41: else String.make 1 ch 42: 43: let escape_of_string quote x = 44: let esc = escape_of_char quote in 45: let res = ref "" in 46: for i = 0 to (String.length x -1) do 47: res := !res ^ (esc x.[i]) 48: done; 49: (String.make 1) quote ^ !res ^ (String.make 1 quote) 50: 51: let py_dquote_of_string = escape_of_string '"';; 52: let c_quote_of_string = escape_of_string '"';; 53: let py_quote_of_string = escape_of_string '\'';; 54: 55: let string_of_char c = String.make 1 c;; 56: 57: let bin_char2int s = 58: let c = Char.code s in 59: match s with 60: | '0' -> 0 61: | '1' -> 1 62: | _ -> raise (Flx_exceptions.LexError ("'" ^ (string_of_char s) ^ "' not binary digit")) 63: 64: let oct_char2int s = 65: let c = Char.code s in 66: match s with 67: _ when (s >= '0' & s <= '7') -> 68: c - (Char.code '0') 69: | _ -> raise (Flx_exceptions.LexError ("'" ^ (string_of_char s) ^ "' not octal digit")) 70: 71: let dec_char2int s = 72: let c = Char.code s in 73: match s with 74: _ when (s >= '0' & s <= '9') -> 75: c - (Char.code '0') 76: | _ -> raise (Flx_exceptions.LexError ("'" ^ (string_of_char s) ^ "' not decimal digit")) 77: 78: let hex_char2int s = 79: let c = Char.code s in 80: match s with 81: _ when (s >= '0' & s <= '9') -> 82: c - (Char.code '0') 83: | _ when (s >= 'a' & s <= 'f') -> 84: (c - (Char.code 'a')) + 10 85: | _ when (s >= 'A' & s <= 'F') -> 86: (c - (Char.code 'A')) + 10 87: | _ -> raise (Flx_exceptions.LexError ("'" ^ (string_of_char s) ^ "' not hexadecimal digit")) 88: 89: 90: let len = String.length;; 91: 92: let binint_of_string s = 93: let len = len s in 94: let value = ref 0 in 95: for i = 0 to (len - 1) do 96: if s.[i] <> '_' 97: then value := !value * 2 + (bin_char2int s.[i]) 98: done; 99: !value 100: 101: let octint_of_string s = 102: let len = len s in 103: let value = ref 0 in 104: for i = 0 to (len - 1) do 105: if s.[i] <> '_' 106: then value := !value * 8 + (oct_char2int s.[i]) 107: done; 108: !value 109: 110: let decint_of_string s = 111: let len = len s in 112: let value = ref 0 in 113: for i = 0 to (len - 1) do 114: if s.[i] <> '_' 115: then value := !value * 10 + (dec_char2int s.[i]) 116: done; 117: !value 118: 119: let hexint_of_string s = 120: let len = len s in 121: let value = ref 0 in 122: for i = 0 to (len - 1) do 123: if s.[i] <> '_' 124: then value := !value * 16 + (hex_char2int s.[i]) 125: done; 126: !value 127: 128: let binbig_int_of_string s = 129: let len = len s in 130: let value = ref (Big_int.big_int_of_int 0) in 131: for i = 0 to (len - 1) do 132: if s.[i] <> '_' 133: then value := 134: Big_int.add_int_big_int 135: (bin_char2int s.[i]) 136: (Big_int.mult_int_big_int 2 !value) 137: done; 138: !value 139: 140: let octbig_int_of_string s = 141: let len = len s in 142: let value = ref (Big_int.big_int_of_int 0) in 143: for i = 0 to (len - 1) do 144: if s.[i] <> '_' 145: then value := 146: Big_int.add_int_big_int 147: (oct_char2int s.[i]) 148: (Big_int.mult_int_big_int 8 !value) 149: done; 150: !value 151: 152: let decbig_int_of_string s = 153: let len = len s in 154: let value = ref (Big_int.big_int_of_int 0) in 155: for i = 0 to (len - 1) do 156: if s.[i] <> '_' 157: then value := 158: Big_int.add_int_big_int 159: (dec_char2int s.[i]) 160: (Big_int.mult_int_big_int 10 !value) 161: done; 162: !value 163: 164: let hexbig_int_of_string s = 165: let len = len s in 166: let value = ref (Big_int.big_int_of_int 0) in 167: for i = 0 to (len - 1) do 168: if s.[i] <> '_' 169: then value := 170: Big_int.add_int_big_int 171: (hex_char2int s.[i]) 172: (Big_int.mult_int_big_int 16 !value) 173: done; 174: !value 175: 176: let floating_of_string s' = 177: let dst = ref 0 in 178: let s = String.copy s' in 179: for src = 0 to (String.length s) - 1 do 180: if s.[src] <> '_' 181: then begin 182: s.[!dst] <- s.[src]; 183: incr dst 184: end 185: done; 186: float_of_string (String.sub s 0 !dst) 187: 188: (* WARNING: THIS CODE WILL NOT WORK FOR THE HIGHER PLANES 189: BECAUSE OCAML ONLY SUPPORTS 31 bit signed integers; 190: THIS CODE REQUIRES 32 bits [This can be fixed by using 191: negative codes but hasn't been done] 192: 193: HAPPINESS: Since the above note was posted, 194: ISO10646/Unicode has agreed on a 20 bit address 195: space for code points. 196: *) 197: 198: (* parse the first utf8 encoded character of a string s 199: starting at index position i, return a pair 200: consisting of the decoded integers, and the position 201: of the first character not decoded. 202: 203: If the first character is bad, it is returned, 204: otherwise if the encoding is bad, the result is 205: an unspecified value. 206: 207: Fails if the index is past or at 208: the end of the string. 209: 210: COMPATIBILITY NOTE: if this function is called 211: with a SINGLE character string, it will return 212: the usual value for the character, in range 213: 0 .. 255 214: *) 215: 216: let parse_utf8 (s : string) (i : int) : int * int = 217: let ord = int_of_char 218: and n = (String.length s) - i 219: in 220: if n <= 0 then 221: failwith 222: ( 223: "parse_utf8: index "^ string_of_int i^ 224: " >= "^string_of_int (String.length s)^ 225: " = length of '" ^ s ^ "'" 226: ) 227: else let lead = ord (s.[i]) in 228: if (lead land 0x80) = 0 then 229: lead land 0x7F,i+1 (* ASCII *) 230: else if lead land 0xE0 = 0xC0 && n > 1 then 231: ((lead land 0x1F) lsl 6) lor 232: (ord(s.[i+1]) land 0x3F),i+2 233: else if lead land 0xF0 = 0xE0 && n > 2 then 234: ((lead land 0x1F) lsl 12) lor 235: ((ord(s.[i+1]) land 0x3F) lsl 6) lor 236: (ord(s.[i+2]) land 0x3F),i+3 237: else if lead land 0xF8 = 0xF0 && n > 3 then 238: ((lead land 0x1F) lsl 18) lor 239: ((ord(s.[i+1]) land 0x3F) lsl 12) lor 240: ((ord(s.[i+2]) land 0x3F) lsl 6) lor 241: (ord(s.[i+3]) land 0x3F),i+4 242: else if lead land 0xFC = 0xF8 && n > 4 then 243: ((lead land 0x1F) lsl 24) lor 244: ((ord(s.[i+1]) land 0x3F) lsl 18) lor 245: ((ord(s.[i+2]) land 0x3F) lsl 12) lor 246: ((ord(s.[i+3]) land 0x3F) lsl 6) lor 247: (ord(s.[i+4]) land 0x3F),i+5 248: else if lead land 0xFE = 0xFC && n > 5 then 249: ((lead land 0x1F) lsl 30) lor 250: ((ord(s.[i+1]) land 0x3F) lsl 24) lor 251: ((ord(s.[i+2]) land 0x3F) lsl 18) lor 252: ((ord(s.[i+3]) land 0x3F) lsl 12) lor 253: ((ord(s.[i+4]) land 0x3F) lsl 6) lor 254: (ord(s.[i+5]) land 0x3F),i+6 255: else lead, i+1 (* error, just use bad character *) 256: 257: (* convert an integer into a utf-8 encoded string of bytes *) 258: let utf8_of_int i = 259: let chr x = String.make 1 (Char.chr x) in 260: if i < 0x80 then 261: chr(i) 262: else if i < 0x800 then 263: chr(0xC0 lor ((i lsr 6) land 0x1F)) ^ 264: chr(0x80 lor (i land 0x3F)) 265: else if i < 0x10000 then 266: chr(0xE0 lor ((i lsr 12) land 0xF)) ^ 267: chr(0x80 lor ((i lsr 6) land 0x3F)) ^ 268: chr(0x80 lor (i land 0x3F)) 269: else if i < 0x200000 then 270: chr(0xF0 lor ((i lsr 18) land 0x7)) ^ 271: chr(0x80 lor ((i lsr 12) land 0x3F)) ^ 272: chr(0x80 lor ((i lsr 6) land 0x3F)) ^ 273: chr(0x80 lor (i land 0x3F)) 274: else if i < 0x4000000 then 275: chr(0xF8 lor ((i lsr 24) land 0x3)) ^ 276: chr(0x80 lor ((i lsr 18) land 0x3F)) ^ 277: chr(0x80 lor ((i lsr 12) land 0x3F)) ^ 278: chr(0x80 lor ((i lsr 6) land 0x3F)) ^ 279: chr(0x80 lor (i land 0x3F)) 280: else chr(0xFC lor ((i lsr 30) land 0x1)) ^ 281: chr(0x80 lor ((i lsr 24) land 0x3F)) ^ 282: chr(0x80 lor ((i lsr 18) land 0x3F)) ^ 283: chr(0x80 lor ((i lsr 12) land 0x3F)) ^ 284: chr(0x80 lor ((i lsr 6) land 0x3F)) ^ 285: chr(0x80 lor (i land 0x3F)) 286: 287: let unescape s = 288: let hex_limit = 2 in 289: let n = len s in 290: let s' = Buffer.create 1000 in 291: let deferred = ref 0 in 292: 293: (* tack char deferres tacking spaces until 294: the next non-space is received 295: *) 296: let tack_char ch = 297: if ch = ' ' then incr deferred 298: else begin 299: if !deferred<>0 then begin 300: Buffer.add_string s' (String.make !deferred ' '); 301: deferred := 0 302: end; 303: Buffer.add_char s' ch 304: end 305: in 306: 307: (* tack string always flushes deferred characters *) 308: let tack_string ss = 309: if !deferred<> 0 then begin 310: Buffer.add_string s' (String.make !deferred ' '); 311: deferred := 0 312: end; 313: Buffer.add_string s' ss 314: in 315: let tack_utf8 code = tack_string (utf8_of_int code) in 316: let i= ref 0 in 317: while !i< n do let ch = s.[!i] in 318: if ch = '\\' then begin 319: tack_string ""; (* flush spaces before any slosh *) 320: incr i; 321: if !i = n then tack_char '\\' 322: else match s.[!i] with 323: | 'a' -> tack_char '\007'; incr i (* 7 : bell *) 324: | 'b' -> tack_char '\008'; incr i (* 8 : backspace *) 325: | 't' -> tack_char '\t'; incr i (* 9 : horizontal tab *) 326: 327: (* Note that \n flushes deferred spaces! *) 328: | 'n' -> tack_char '\n'; incr i (* 10 : linefeed *) 329: | 'r' -> tack_char '\r'; incr i (* 13 : return *) 330: | 'v' -> tack_char '\011'; incr i (* vertical tab *) 331: | 'f' -> tack_char '\012'; incr i (* form feed *) 332: | 'e' -> tack_char '\033'; incr i (* 27: x1b: escape *) 333: 334: | '\\' -> tack_char '\\'; incr i 335: | '"' -> tack_char '"'; incr i (* NOTE OCAMLLEX BUG: TWO SPACES REQUIRED *) 336: | '\'' -> tack_char '\''; incr i 337: 338: (* this is the special case of \ spaces: 339: if the spaces are followed by a newline, 340: discard the spaces (and the newline!) 341: otherwise we keep the spaces 342: *) 343: | ' ' -> 344: while !i<n && s.[!i]=' ' do 345: incr deferred; 346: incr i 347: done; 348: if !i<n && s.[!i]='\n' then begin 349: deferred :=0; 350: incr i 351: end 352: 353: (* \newline is thrown out, but defered spaces are output *) 354: | '\n' -> incr i 355: | 'x' -> 356: begin 357: incr i; 358: let j = ref 0 and value = ref 0 in 359: while 360: (!i < n) & 361: (!j < hex_limit) & 362: (String.contains "0123456789ABCDEFabcdef" s.[!i]) do 363: value := !value * 16 + (hex_char2int s.[!i]); 364: incr i; 365: incr j 366: done; 367: tack_utf8 !value 368: end 369: | 'u' -> 370: begin 371: incr i; 372: let j = ref 0 and value = ref 0 in 373: while 374: (!i < n) & 375: (!j < 4) & 376: (String.contains "0123456789ABCDEFabcdef" s.[!i]) do 377: value := !value * 16 + (hex_char2int s.[!i]); 378: incr i; 379: incr j 380: done; 381: tack_utf8 !value 382: end 383: | 'U' -> 384: begin 385: incr i; 386: let j = ref 0 and value = ref 0 in 387: while 388: (!i < n) & 389: (!j < 8) & 390: (String.contains "0123456789ABCDEFabcdef" s.[!i]) do 391: value := !value * 16 + (hex_char2int s.[!i]); 392: incr i; 393: incr j 394: done; 395: tack_utf8 !value 396: end 397: | 'd' -> 398: begin 399: incr i; 400: let j = ref 0 and value = ref 0 in 401: while 402: (!i < n) & 403: (!j < 3) & 404: (String.contains "0123456789" s.[!i]) do 405: value := !value * 10 + (dec_char2int s.[!i]); 406: incr i; 407: incr j 408: done; 409: tack_utf8 !value 410: end 411: | 'o' -> 412: begin 413: incr i; 414: let j = ref 0 and value = ref 0 in 415: while 416: (!i < n) & 417: (!j < 3) & 418: (String.contains "01234567" s.[!i]) do 419: value := !value * 8 + (oct_char2int s.[!i]); 420: incr i; 421: incr j 422: done; 423: tack_utf8 !value 424: end 425: 426: | x -> tack_char '\\'; tack_char x; 427: incr i; 428: end else begin 429: 430: (* if we get a newline character, emit it 431: without preceding spaces 432: *) 433: if s.[!i]='\n' then deferred :=0; 434: tack_char s.[!i]; 435: incr i 436: end 437: done; 438: tack_string ""; (* flush any deferred spaces *) 439: Buffer.contents s' 440: 441: (* this routine converts strings containing 442: utf8 and/or \U \u escapes to a normalised 443: ASCII form using \U and \u escapes 444: for all codes in the range 0-1F, and >80 445: *) 446: 447: (* this routine converts strings containing 448: utf8 and/or \U \u escapes to a normalised 449: ASCII form using \U and \u escapes 450: for all codes in the range 0-1F, and >80 451: *) 452: