1: # 3 "./lpsrc/flx_regexp.ipk" 2: open Flx_ast 3: open Flx_mtypes2 4: 5: val process_regexp: 6: regexp_t -> 7: CharSet.t * (* alphabet*) 8: int * (* state count *) 9: (int, expr_t) Hashtbl.t * (* term_codes *) 10: (int * int, int) Hashtbl.t (* transition matrix *) 11:
1: # 14 "./lpsrc/flx_regexp.ipk" 2: (* build a DFA from a regular expression 3: *) 4: open Flx_ast 5: open Flx_print 6: open Flx_mtypes2 7: 8: let hashtbl_length h = 9: let n = ref 0 in 10: Hashtbl.iter (fun _ _ -> incr n) h; 11: !n 12: 13: let augment re = `REGEXP_seq (re,`REGEXP_sentinel) 14: 15: type annotation_t = 16: { 17: nullable: bool; 18: firstpos: PosSet.t; 19: lastpos: PosSet.t 20: } 21: 22: let fp_get followpos i = 23: try Hashtbl.find followpos i 24: with Not_found -> PosSet.empty 25: 26: let fp_add followpos i j = 27: Hashtbl.replace followpos i (PosSet.add j (fp_get followpos i)) 28: 29: let fp_union followpos i x = 30: Hashtbl.replace followpos i (PosSet.union (fp_get followpos i) x) 31: 32: let rec annotate counter followpos posmap codemap re = 33: let a r = annotate counter followpos posmap codemap r in 34: match re with 35: | `REGEXP_group _ -> failwith "Can't handle groups yet" 36: | `REGEXP_name _ -> failwith "Unbound regular expresion name" 37: | `REGEXP_seq (r1,r2) -> 38: let a1 = a r1 and a2 = a r2 in 39: let au = 40: { 41: nullable = a1.nullable && a2.nullable; 42: firstpos = 43: if a1.nullable 44: then PosSet.union a1.firstpos a2.firstpos 45: else a1.firstpos; 46: lastpos = 47: if a2.nullable 48: then PosSet.union a1.lastpos a2.lastpos 49: else a2.lastpos; 50: } 51: in 52: PosSet.iter 53: (fun i -> fp_union followpos i a2.firstpos) 54: a1.lastpos 55: ; 56: au 57: 58: | `REGEXP_alt (r1,r2) -> 59: let a1 = a r1 and a2 = a r2 in 60: { 61: nullable = a1.nullable || a2.nullable; 62: firstpos = 63: PosSet.union a1.firstpos a2.firstpos; 64: lastpos = 65: PosSet.union a1.lastpos a2.lastpos 66: } 67: 68: | `REGEXP_aster r1 -> 69: let a1 = a r1 in 70: let au = 71: { 72: nullable = true; 73: firstpos = a1.firstpos; 74: lastpos = a1.lastpos 75: } 76: in 77: PosSet.iter 78: (fun i -> fp_union followpos i a1.firstpos) 79: a1.lastpos 80: ; 81: au 82: 83: | `REGEXP_string s -> 84: let n = String.length s in 85: if n = 0 86: then (a `REGEXP_epsilon) 87: else 88: begin 89: let start = !counter in 90: counter := start + n; 91: let last = !counter - 1 in 92: let au = 93: { 94: nullable = false; 95: firstpos = PosSet.singleton start; 96: lastpos = PosSet.singleton last 97: } 98: in 99: for i = start to last-1 do 100: fp_add followpos i (i+1); 101: Hashtbl.add posmap i (Char.code s.[i-start]) 102: done 103: ; 104: Hashtbl.add posmap last (Char.code s.[last-start]) 105: ; 106: au 107: end 108: 109: | `REGEXP_epsilon -> 110: { 111: nullable = true; 112: firstpos = PosSet.empty; 113: lastpos = PosSet.empty 114: } 115: 116: | `REGEXP_code s -> 117: Hashtbl.add codemap !counter s; 118: let u = 119: { 120: nullable = false; 121: firstpos = PosSet.singleton !counter; 122: lastpos = PosSet.singleton !counter 123: } 124: in 125: incr counter; 126: u 127: 128: | `REGEXP_sentinel -> 129: let u = 130: { 131: nullable = false; 132: firstpos = PosSet.singleton !counter; 133: lastpos = PosSet.singleton !counter; 134: } 135: in 136: Hashtbl.add followpos !counter PosSet.empty; 137: u 138: 139: let list_of_set x = 140: let lst = ref [] in 141: PosSet.iter 142: (fun i -> lst := i :: !lst) 143: x 144: ; 145: !lst 146: 147: let string_of_set x = 148: "{" ^ 149: String.concat ", " (List.map string_of_int (list_of_set x)) ^ 150: "}" 151: 152: let print_followpos followpos = 153: Hashtbl.iter 154: (fun i fp -> 155: print_endline ( 156: (string_of_int i) ^ 157: " -> " ^ 158: string_of_set fp 159: ) 160: ) 161: followpos 162: 163: let print_int_set s = 164: print_string "{"; 165: PosSet.iter 166: (fun i -> print_string (string_of_int i ^ ", ")) 167: s 168: ; 169: print_string "}" 170: 171: exception Found of int 172: ;; 173: 174: let process_regexp re = 175: (* 176: print_endline (" | " ^ Lex_print.string_of_re re); 177: *) 178: let are = augment re in 179: let followpos = Hashtbl.create 97 in 180: let codemap = Hashtbl.create 97 in 181: let posmap = Hashtbl.create 97 in 182: let counter = ref 1 in 183: let root = annotate counter followpos posmap codemap are in 184: let posarray = Array.make !counter 0 in 185: let alphabet = ref CharSet.empty in 186: Hashtbl.iter 187: (fun i c -> 188: posarray.(i-1) <- c; 189: alphabet := CharSet.add c !alphabet 190: ) 191: posmap; 192: (* 193: print_endline "Followpos:"; 194: print_followpos followpos; 195: print_endline ("Charpos '" ^ posarray ^ "'"); 196: print_endline ("Codepos: "); 197: Hashtbl.iter 198: (fun i c -> 199: print_endline ((string_of_int i) ^ " -> " ^ c) 200: ) 201: codemap 202: ; 203: print_string "alphabet '"; 204: CharSet.iter 205: (fun c -> print_char c) 206: !alphabet; 207: print_endline "'"; 208: *) 209: let marked_dstates = ref PosSetSet.empty in 210: let unmarked_dstates = ref (PosSetSet.singleton root.firstpos) in 211: let find_char c t = 212: try 213: PosSet.iter 214: (fun i -> if posarray.(i-1) = c then raise (Found i)) 215: t 216: ; 217: print_endline ("Can't find char '" ^ String.make 1 (Char.chr c) ^ "'") 218: ; 219: raise Not_found 220: with Found p -> p 221: in 222: let state_counter = ref 1 in 223: let state_map = Hashtbl.create 97 in 224: let inv_state_map = Hashtbl.create 97 in 225: 226: let dtran = Hashtbl.create 97 in 227: Hashtbl.add state_map 0 root.firstpos; 228: Hashtbl.add inv_state_map root.firstpos 0; 229: (* 230: print_endline "Root is"; 231: print_int_set root.firstpos; 232: print_endline ""; 233: *) 234: while not (PosSetSet.is_empty !unmarked_dstates) do 235: let t = PosSetSet.choose !unmarked_dstates in 236: unmarked_dstates := PosSetSet.remove t !unmarked_dstates; 237: marked_dstates := PosSetSet.add t !marked_dstates; 238: let src_state_index = 239: try 240: let state_index = Hashtbl.find inv_state_map t in 241: (* 242: print_endline ("src_state = " ^ string_of_int state_index); 243: *) 244: state_index 245: with Not_found -> 246: print_endline "Can't find "; print_int_set t; 247: print_endline ""; 248: raise Not_found 249: in 250: 251: CharSet.iter 252: (fun c -> 253: let u = ref (PosSet.empty) in 254: PosSet.iter 255: (fun i -> 256: if posarray.(i-1) = c 257: then begin 258: u := PosSet.union !u (try Hashtbl.find followpos i with 259: Not_found -> failwith ("Can't find followpos of index " ^ string_of_int i)) 260: end 261: ) 262: t 263: ; 264: if not (PosSet.is_empty !u) 265: then 266: let dst_state_index = 267: if not (PosSetSet.mem !u !marked_dstates) 268: && not (PosSetSet.mem !u !unmarked_dstates) 269: then begin 270: let state_index = !state_counter in 271: incr state_counter 272: ; 273: (* 274: print_string ("Adding new state " ^ string_of_int state_index ^ " = "); 275: print_int_set !u; 276: print_endline ""; 277: *) 278: Hashtbl.add state_map state_index !u; 279: Hashtbl.add inv_state_map !u state_index; 280: let n1 = PosSetSet.cardinal !unmarked_dstates in 281: unmarked_dstates := PosSetSet.add !u !unmarked_dstates; 282: assert(n1 <> PosSetSet.cardinal !unmarked_dstates); 283: state_index 284: end 285: else 286: try Hashtbl.find inv_state_map !u with Not_found -> failwith "ERROR 2" 287: in 288: Hashtbl.add dtran (c,src_state_index) dst_state_index 289: ) 290: !alphabet 291: done; 292: (* 293: print_endline "states:"; 294: PosSetSet.iter 295: (fun s -> print_int_set s; print_endline "") 296: !marked_dstates 297: ; 298: print_endline ""; 299: 300: print_endline "states:"; 301: Hashtbl.iter 302: (fun idx state -> 303: print_string (string_of_int idx ^ " -> "); 304: print_int_set state; 305: print_endline "" 306: ) 307: state_map 308: ; 309: *) 310: 311: let term_codes = Hashtbl.create 97 in 312: Hashtbl.iter 313: (fun idx state -> 314: try 315: PosSet.iter 316: (fun i -> 317: if Hashtbl.mem codemap i 318: then raise (Found i) 319: ) 320: state 321: ; 322: raise Not_found 323: with 324: | Found i -> 325: let code = Hashtbl.find codemap i in 326: Hashtbl.add term_codes idx code 327: | Not_found -> () 328: ) 329: state_map 330: ; 331: 332: !alphabet,!state_counter, term_codes, dtran 333: 334: