现代编译原理第2章讲解词法分析,正则表达式是静态的和说明性的,自动机 是动态的和命令式的;也就是说,你不必用一个算法来模拟就能看到正则表 达式的成分和结构,但是理解自动机常常需要你在自己的头脑中来“执行” 它。 因此,正则表达式一般更适合用来指明程序设计语言单词的词法结构。
词法分析器生成器就是把正则表达式转换为DFA来进行词法解析,此类工具有 C下的lex, flex. OCaml下的ocamllex。
程序练习为实现tiger语言的词法解析,书中采用SML示例,我用OCaml重写了:
tokens.ml 用于词法分析器构造token类型,实际上OCaml中用token类型输 出和使用麻烦,我在lexer中直接用string代替了:
(* 用于调试lexers的"脚手架",由于OCaml中不支持函数名大写开头,我用v_代 替了书中例子里的函数名 *) type linenum = int type token = string let v_type i (j:int) = "TYPE " ^ (string_of_int i) let v_var i (j:int) = "VAR " ^ (string_of_int i) let v_function i (j:int) = "FUNCTION " ^ (string_of_int i) let v_break i (j:int) = "BREAK " ^ (string_of_int i) let v_of i (j:int) = "OF " ^ (string_of_int i) let v_end i (j:int) = "END " ^ (string_of_int i) let v_in i (j:int) = "IN " ^ (string_of_int i) let v_nil i (j:int) = "NIL " ^ (string_of_int i) let v_let i (j:int) = "LET " ^ (string_of_int i) let v_do i (j:int) = "DO " ^ (string_of_int i) let v_to i (j:int) = "TO " ^ (string_of_int i) let v_for i (j:int) = "FOR " ^ (string_of_int i) let v_while i (j:int) = "WHILE " ^ (string_of_int i) let v_else i (j:int) = "ELSE " ^ (string_of_int i) let v_then i (j:int) = "THEN " ^ (string_of_int i) let v_if i (j:int) = "IF " ^ (string_of_int i) let v_array i (j:int) = "ARRAY " ^ (string_of_int i) let v_assign i (j:int) = "ASSIGN " ^ (string_of_int i) let v_or i (j:int) = "OR " ^ (string_of_int i) let v_and i (j:int) = "AND " ^ (string_of_int i) let v_ge i (j:int) = "GE " ^ (string_of_int i) let v_gt i (j:int) = "GT " ^ (string_of_int i) let v_le i (j:int) = "LE " ^ (string_of_int i) let v_lt i (j:int) = "LT " ^ (string_of_int i) let v_neq i (j:int) = "NEQ " ^ (string_of_int i) let v_eq i (j:int) = "EQ " ^ (string_of_int i) let v_divide i (j:int) = "DIVIDE " ^ (string_of_int i) let v_times i (j:int) = "TIMES " ^ (string_of_int i) let v_minus i (j:int) = "MINUS " ^ (string_of_int i) let v_plus i (j:int) = "PLUS " ^ (string_of_int i) let v_dot i (j:int) = "DOT " ^ (string_of_int i) let v_rbrace i (j:int) = "RBRACE " ^ (string_of_int i) let v_lbrace i (j:int) = "LBRACE " ^ (string_of_int i) let v_rbrack i (j:int) = "RBRACK " ^ (string_of_int i) let v_lbrack i (j:int) = "LBRACK " ^ (string_of_int i) let v_rparen i (j:int) = "RPAREN " ^ (string_of_int i) let v_lparen i (j:int) = "LPAREN " ^ (string_of_int i) let v_semicolon i (j:int) = "SEMICOLON " ^ (string_of_int i) let v_colon i (j:int) = "COLON " ^ (string_of_int i) let v_comma i (j:int) = "COMMA " ^ (string_of_int i) let v_string s i (j:int) = "STRING("^s^") " ^ (string_of_int i) let v_int c i (j:int) = "INT("^(string_of_int c)^") " ^ (string_of_int i) let v_id s i (j:int) = "ID("^s^") " ^ (string_of_int i) let v_eof i (j:int) = "EOF " ^ (string_of_int i)
errorMsg.ml 输出带有文件名和行号的错误信息:
let any_errors = ref false let file_name = ref "" let line_num = ref 1 let line_pos = ref [1] (* 保存每行的结束位置,相对于文件开始的位置 *) let source_stream = ref stdin let reset () = any_errors := false; file_name := ""; line_num := 1; line_pos := [1]; source_stream := stdin exception Error let error pos (msg:string) = let rec look rest n = match rest with | [] -> print_string "0.0" | h :: r -> if h < pos then Printf.printf ":%d.%d" n (pos - h) else look r (n - 1) in any_errors := true; print_string !file_name; look !line_pos !line_num; print_string ":"; print_string msg; print_newline () let impossible msg = Printf.printf "Error: Compiler bug: %s\n" msg; flush stdout; raise Error
driver.ml 主程序,调用lexer:
let parse filename = let file = open_in filename in let lexbuf = Lexing.from_channel file in ErrorMsg.reset (); ErrorMsg.file_name := filename; (* 输出所有token *) let rec do_it () = let result = TigerLexer.tokens lexbuf in print_endline result; if (String.length result) > 3 && (String.sub result 0 3) = "EOF" then () else do_it () in do_it (); close_in file ;; let main () = if Array.length Sys.argv = 1 then ( print_endline "usage: tigerlex.exe <source file>"; exit 1 ); parse Sys.argv.(1) ;; (* 非交互模式下执行main,在toplevel中不执行main,否则#load模块时就会异常 *) if not !Sys.interactive then main ()
tigerLexer.mll tiger语言的词法解析:
(* * 标识符: 以字母开头,由字母、数字和下划线组成的序列。区分大小写字母。 * 注释:注释可以出现在任意两个单词之间。以/*开始,*/结束,可以嵌套。 * 内置类型:int, string * * * 怎样处理注释 * 怎样处理字符串 * 错误处理 * 文件结束处理 * 其它特性 *) { type pos = int type lexresult = Tokens.token let line_num = ErrorMsg.line_num (* 当前行号 *) let line_pos = ErrorMsg.line_pos (* 每行的起始位置,相对于整个buffer *) let data_buff = Buffer.create 100 let add_lexeme lexbuf = let len = lexbuf.Lexing.lex_curr_pos - lexbuf.Lexing.lex_start_pos in Buffer.add_substring data_buff lexbuf.Lexing.lex_buffer lexbuf.Lexing.lex_start_pos len let err lexbuf msg = let pos = lexbuf.Lexing.lex_curr_p.Lexing.pos_cnum in ErrorMsg.error pos msg let eof () = let pos = List.hd !line_pos in Tokens.v_eof pos pos let newline lexbuf = line_num := !line_num + 1; line_pos := lexbuf.Lexing.lex_curr_p.Lexing.pos_cnum :: !line_pos (* * 调用转换为token的函数 * f, 要调用的函数 * len,此token的长度 * v,附带的value *) let to_token lexbuf f len = let pos = lexbuf.Lexing.lex_curr_p.Lexing.pos_cnum in f pos (pos + len) let char_num c = if c < '0' || c > '9' then raise (Invalid_argument "char_num not digit char") else int_of_char c - int_of_char '0' } let digit = ['0'-'9'] let digits = digit+ let ident = ['a'-'z' 'A'-'Z']['a'-'z' 'A'-'Z' '0'-'9' '_']* rule tokens = parse | ['\t' '\r' ' '] { tokens lexbuf } | ['\n'] { newline lexbuf; tokens lexbuf } | "/*" { finish_comment lexbuf; "" } | '"' { Buffer.clear data_buff; finish_string lexbuf } | "while" { to_token lexbuf Tokens.v_while 5 } | "for" { to_token lexbuf Tokens.v_for 3 } | "to" { to_token lexbuf Tokens.v_to 2 } | "break" { to_token lexbuf Tokens.v_break 5 } | "let" { to_token lexbuf Tokens.v_let 3 } | "in" { to_token lexbuf Tokens.v_in 2 } | "end" { to_token lexbuf Tokens.v_end 3} | "function" { to_token lexbuf Tokens.v_function 8 } | "var" { to_token lexbuf Tokens.v_var 3 } | "type" { to_token lexbuf Tokens.v_type 4 } | "array" { to_token lexbuf Tokens.v_array 5 } | "if" { to_token lexbuf Tokens.v_if 2 } | "then" { to_token lexbuf Tokens.v_then 4 } | "else" { to_token lexbuf Tokens.v_else 4 } | "do" { to_token lexbuf Tokens.v_do 2 } | "of" { to_token lexbuf Tokens.v_of 2 } | "nil" { to_token lexbuf Tokens.v_nil 3 } | "," { to_token lexbuf Tokens.v_comma 1 } | ":" { to_token lexbuf Tokens.v_colon 1 } | ";" { to_token lexbuf Tokens.v_semicolon 1 } | "(" { to_token lexbuf Tokens.v_lparen 1 } | ")" { to_token lexbuf Tokens.v_rparen 1 } | "[" { to_token lexbuf Tokens.v_lbrack 1 } | "]" { to_token lexbuf Tokens.v_rbrack 1 } | "{" { to_token lexbuf Tokens.v_lbrace 1 } | "}" { to_token lexbuf Tokens.v_rbrace 1 } | "." { to_token lexbuf Tokens.v_dot 1 } | "+" { to_token lexbuf Tokens.v_plus 1 } | "-" { to_token lexbuf Tokens.v_minus 1 } | "*" { to_token lexbuf Tokens.v_times 1 } | "/" { to_token lexbuf Tokens.v_divide 1 } | "=" { to_token lexbuf Tokens.v_eq 1 } | "<>" { to_token lexbuf Tokens.v_neq 2 } | "<" { to_token lexbuf Tokens.v_lt 1 } | "<=" { to_token lexbuf Tokens.v_le 2 } | ">" { to_token lexbuf Tokens.v_gt 1 } | ">=" { to_token lexbuf Tokens.v_ge 2 } | "&" { to_token lexbuf Tokens.v_and 1 } | "|" { to_token lexbuf Tokens.v_or 1 } | ":=" { to_token lexbuf Tokens.v_assign 2 } | digits as num { to_token lexbuf (Tokens.v_int (int_of_string num)) (String.length num) } | ident as id { to_token lexbuf (Tokens.v_id id) (String.length id) } | _ as c { err lexbuf (Printf.sprintf "illegal character %c" c); tokens lexbuf } | eof { eof () } (* 匹配注释 *) and finish_comment = parse | "*/" { () } | "/*" { finish_comment lexbuf; finish_comment lexbuf } (* 嵌套注释 *) | '\n' { newline lexbuf; finish_comment lexbuf } | _ { finish_comment lexbuf } | eof { err lexbuf "unterminated comment"; ignore (eof ()) } (* 匹配字符串 *) and finish_string = parse | '"' { let str = Buffer.contents data_buff in to_token lexbuf (Tokens.v_string str) ((String.length str) + 2) } | '\\' { finish_escaped_char lexbuf; finish_string lexbuf } | '\n' { newline lexbuf; Buffer.add_char data_buff '\n'; finish_string lexbuf } | [^ '"' '\\' '\n']+ { add_lexeme lexbuf; finish_string lexbuf } | eof { err lexbuf "unterminated string"; eof () } (* 匹配字符串中的转义字符 *) and finish_escaped_char = parse (* \^c 控制字符不知道什么意思,没有添加 *) | 'n' { Buffer.add_char data_buff '\n' } | 't' { Buffer.add_char data_buff '\t' } | '"' | '\\' as c { Buffer.add_char data_buff c } | ' ' | '\t' | '\r' { finish_multiline_escape lexbuf } | '\n' { newline lexbuf; finish_multiline_escape lexbuf } | (digit as a) (digit as b) (digit as c) { Buffer.add_char data_buff (Char.chr ((char_num a)*100 + (char_num b)*10 + (char_num c))) } (* 出现错误,返回 *) | _ { err lexbuf "Invalid escape sequence"; () } | eof { err lexbuf "unterminated escape sequence"; ignore (eof ()); () } (* 匹配字符串中连接多行的方式 *) and finish_multiline_escape = parse | '\\' { () } | ' ' | '\t' | '\r' { finish_multiline_escape lexbuf } | '\n' { newline lexbuf; finish_multiline_escape lexbuf } (* 出现错误,返回 *) | _ { err lexbuf "Invalid multiline escape char"; () } | eof { err lexbuf "unterminated multiline escape"; ignore (eof ()); () }
编译:
生成词法分析器为.ml ocamllex tigerLexer.mll 编译为字节码: ocamlc -o tigerlex.exe tokens.ml errorMsg.ml tigerLexer.ml driver.ml 编译为模块: 可以在toplevel中#load "tigerlex.cma"加载,然后调用 Driver.parse -> file_name:string -> unit进行测试,还可以用#trace跟 踪函数的执行过程。 ocamlc -a -g -o tigerlex.cma tokens.ml errorMsg.ml tigerLexer.ml driver.ml
测试queens.tig,注意要用cp936编码保存,如果用utf-8保存,中文字符会读取错误:
1: /* A program to solve the 8-queens problem */ 2: 3: let 4: var N := 8 5: 6: type intArray = array of int 7: 8: var row := intArray [ N ] of 0 9: var col := intArray [ N ] of 0 10: var diag1 := intArray [N+N-1] of 0 11: var diag2 := intArray [N+N-1] of 0 12: var test_str = " 字符串测试\tstring\n test \ 13: \next line" 14: function printboard() = 15: (for i := 0 to N-1 16: do (for j := 0 to N-1 17: do print(if col[i]=j then " O" else " ."); 18: print("\n")); 19: print("\n")) 20: 21: function try(c:int) = 22: ( /* (* 测试注释 for i:= 0 to c do print("."); print("\n"); flush(); 23: multi line 24: *) */ 25: if c=N 26: then printboard() 27: else for r := 0 to N-1 28: do if row[r]=0 & diag1[r+c]=0 & diag2[r+7-c]=0 29: then (row[r]:=1; diag1[r+c]:=1; diag2[r+7-c]:=1; 30: col[c]:=r; 31: try(c+1); 32: row[r]:=0; diag1[r+c]:=0; diag2[r+7-c]:=0) 33: 34: ) 35: in try(0) 36: /* comment 37: " string 38: test 39: end 40:
tigerlex queens.tig LET 50 VAR 58 ID(N) 60 ASSIGN 63 INT(8) 65 TYPE 75 ID(intArray) 84 EQ 86 ARRAY 92 OF 95 ID(int) 99 VAR 108 ID(row) 112 ASSIGN 115 ID(intArray) 124 LBRACK 126 ID(N) 128 RBRACK 130 OF 133 INT(0) 135 VAR 143 ID(col) 147 ASSIGN 150 ID(intArray) 159 LBRACK 161 ID(N) 163 RBRACK 165 OF 168 INT(0) 170 VAR 178 ID(diag1) 184 ASSIGN 187 ID(intArray) 196 LBRACK 198 ID(N) 199 PLUS 200 ID(N) 201 MINUS 202 INT(1) 203 RBRACK 204 OF 207 INT(0) 209 VAR 217 ID(diag2) 223 ASSIGN 226 ID(intArray) 235 LBRACK 237 ID(N) 238 PLUS 239 ID(N) 240 MINUS 241 INT(1) 242 RBRACK 243 OF 246 INT(0) 248 VAR 253 ID(test_str) 262 EQ 264 STRING( 字符串测试 string test next line) 321 FUNCTION 334 ID(printboard) 345 LPAREN 346 RPAREN 347 EQ 349 LPAREN 358 FOR 361 ID(i) 363 ASSIGN 366 INT(0) 368 TO 371 ID(N) 373 MINUS 374 INT(1) 375 DO 380 LPAREN 382 FOR 385 ID(j) 387 ASSIGN 390 INT(0) 392 TO 395 ID(N) 397 MINUS 398 INT(1) 399 DO 410 ID(print) 416 LPAREN 417 IF 419 ID(col) 423 LBRACK 424 ID(i) 425 RBRACK 426 EQ 427 ID(j) 428 THEN 433 STRING( O) 438 ELSE 443 STRING( .) 448 RPAREN 449 SEMICOLON 450 ID(print) 462 LPAREN 463 STRING( ) 467 RPAREN 468 RPAREN 469 SEMICOLON 470 ID(print) 485 LPAREN 486 STRING( ) 490 RPAREN 491 RPAREN 492 FUNCTION 506 ID(try) 510 LPAREN 511 ID(c) 512 COLON 513 ID(int) 516 RPAREN 517 EQ 519 LPAREN 522 IF 619 ID(c) 621 EQ 622 ID(N) 623 THEN 633 ID(printboard) 644 LPAREN 645 RPAREN 646 ELSE 656 FOR 660 ID(r) 662 ASSIGN 665 INT(0) 667 TO 670 ID(N) 672 MINUS 673 INT(1) 674 DO 681 IF 684 ID(row) 688 LBRACK 689 ID(r) 690 RBRACK 691 EQ 692 INT(0) 693 AND 695 ID(diag1) 701 LBRACK 702 ID(r) 703 PLUS 704 ID(c) 705 RBRACK 706 EQ 707 INT(0) 708 AND 710 ID(diag2) 716 LBRACK 717 ID(r) 718 PLUS 719 INT(7) 720 MINUS 721 ID(c) 722 RBRACK 723 EQ 724 INT(0) 725 THEN 742 LPAREN 744 ID(row) 747 LBRACK 748 ID(r) 749 RBRACK 750 ASSIGN 752 INT(1) 753 SEMICOLON 754 ID(diag1) 760 LBRACK 761 ID(r) 762 PLUS 763 ID(c) 764 RBRACK 765 ASSIGN 767 INT(1) 768 SEMICOLON 769 ID(diag2) 775 LBRACK 776 ID(r) 777 PLUS 778 INT(7) 779 MINUS 780 ID(c) 781 RBRACK 782 ASSIGN 784 INT(1) 785 SEMICOLON 786 ID(col) 801 LBRACK 802 ID(c) 803 RBRACK 804 ASSIGN 806 ID(r) 807 SEMICOLON 808 ID(try) 830 LPAREN 831 ID(c) 832 PLUS 833 INT(1) 834 RPAREN 835 SEMICOLON 836 ID(row) 844 LBRACK 845 ID(r) 846 RBRACK 847 ASSIGN 849 INT(0) 850 SEMICOLON 851 ID(diag1) 857 LBRACK 858 ID(r) 859 PLUS 860 ID(c) 861 RBRACK 862 ASSIGN 864 INT(0) 865 SEMICOLON 866 ID(diag2) 872 LBRACK 873 ID(r) 874 PLUS 875 INT(7) 876 MINUS 877 ID(c) 878 RBRACK 879 ASSIGN 881 INT(0) 882 RPAREN 883 RPAREN 886 IN 890 ID(try) 894 LPAREN 895 INT(0) 896 RPAREN 897 queens.tig:40.2:unterminated comment EOF 932