{
 (* Lexer for a small imperative language (uC)
    sestoft@dina.kvl.dk * 2001-03-02, 2001-04-10 *)

open Lexing Cpar;

exception LexicalError of string * int * int (* (message, loc1, loc2) *)

(* Raise LexicalError with message s, spanning the current lexeme. *)
fun lexerError lexbuf s =
    raise LexicalError (s, getLexemeStart lexbuf, getLexemeEnd lexbuf);

(* Distinguish keywords from identifiers: any word not listed here
   becomes NAME s.  The boolean literals map to CSTBOOL 0 and CSTBOOL 1. *)
fun keyword s =
    case s of
        "char"    => CHAR
      | "else"    => ELSE
      | "false"   => CSTBOOL 0
      | "for"     => FOR
      | "if"      => IF
      | "int"     => INT
      | "null"    => NULL
      | "print"   => PRINT
      | "println" => PRINTLN
      | "return"  => RETURN
      | "true"    => CSTBOOL 1
      | "void"    => VOID
      | "while"   => WHILE
      | _         => NAME s;

(* For scanning strings and comments.
   savedLexemeStart remembers where the enclosing block comment or string
   constant began, so that a not-terminated error can point at its opening
   delimiter rather than at the end of input. *)
local
    val savedLexemeStart = ref 0
in
    (* Record the start of the current lexeme.  The Token rule calls this
       when it sees the opening delimiter of a block comment or string. *)
    fun resetLexerState lexbuf =
        (savedLexemeStart := getLexemeStart lexbuf)

    (* Raise LexicalError for a construct of the given kind that reached
       end of input.  The reported span runs from the saved opening
       position to the end of the current lexeme.  The saved position is
       deliberately NOT overwritten here: doing so would discard the
       opening position and make the error point at end of input. *)
    fun notTerminated kind lexbuf =
        raise LexicalError (kind ^ " not terminated",
                            !savedLexemeStart, getLexemeEnd lexbuf)
end

(* A string constant is scanned as a list of characters, accumulated in
   reverse order.  To handle very long string constants, use a CharArray
   instead, and extend it dynamically (by doubling it when too small). *)
local
    val string_buff = ref [] : char list ref
in
    (* Empty the buffer; the argument is ignored. *)
    fun reset_string_buffer _ =
        string_buff := [];

    (* Prepend c; get_string reverses the list at the end. *)
    fun store_string_char c =
        (string_buff := c :: !string_buff)

    (* Decode the current lexeme as an escape sequence and store the
       resulting character; raises LexicalError if it cannot be decoded.
       NOTE(review): Char.fromCString follows C conventions, where a
       backslash followed by digits is read as octal.  The String rule
       accepts three digits in the range 0-9 and also backslash-u with
       four hex digits, which look like SML-style escapes; such lexemes
       may be rejected or decoded differently than intended.  Confirm
       against the Moscow ML implementation of Char.fromCString. *)
    fun store_c_escape lexbuf =
        case Char.fromCString (getLexeme lexbuf) of
            NONE   => lexerError lexbuf "illegal escape sequence"
          | SOME c => store_string_char c

    (* The characters stored so far, in source order. *)
    fun get_string() =
        String.implode (List.rev (!string_buff))
end
}

rule Token = parse
    [` ` `\t` `\n` `\r`]
      { Token lexbuf }
  | [`0`-`9`]+
      { (* Int.fromString raises Overflow when the literal does not fit
           in an int; report that as a lexical error with a location
           instead of letting the exception escape. *)
        (case Int.fromString (getLexeme lexbuf) of
             NONE   => lexerError lexbuf "internal error"
           | SOME i => CSTINT i)
        handle Overflow => lexerError lexbuf "integer constant too large" }
  | [`a`-`z``A`-`Z`][`a`-`z``A`-`Z``0`-`9`]*
      { keyword (getLexeme lexbuf) }
  | `+`   { PLUS }
  | `-`   { MINUS }
  | `*`   { TIMES }
  | `/`   { DIV }
  | `%`   { MOD }
  | `=`   { ASSIGN }
  | "=="  { EQ }
  | "!="  { NE }
  | `>`   { GT }
  | `<`   { LT }
  | ">="  { GE }
  | "<="  { LE }
  | "||"  { SEQOR }
  | "&&"  { SEQAND }
  | "&"   { AMP }
  | "!"   { NOT }
  | `(`   { LPAR }
  | `)`   { RPAR }
  | `{`   { LBRACE }
  | `}`   { RBRACE }
  | `[`   { LBRACK }
  | `]`   { RBRACK }
  | `;`   { SEMI }
  | `,`   { COMMA }
  | "//"
      { SkipToEndLine lexbuf; Token lexbuf }
  | "/*"
      { (* Remember where the comment opens for error reporting. *)
        resetLexerState lexbuf;
        SkipComment lexbuf;
        Token lexbuf }
  | `"`
      { (* Remember where the string opens for error reporting. *)
        resetLexerState lexbuf;
        reset_string_buffer();
        String lexbuf;
        CSTSTRING (get_string()) }
  | eof   { EOF }
  | _     { lexerError lexbuf "Illegal symbol in input" }

and SkipComment = parse
    "*/"
      { () }
  | (eof | `\^Z`)
      { notTerminated "comment" lexbuf }
  | _
      { SkipComment lexbuf }

and SkipToEndLine = parse
    [`\n` `\r`]
      { () }
  | (eof | `\^Z`)
      { () }
  | _
      { SkipToEndLine lexbuf }

and String = parse
    `"`
      { () }
  | `\\` [`\\` `"` `a` `b` `t` `n` `v` `f` `r`]
      { store_c_escape lexbuf; String lexbuf }
  | `\\` [`0`-`9`] [`0`-`9`] [`0`-`9`]
      { store_c_escape lexbuf; String lexbuf }
  | "\\u" [`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`]
          [`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`]
      { store_c_escape lexbuf; String lexbuf }
  | `\\`
      { lexerError lexbuf "illegal escape sequence" }
  | (eof | `\^Z`)
      { notTerminated "string" lexbuf }
  | [`\n` `\r`]
      { lexerError lexbuf "newline not permitted in string" }
  | [`\^A`-`\^Z` `\127` `\255`]
      { lexerError lexbuf "invalid character in string" }
  | _
      { (store_string_char(getLexemeChar lexbuf 0); String lexbuf) }
;