/*
 * Lexer definition for simplified Python syntax.
 *
 * The scanner does not return tokens to a caller.  Instead, every token is
 * pushed straight into a Bison push parser via PUSH_TOKEN (defined below), and
 * yylex() only returns when the parser stops asking for more input or when a
 * lexical error is detected.
 */

/*
 * Since we're only parsing 1 file, we don't need to have yywrap() (plus,
 * having it included messes up compilation).
 */
%option noyywrap
%option yylineno

%{
#include <cstdlib>
#include <iostream>
#include <stack>
#include <string>

#include "parser.hpp"

/*
 * We'll use this stack to keep track of indentation level, as described in
 * the Python docs:
 *
 * https://docs.python.org/3/reference/lexical_analysis.html#indentation
 *
 * Each entry is the width (in characters) of the leading whitespace of an
 * enclosing block.  The bottom entry is always 0.
 */
std::stack<int> _indent_stack;

/* State object of the push parser that receives every token we scan. */
yypstate* state = yypstate_new();

/*
 * Push token `i` with semantic value `str` into the parser.
 *
 * A fresh std::string holding `str` is allocated and handed to the parser as
 * the token's semantic value.  If the parser reports anything other than
 * YYPUSH_MORE (i.e. it accepted the input or failed), the parser state is
 * released and yylex() returns the parser's status code.
 *
 * The macro deliberately has no trailing semicolon so that
 * `PUSH_TOKEN(...);` expands to exactly one statement.
 */
#define PUSH_TOKEN(i, str) do {                 \
    YYSTYPE temp = new std::string(str);        \
    int s = yypush_parse(state, i, &temp);      \
    if (s != YYPUSH_MORE) {                     \
      yypstate_delete(state);                   \
      return s;                                 \
    }                                           \
  } while (0)

%}

%%

%{
  /*
   * These lines go at the top of the lexing function.  We only want to
   * initialize the indentation level stack once by pushing a 0 onto it (the
   * indentation stack should never be empty, except immediately after it is
   * created).
   */
  if (_indent_stack.empty()) {
    _indent_stack.push(0);
  }
%}

^[ \t]*\r?\n        { /* Skip blank lines. */ }

^[ \t]*#.*\r?\n     { /* Skip whole-line comments. */ }

#.*$                { /* Skip comments on the same line as a statement. */ }

^[ \t]+             {
                      /*
                       * Handle indentation as described in Python docs linked
                       * above.  Note that this pattern treats leading spaces
                       * and leading tabs equivalently, which could cause some
                       * unexpected behavior if they're combined in a single
                       * line.  For the purposes of this project, that's OK.
                       */
                      if (_indent_stack.top() < yyleng) {
                        /*
                         * If the current indentation level is greater than
                         * the previous indentation level (stored at the top
                         * of the stack), then emit an INDENT and push the new
                         * indentation level onto the stack.
                         */
                        PUSH_TOKEN(INDENT, "");
                        _indent_stack.push(yyleng);
                      } else {
                        /*
                         * If the current indentation level is less than or
                         * equal to the previous indentation level, pop
                         * indentation levels off the stack until the top is
                         * equal to the current indentation level.  Emit a
                         * DEDENT for each element popped from the stack.
                         */
                        while (!_indent_stack.empty()
                               && _indent_stack.top() != yyleng) {
                          _indent_stack.pop();
                          PUSH_TOKEN(DEDENT, "");
                        }

                        /*
                         * If we popped everything off the stack, that means
                         * the current indentation level didn't match any on
                         * the stack, which is an indentation error.  Release
                         * the parser state before bailing out, exactly as
                         * PUSH_TOKEN does on its own exit path.
                         */
                        if (_indent_stack.empty()) {
                          std::cerr << "Error: Incorrect indentation on line "
                            << yylineno << std::endl;
                          yypstate_delete(state);
                          return 1;
                        }
                      }
                    }

^[^ \t\r\n]+        {
                      /*
                       * If we find a line that's not indented, pop all
                       * indentation levels off the stack, and emit a DEDENT
                       * for each one.  Then, call REJECT, so the next rule
                       * matching this token is also applied.
                       */
                      while (_indent_stack.top() != 0) {
                        _indent_stack.pop();
                        PUSH_TOKEN(DEDENT, "");
                      }
                      REJECT;
                    }

\r?\n               { PUSH_TOKEN(NEWLINE, ""); }

<<EOF>>             {
                      /*
                       * If we reach the end of the file, pop all indentation
                       * levels off the stack, and emit a DEDENT for each one.
                       * Then tell the parser the input is finished.
                       */
                      while (_indent_stack.top() != 0) {
                        _indent_stack.pop();
                        PUSH_TOKEN(DEDENT, "");
                      }
                      PUSH_TOKEN(TEOF, "");
                      yyterminate();
                    }

[ \t]               { /* Ignore spaces that haven't been handled above. */ }

  /*
   * Keywords.  These are listed before the identifier pattern so that, when a
   * keyword and the identifier pattern match the same text, the keyword rule
   * (the earlier one) is chosen.
   */
"and"               { PUSH_TOKEN(AND, ""); }
"break"             { PUSH_TOKEN(BREAK, ""); }
"def"               { PUSH_TOKEN(DEF, ""); }
"elif"              { PUSH_TOKEN(ELIF, ""); }
"else"              { PUSH_TOKEN(ELSE, ""); }
"for"               { PUSH_TOKEN(FOR, ""); }
"if"                { PUSH_TOKEN(IF, ""); }
"not"               { PUSH_TOKEN(NOT, ""); }
"or"                { PUSH_TOKEN(OR, ""); }
"return"            { PUSH_TOKEN(RETURN, ""); }
"while"             { PUSH_TOKEN(WHILE, ""); }

  /* Boolean literals are normalized to lowercase text for the parser. */
"True"              { PUSH_TOKEN(BOOLEAN, "true"); }
"False"             { PUSH_TOKEN(BOOLEAN, "false"); }

[a-zA-Z_][a-zA-Z0-9_]*  { PUSH_TOKEN(IDENTIFIER, yytext); }

  /*
   * Numeric literals.
   * NOTE(review): both patterns absorb a leading '-', so input such as "a-1"
   * is scanned as IDENTIFIER INTEGER(-1) rather than IDENTIFIER MINUS
   * INTEGER(1).  Left as-is because the grammar consuming these tokens is not
   * visible here; confirm this is the intended behavior.
   */
-?[0-9]*"."[0-9]+   { PUSH_TOKEN(FLOAT, yytext); }

-?[0-9]+            { PUSH_TOKEN(INTEGER, yytext); }

  /* Operators and punctuation. */
"="                 { PUSH_TOKEN(ASSIGN, yytext); }
"+"                 { PUSH_TOKEN(PLUS, yytext); }
"-"                 { PUSH_TOKEN(MINUS, yytext); }
"*"                 { PUSH_TOKEN(TIMES, yytext); }
"/"                 { PUSH_TOKEN(DIVIDEDBY, yytext); }

"=="                { PUSH_TOKEN(EQ, yytext); }
"!="                { PUSH_TOKEN(NEQ, yytext); }
">"                 { PUSH_TOKEN(GT, yytext); }
">="                { PUSH_TOKEN(GTE, yytext); }
"<"                 { PUSH_TOKEN(LT, yytext); }
"<="                { PUSH_TOKEN(LTE, yytext); }

"("                 { PUSH_TOKEN(LPAREN, yytext); }
")"                 { PUSH_TOKEN(RPAREN, yytext); }

","                 { PUSH_TOKEN(COMMA, yytext); }
":"                 { PUSH_TOKEN(COLON, yytext); }

.                   {
                      /*
                       * Anything not matched above is a lexical error.
                       * Release the parser state before returning, matching
                       * the cleanup PUSH_TOKEN performs when it exits.
                       */
                      std::cerr << "Unrecognized token on line " << yylineno
                        << ": " << yytext << std::endl;
                      yypstate_delete(state);
                      return 1;
                    }

%%