Assignment-2/scanner.l

186 lines
5.9 KiB
Plaintext
Raw Permalink Normal View History

/*
* Lexer definition for simplified Python syntax.
*/
/*
* Since we're only parsing 1 file, we don't need to have yywrap() (plus,
* having it included messes up compilation).
*/
%option noyywrap
%option yylineno
%{
#include <iostream>
#include <stack>
#include <cstdlib>
2019-05-11 22:34:26 -07:00
#include "parser.hpp"
/*
* We'll use this stack to keep track of indentation level, as described in
* the Python docs:
*
* https://docs.python.org/3/reference/lexical_analysis.html#indentation
*/
std::stack<int> _indent_stack;
2019-05-11 22:34:26 -07:00
yypstate* state = yypstate_new();
/*
 * PUSH_TOKEN(i, str): hand a token of type `i` to the push parser, using a
 * freshly heap-allocated std::string copy of `str` as its semantic value.
 *
 * If yypush_parse() returns anything other than YYPUSH_MORE, the parser state
 * is deleted and that status is returned from the ENCLOSING function, so the
 * macro may only be used where `return <int>;` is valid.
 *
 * The string is not freed here; presumably the parser's actions take
 * ownership of it -- TODO confirm against the grammar.
 *
 * The expansion deliberately does not end in a semicolon: the caller's own
 * `;` completes the do/while statement, which keeps
 * `if (c) PUSH_TOKEN(...); else ...` well-formed.
 */
#define PUSH_TOKEN(i, str) do { \
    YYSTYPE push_value_ = new std::string(str); \
    const int push_status_ = yypush_parse(state, (i), &push_value_); \
    if (push_status_ != YYPUSH_MORE) { \
        yypstate_delete(state); \
        return push_status_; \
    } \
} while (0)
%}
%%
%{
    /*
     * Code in this block is placed at the top of the lexing function. Seed
     * the indentation stack with the base level 0 whenever it is found empty,
     * so the rules below can always inspect its top element.
     */
    if (_indent_stack.size() == 0) {
        _indent_stack.push(0);
    }
%}
^[ \t]*\r?\n { /* Skip blank lines. */ }
^[ \t]*#.*(\r?\n)? {
    /*
     * Skip whole-line comments together with their line terminator. The
     * terminator is optional so that a comment on the last line of a file
     * with no trailing newline is still skipped here, instead of having its
     * leading whitespace processed as indentation.
     */
}
#.* {
    /*
     * Skip a comment that follows a statement on the same line. `.` never
     * matches a newline, so this stops at the end of the line (or the end of
     * the file) and leaves the newline for the NEWLINE rule.
     */
}
^[ \t]+ {
    /*
     * Handle indentation as described in the Python docs:
     *
     *   https://docs.python.org/3/reference/lexical_analysis.html#indentation
     *
     * Note that this pattern treats leading spaces and leading tabs
     * equivalently (each counts as one column), which could cause some
     * unexpected behavior if they're combined in a single line. For the
     * purposes of this project, that's OK.
     */
    const int indent = static_cast<int>(yyleng);

    if (_indent_stack.top() < indent) {
        /*
         * Deeper than the innermost open level: emit an INDENT and record
         * the new level.
         */
        PUSH_TOKEN(INDENT, "");
        _indent_stack.push(indent);
    } else {
        /*
         * Same level or shallower: close every level that is deeper than
         * the current one, emitting one DEDENT per closed level. The base
         * level 0 at the bottom of the stack is never popped here because
         * this rule only matches one or more whitespace characters.
         */
        while (_indent_stack.top() > indent) {
            _indent_stack.pop();
            PUSH_TOKEN(DEDENT, "");
        }

        /*
         * After closing the deeper levels, the innermost remaining level
         * must equal the current indentation exactly. If it doesn't, the
         * line dedents to a column that was never opened, which is an
         * indentation error.
         */
        if (_indent_stack.top() != indent) {
            std::cerr << "Error: Incorrect indentation on line "
                      << yylineno << std::endl;
            return 1;
        }
    }
}
^[^ \t\r\n]+ {
    /*
     * A line that starts in column 0 closes every open indentation level:
     * pop back down to the base level 0, emitting one DEDENT per level
     * popped. The matched text itself is not consumed by this rule; REJECT
     * hands it on to the next-best matching rule so it is tokenized normally.
     */
    while (true) {
        if (_indent_stack.top() == 0) {
            break;
        }
        _indent_stack.pop();
        PUSH_TOKEN(DEDENT, "");
    }
    REJECT;
}
\r?\n { /* A line terminator (LF or CRLF) becomes a NEWLINE token. */ PUSH_TOKEN(NEWLINE, ""); }
<<EOF>> {
    /*
     * At end of input, close every indentation level that is still open
     * (one DEDENT per level above the base level 0), then tell the parser
     * that the input is finished and stop scanning.
     */
    for (; _indent_stack.top() != 0; ) {
        _indent_stack.pop();
        PUSH_TOKEN(DEDENT, "");
    }

    PUSH_TOKEN(TEOF, "");
    yyterminate();
}
[ \t]       { /* Ignore spaces that haven't been handled above. */ }

    /* Reserved words: the token type says it all, so the value is empty. */
"and"       { PUSH_TOKEN(AND, ""); }
"break"     { PUSH_TOKEN(BREAK, ""); }
"def"       { PUSH_TOKEN(DEF, ""); }
"elif"      { PUSH_TOKEN(ELIF, ""); }
"else"      { PUSH_TOKEN(ELSE, ""); }
"for"       { PUSH_TOKEN(FOR, ""); }
"if"        { PUSH_TOKEN(IF, ""); }
"not"       { PUSH_TOKEN(NOT, ""); }
"or"        { PUSH_TOKEN(OR, ""); }
"return"    { PUSH_TOKEN(RETURN, ""); }
"while"     { PUSH_TOKEN(WHILE, ""); }

    /* Boolean literals share one token type; the value is the lowercase spelling. */
"True"      { PUSH_TOKEN(BOOLEAN, "true"); }
"False"     { PUSH_TOKEN(BOOLEAN, "false"); }
[a-zA-Z_][a-zA-Z0-9_]*  {
        /*
         * Identifiers carry their spelling as the token value. Rules that
         * appear earlier in the file win when they match the same text, so
         * reserved words are never reported as identifiers.
         */
        PUSH_TOKEN(IDENTIFIER, yytext);
    }

    /*
     * Numeric literals carry their source text as the token value.
     * NOTE(review): the optional leading '-' is part of the literal, so
     * input such as `x-1` is scanned as IDENTIFIER INTEGER("-1") rather than
     * IDENTIFIER MINUS INTEGER, because the longer match wins. Confirm
     * against the grammar whether that is intended.
     */
-?[0-9]*"."[0-9]+       { PUSH_TOKEN(FLOAT, yytext); }
-?[0-9]+                { PUSH_TOKEN(INTEGER, yytext); }
"="     {
        /*
         * Operators and punctuation: each rule below pushes the matched
         * text itself as the token value. The scanner picks the longest
         * match, so "==" , ">=" and "<=" are preferred over "=", ">" and "<".
         */
        PUSH_TOKEN(ASSIGN, yytext);
    }
"+"     { PUSH_TOKEN(PLUS, yytext); }
"-"     { PUSH_TOKEN(MINUS, yytext); }
"*"     { PUSH_TOKEN(TIMES, yytext); }
"/"     { PUSH_TOKEN(DIVIDEDBY, yytext); }

"=="    { PUSH_TOKEN(EQ, yytext); }
"!="    { PUSH_TOKEN(NEQ, yytext); }
">"     { PUSH_TOKEN(GT, yytext); }
">="    { PUSH_TOKEN(GTE, yytext); }
"<"     { PUSH_TOKEN(LT, yytext); }
"<="    { PUSH_TOKEN(LTE, yytext); }

"("     { PUSH_TOKEN(LPAREN, yytext); }
")"     { PUSH_TOKEN(RPAREN, yytext); }

","     { PUSH_TOKEN(COMMA, yytext); }
":"     { PUSH_TOKEN(COLON, yytext); }
. {
    /*
     * Fallback for any single character that no earlier rule accepted:
     * report it with its line number on stderr and stop scanning with a
     * non-zero status.
     */
    std::cerr << "Unrecognized token on line ";
    std::cerr << yylineno;
    std::cerr << ": " << yytext << std::endl;
    return 1;
}
%%