266 lines
8.8 KiB
Python
266 lines
8.8 KiB
Python
from . import ast
|
|
|
|
import io
|
|
import enum
|
|
import typing
|
|
import dataclasses
|
|
|
|
__all__ = ['Token', 'ExpectedError', 'Lexer', 'is_whitespace', 'is_operator', 'is_numeric', 'is_alpha', 'is_alphanum']
|
|
|
|
class Token(enum.Enum):
|
|
ILLEGAL = -1
|
|
EOF = 0
|
|
|
|
COMMENT = 1
|
|
IDENT = 2
|
|
STRING = 3
|
|
BOOL = 4
|
|
INTEGER = 5
|
|
FLOAT = 6
|
|
HEREDOC = 7
|
|
CURLY = 8
|
|
SQUARE = 9
|
|
PAREN = 10
|
|
COMMA = 11
|
|
COLON = 12
|
|
OPERATOR = 13
|
|
ELLIPSIS = 14
|
|
DOT = 15
|
|
|
|
class ExpectedError(Exception):
|
|
def __init__(self, pos: ast.Position, expected: str, got: str):
|
|
super().__init__(f'{pos}: expected {expected}, got {"EOF" if got == '' else repr(got)}')
|
|
self.pos = pos
|
|
self.got = got
|
|
self.expected = expected
|
|
|
|
class Lexer:
|
|
def __init__(self, stream: typing.TextIO, name: str):
|
|
self.pos = pos = ast.Position()
|
|
self.prev_pos = ast.Position()
|
|
self.unread = ''
|
|
self.stream = stream
|
|
self.pos.name = name
|
|
|
|
def _pos(self) -> ast.Position:
|
|
return dataclasses.replace(self.pos)
|
|
|
|
def _peek(self, n: int) -> str:
|
|
if self.unread != '':
|
|
return self.unread
|
|
pos = self.stream.tell()
|
|
text = self.stream.read(n)
|
|
self.stream.seek(pos)
|
|
return text
|
|
|
|
def _read(self) -> str:
|
|
char = self.unread
|
|
if self.unread != '':
|
|
self.unread = ''
|
|
|
|
if char == '':
|
|
char = self.stream.read(1)
|
|
|
|
self.prev_pos = dataclasses.replace(self.pos)
|
|
if char == '\n':
|
|
self.pos.line += 1
|
|
self.pos.col = 1
|
|
elif char != '':
|
|
self.pos.col += 1
|
|
return char
|
|
|
|
def _unread(self, char):
|
|
self.pos = self.prev_pos
|
|
self.unread = char
|
|
|
|
def _scan_str(self) -> tuple[Token, ast.Position, str]:
|
|
pos = dataclasses.replace(self.pos)
|
|
with io.StringIO() as out:
|
|
out.write('"')
|
|
escape = False
|
|
char = self._read()
|
|
while True:
|
|
if char == '"' and escape:
|
|
escape = False
|
|
out.write('\\"')
|
|
elif char == '\\' and escape:
|
|
escape = False
|
|
out.write('\\\\')
|
|
elif char == '\\':
|
|
escape = True
|
|
elif char == '"':
|
|
break
|
|
elif char == '' or char == '\r' or char == '\n':
|
|
raise ExpectedError(self.pos, repr('"'), char)
|
|
elif escape:
|
|
escape = False
|
|
out.write('\\' + char)
|
|
else:
|
|
out.write(char)
|
|
|
|
char = self._read()
|
|
out.write('"')
|
|
return Token.STRING, pos, out.getvalue()
|
|
|
|
def _scan_number(self, char: str) -> tuple[Token, ast.Position, str]:
|
|
pos = dataclasses.replace(self.pos)
|
|
tok = Token.INTEGER
|
|
with io.StringIO() as out:
|
|
while True:
|
|
if is_numeric(char):
|
|
out.write(char)
|
|
elif char == '.':
|
|
if tok == Token.FLOAT:
|
|
raise ExpectedError(self.pos, "number", char)
|
|
tok = Token.FLOAT
|
|
out.write(char)
|
|
else:
|
|
self._unread(char)
|
|
return tok, pos, out.getvalue()
|
|
char = self._read()
|
|
|
|
def _scan_ident(self, char: str) -> tuple[Token, ast.Position, str]:
|
|
pos = dataclasses.replace(self.pos)
|
|
with io.StringIO() as out:
|
|
while is_alphanum(char) or char in ('-', '_'):
|
|
out.write(char)
|
|
char = self._read()
|
|
self._unread(char)
|
|
val = out.getvalue()
|
|
if val in ('true', 'false'):
|
|
return Token.BOOL, pos, val
|
|
else:
|
|
return Token.IDENT, pos, out.getvalue()
|
|
|
|
def _scan_comment(self, char: str) -> tuple[Token, ast.Position, str]:
|
|
pos = dataclasses.replace(self.pos)
|
|
with io.StringIO() as out:
|
|
while char != '\n' and char != '':
|
|
if char != '\r':
|
|
out.write(char)
|
|
char = self._read()
|
|
return Token.COMMENT, pos, out.getvalue()
|
|
|
|
def _scan_inline_comment(self, char: str) -> tuple[Token, ast.Position, str]:
|
|
pos = dataclasses.replace(self.pos)
|
|
with io.StringIO() as out:
|
|
while True:
|
|
if char == '*' and self._peek(1) == '/':
|
|
self._read()
|
|
break
|
|
out.write(char)
|
|
char = self._read()
|
|
return Token.COMMENT, pos, out.getvalue()
|
|
|
|
def _scan_heredoc(self, char: str) -> tuple[Token, ast.Position, str]:
|
|
pos = dataclasses.replace(self.pos)
|
|
with io.StringIO() as out:
|
|
char = self._read()
|
|
if char != '<':
|
|
raise ExpectedError(self.pos, repr('<'), char)
|
|
|
|
char = self._read()
|
|
if not is_alpha(char):
|
|
raise ExpectedError(self.pos, 'heredoc name', char)
|
|
|
|
_, _, heredoc_name = self._scan_ident(char)
|
|
name_len = len(heredoc_name) - 1
|
|
|
|
char = self._read()
|
|
while True:
|
|
if char == heredoc_name[0] and self._peek(name_len) == heredoc_name[1:]:
|
|
self.pos.col += name_len
|
|
self.stream.seek(self.stream.tell()+name_len)
|
|
break
|
|
else:
|
|
out.write(char)
|
|
char = self._read()
|
|
|
|
return Token.HEREDOC, pos, out.getvalue()
|
|
|
|
def _scan_operator(self, char) -> tuple[Token, ast.Position, str]:
|
|
pos = dataclasses.replace(self.pos)
|
|
with io.StringIO() as out:
|
|
while is_operator(char):
|
|
out.write(char)
|
|
char = self._read()
|
|
self._unread(char)
|
|
return Token.OPERATOR, pos, out.getvalue()
|
|
|
|
def scan(self) -> tuple[Token, ast.Position, str]:
|
|
char = self._read()
|
|
while is_whitespace(char):
|
|
char = self._read()
|
|
|
|
match char:
|
|
case '{' | '}':
|
|
return Token.CURLY, self._pos(), char
|
|
case '[' | ']':
|
|
return Token.SQUARE, self._pos(), char
|
|
case '(' | ')':
|
|
return Token.PAREN, self._pos(), char
|
|
case ',':
|
|
return Token.COMMA, self._pos(), char
|
|
case ':':
|
|
return Token.COLON, self._pos(), char
|
|
case '"':
|
|
return self._scan_str()
|
|
case '<':
|
|
# If the next character is not another less than symbol,
|
|
# this is probably a less than operator.
|
|
if self._peek(1) != '<':
|
|
return self._scan_operator(char)
|
|
return self._scan_heredoc(char)
|
|
case '/':
|
|
next = self._peek(1)
|
|
if next == '/':
|
|
# Ignore comment and return next token
|
|
self._scan_comment(char)
|
|
return self.scan()
|
|
elif next == '*':
|
|
# Ignore inlinecomment and return next token
|
|
self._scan_inline_comment(char)
|
|
return self.scan()
|
|
else:
|
|
# If the next character is not another slash
|
|
# or an asterisk, this is probably a division
|
|
# operator.
|
|
return self._scan_operator(char)
|
|
case '#':
|
|
# Ignore comments and return next token
|
|
self._scan_comment(char)
|
|
return self.scan()
|
|
case '.':
|
|
if (next := self._read()) != '.':
|
|
self._unread(next)
|
|
return Token.DOT, self._pos(), next
|
|
elif (next := self._read()) != '.':
|
|
raise ExpectedError(self.pos, '.', next)
|
|
return Token.ELLIPSIS, self._pos(), "..."
|
|
case '':
|
|
return Token.EOF, self._pos(), char
|
|
|
|
if is_numeric(char):
|
|
return self._scan_number(char)
|
|
elif is_alpha(char):
|
|
return self._scan_ident(char)
|
|
elif is_operator(char):
|
|
return self._scan_operator(char)
|
|
|
|
return Token.ILLEGAL, self._pos(), char
|
|
|
|
def is_whitespace(char: str) -> bool:
|
|
return char in (' ', '\t', '\r', '\n')
|
|
|
|
def is_operator(char: str) -> bool:
|
|
return char in ('=', '+', '-', '*', '/', '%', '!', '>', '<', '|', '&')
|
|
|
|
def is_numeric(char: str) -> bool:
|
|
return char >= '0' and char <= '9'
|
|
|
|
def is_alpha(char: str) -> bool:
|
|
return (char >= 'a' and char <= 'z') or (char >= 'A' and char <= 'Z')
|
|
|
|
def is_alphanum(char: str) -> bool:
|
|
return is_numeric(char) or is_alpha(char)
|