import re

# Token specs, tried in order. Each entry pairs a start-anchored regex with a
# token type; a type of None marks input that is skipped (whitespace, comments).
specs = (
    # Whitespace:
    (re.compile(r"^\s+"), None),
    # Comments:
    (re.compile(r"^//.*"), None),
    # Symbols:
    (re.compile(r"^\("), "("),
    (re.compile(r"^\)"), ")"),
    (re.compile(r"^,"), ","),
    (re.compile(r"^\{"), "{"),
    (re.compile(r"^\}"), "}"),
    (re.compile(r"^;"), ";"),
    # Keywords:
    (re.compile(r"^\blet\b"), "let"),
    (re.compile(r"^\breturn\b"), "return"),
    (re.compile(r"^\bif\b"), "if"),
    (re.compile(r"^\belse\b"), "else"),
    (re.compile(r"^\bwhile\b"), "while"),
    (re.compile(r"^\bfor\b"), "for"),
    (re.compile(r"^\bdef\b"), "def"),
    (re.compile(r"^\btrue\b"), "true"),
    (re.compile(r"^\bfalse\b"), "false"),
    # Floats (must come before integers so "1.5" is not split):
    (re.compile(r"^[-+]?[0-9]+\.[0-9]+"), "FLOAT"),
    # Integers:
    (re.compile(r"^[-+]?[0-9]+"), "NUMBER"),
    # Identifiers:
    (re.compile(r"^\w+"), "IDENTIFIER"),
    # Assignment:
    (re.compile(r"^="), "SIMPLE_ASSIGN"),
    # Logical operators:
    (re.compile(r"^&&"), "AND"),
    (re.compile(r"^\|\|"), "OR"),
    (re.compile(r"^!"), "NOT"),
    # Math operators: +, -, *, /:
    (re.compile(r"^[+-]"), "ADDITIVE_OPERATOR"),
    (re.compile(r"^[*/]"), "MULTIPLICATIVE_OPERATOR"),
    # Double-quoted strings:
    (re.compile(r"^\"[^\"]*\""), "STRING"),
)


class Tokenizer:
    """Pulls tokens out of a script string one at a time."""

    def __init__(self):
        self.script = ""
        self.cursor = 0

    def init(self, script: str):
        """Reset the tokenizer to the start of a new script."""
        self.script = script
        self.cursor = 0

    def isEOF(self):
        return self.cursor == len(self.script)

    def has_more_tokens(self):
        return self.cursor < len(self.script)

    def get_next_token(self):
        """Return the next token as a dict, or None at end of input."""
        if not self.has_more_tokens():
            return None

        _string = self.script[self.cursor:]

        for regexp, token_type in specs:
            token_value = self.match(regexp, _string)
            if token_value is None:
                continue
            # A None type means "skip this text" (whitespace, comments):
            # the cursor has already advanced, so just keep scanning.
            if token_type is None:
                return self.get_next_token()
            return {
                "type": token_type,
                "value": token_value,
            }

        raise Exception("Unknown token: " + _string[0])

    def match(self, regexp: re.Pattern, _script: str):
        """Try one spec; on success, advance the cursor and return the lexeme."""
        matched = regexp.match(_script)
        if matched is None:
            return None
        self.cursor += matched.end()
        return matched[0]
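A minimal usage sketch, assuming the Tokenizer above is in scope: initialize it with a short script via init and pull tokens until get_next_token returns None. The sample script string here is just an illustration.

if __name__ == "__main__":
    tokenizer = Tokenizer()
    tokenizer.init("let x = 42; // the answer")

    # Drain the token stream one token at a time.
    token = tokenizer.get_next_token()
    while token is not None:
        print(token)
        token = tokenizer.get_next_token()

    # Expected output:
    #   {'type': 'let', 'value': 'let'}
    #   {'type': 'IDENTIFIER', 'value': 'x'}
    #   {'type': 'SIMPLE_ASSIGN', 'value': '='}
    #   {'type': 'NUMBER', 'value': '42'}
    #   {'type': ';', 'value': ';'}

Note that the trailing comment never produces a token: its spec maps to None, so the tokenizer skips it and then reports end of input.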