import re
import typing
from typing import Optional, Callable, Any, Iterable, NamedTuple, Union, List

# The compiled-pattern type moved around between Python versions; fall back to
# probing the type of a compiled regex if the named attributes are missing.
try:
    regex_type = re.Pattern
except AttributeError:
    try:
        regex_type = re.RegexObject
    except AttributeError:
        regex_type = type(re.compile(''))

# A matcher takes (text, begin) and returns the end offset of a match
# starting at begin, or -1 if it does not match.
Matcher = Callable[[str, int], int]


def make_regex_matcher(regex) -> Matcher:
    def matcher(text: str, begin: int) -> int:
        m = regex.match(text, begin)
        if m:
            return m.end()
        else:
            return -1
    return matcher


def make_literal_matcher(literal) -> Matcher:
    def matcher(text: str, begin: int) -> int:
        if text.startswith(literal, begin):
            return begin + len(literal)
        else:
            return -1
    return matcher


def never_matcher(text: str, begin: int) -> int:
    return -1


def always_one_matcher(text: str, begin: int) -> int:
    return begin + 1


class Location(NamedTuple):
    filename: str
    source: str
    begin: int
    end: int
    line: int
    column: int

    def __str__(self):
        if self.filename:
            return f"{self.filename}:{self.line}:{self.column}"
        else:
            return f"{self.line}:{self.column}"


class ParserHint(NamedTuple):
    location: Location
    message: str


class ParseError(Exception):
    def __init__(self, loc: Location, message: str, hints: Iterable[ParserHint]):
        msg = f"{loc}: {message}"
        if hints:
            lines = (f".. while parsing {h.message} at {h.location}"
                     for h in reversed(hints))
            msg = msg + "\n" + "\n".join(lines)
        super().__init__(msg)
        self.loc = loc


class Rule:
    """Rule to match tokens."""

    def __init__(self, name: str, matcher: Matcher = never_matcher,
                 literal: str = "", value: Any = None, ignore: bool = False):
        self.name = name
        self.matcher = matcher
        self.literal = literal
        # `value` may be a callable that derives the token value from its
        # text, or a constant to attach to every token of this rule.
        if value is None or callable(value):
            self.valuer = value
        else:
            self.valuer = lambda s: value
        self.ignore = bool(ignore)

    def __repr__(self):
        return "Rule({!r})".format(self.name)

    def __str__(self):
        return "{!r}".format(self.name)


# Special rules for begin and end
Begin = Rule("begin-of-file")
End = Rule("end-of-file")
Synthetic = Rule("synthetic")
Error = Rule("error")


def make_matcher_from_pattern(pattern: Any) -> Matcher:
    if isinstance(pattern, str):
        # Compile strings to regex
        regex = re.compile(pattern, re.ASCII)
        return make_regex_matcher(regex)
    elif isinstance(pattern, regex_type):
        # Already compiled regex
        return make_regex_matcher(pattern)
    elif callable(pattern):
        # Custom matcher function
        return pattern
    else:
        raise TypeError('Invalid type for rule pattern {!r}'.format(type(pattern)))


class Lexer:
    def __init__(self):
        self.global_rules = []
        self.prefix_rules = {}
        self.lexer_type = SourceLexer

    def add_rule(self, rule: Rule, prefix: Optional[Iterable[str]] = None):
        if prefix:
            for pre in prefix:
                if not isinstance(pre, str):
                    raise TypeError("Prefixes must be an iterable of str")
                if len(pre) > 1:
                    raise ValueError('Prefixes must be single characters')
                rules = self.prefix_rules.setdefault(pre, [])
                rules.append(rule)
        else:
            self.global_rules.append(rule)

    def rule(self, name: str, pattern: Any, *, value: Any = None,
             prefix: Optional[Iterable[str]] = None):
        matcher = make_matcher_from_pattern(pattern)
        rule = Rule(name, matcher, "", value, ignore=False)
        self.add_rule(rule, prefix)
        return rule

    def ignore(self, name: str, pattern: Any, *, value: Any = None,
               prefix: Optional[Iterable[str]] = None):
        matcher = make_matcher_from_pattern(pattern)
        rule = Rule(name, matcher, "", value, ignore=True)
        self.add_rule(rule, prefix)
        return rule

    def ignore_whitespace(self, *, ignore_newline=True):
        spaces = " \t\v\r"
        if ignore_newline:
            spaces += "\n"
        regex = re.compile(f"[{re.escape(spaces)}]+")
        self.ignore("whitespace", regex, prefix=spaces)

    def literal(self, literal: str, value: Any = None):
        if not isinstance(literal, str):
            raise TypeError('Literals must be strings, got {!r}'.format(type(literal)))
        if not literal:
            raise ValueError('Empty literal')
        if len(literal) == 1:
            # Prefix match is full match
            matcher = always_one_matcher
        else:
            matcher = make_literal_matcher(literal)
        rule = Rule(repr(literal), matcher, literal, value)
        self.add_rule(rule, literal[0])
        return rule

    def literals(self, *args: str):
        return [self.literal(arg) for arg in args]

    def make(self, source: str, filename: str = ""):
        return self.lexer_type(self, source, filename)
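# A minimal usage sketch (illustrative only, not part of the library): how the
# builder API above is meant to be composed. The rule names "number" and
# "name" and the toy token set are assumptions for demonstration.
def _example_build_lexer() -> Lexer:
    lex = Lexer()
    # Skip runs of whitespace entirely; matched text never becomes a token.
    lex.ignore_whitespace()
    # A string pattern is compiled to an ASCII regex; value= derives the
    # token's value from its matched text.
    lex.rule("number", r"[0-9]+", value=int)
    lex.rule("name", r"[A-Za-z_][A-Za-z0-9_]*")
    # Single-character literals dispatch through prefix_rules and use the
    # cheap always_one_matcher.
    lex.literals("+", "-", "(", ")")
    return lex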
class Token:
    __slots__ = ["rule", "location", "value", "_text"]

    def __init__(self, rule: Rule, location: Location):
        self.rule = rule
        self.location = location
        self.value = None
        self._text = None
        if rule.valuer:
            self.value = rule.valuer(self.text())

    def text(self) -> str:
        # Lazily slice the token's text out of the source.
        if self._text is None:
            loc = self.location
            self._text = loc.source[loc.begin:loc.end]
        return self._text

    def __str__(self) -> str:
        loc = self.location
        length = loc.end - loc.begin
        if self.rule.literal or length > 20:
            return self.rule.name
        else:
            return f"{self.rule.name} {self.text()!r}"

    def __repr__(self) -> str:
        return f"Token({self.rule.name!r})"


def synthetic(text: str):
    length = len(text)
    loc = Location("", text, 0, length, 1, 1)
    return Token(Synthetic, loc)


class SourceLexer:
    def __init__(self, lexer: Lexer, source: str, filename: str = ""):
        self.pos = 0
        self.lexer = lexer
        self.source = source
        self.source_length = len(source)
        self.filename = filename
        self.line = 1
        # Offset just past the most recent newline, used to compute columns.
        self.line_end = 0

    def scan(self) -> Token:
        pos = self.pos
        source_end = self.source_length
        lexer = self.lexer
        source = self.source
        global_rules = lexer.global_rules
        while pos < source_end:
            prefix = source[pos]
            prefix_rules = lexer.prefix_rules.get(prefix)
            best_rule = None
            best_end = -1
            # Longest match wins; on ties, later rules take precedence.
            if prefix_rules:
                for rule in prefix_rules:
                    end = rule.matcher(source, pos)
                    if end >= best_end:
                        best_rule = rule
                        best_end = end
            for rule in global_rules:
                end = rule.matcher(source, pos)
                if end >= best_end:
                    best_rule = rule
                    best_end = end
            column = pos - self.line_end + 1
            # Advance the line counter over any newlines inside the match.
            while self.line_end < best_end:
                line_end = source.find("\n", self.line_end, best_end)
                if line_end < 0:
                    break
                self.line_end = line_end + 1
                self.line += 1
            if best_end < 0:
                loc = Location(self.filename, source, pos, pos + 1, self.line, column)
                return Token(Error, loc)
            if best_rule.ignore:
                pos = best_end
            else:
                self.pos = best_end
                loc = Location(self.filename, source, pos, best_end, self.line, column)
                return Token(best_rule, loc)
        loc = Location(self.filename, source, source_end, source_end + 1,
                       self.line + 1, 1)
        return Token(End, loc)
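# A scanning sketch (illustrative, not part of the library): drain a
# SourceLexer until the End rule, turning Error tokens into ParseError. It
# reuses the hypothetical _example_build_lexer() defined earlier; for example,
# _example_scan("foo + 12") returns three tokens (name, '+', number).
def _example_scan(source: str) -> List[Token]:
    source_lexer = _example_build_lexer().make(source, "<example>")
    tokens: List[Token] = []
    while True:
        tok = source_lexer.scan()
        if tok.rule is Error:
            raise ParseError(tok.location,
                             f"Bad token starting with {tok.text()!r}", [])
        if tok.rule is End:
            return tokens
        tokens.append(tok)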
def format_rule(rule):
    if isinstance(rule, list):
        return 'any of ({})'.format(', '.join(format_rule(r) for r in rule))
    elif isinstance(rule, Rule):
        return rule.name
    elif isinstance(rule, str):
        return repr(rule)
    else:
        raise TypeError(f'Unsupported rule type {type(rule)!r}')


def format_message(msg):
    return (" " + msg) if msg else ""


class ParserHintContext:
    def __init__(self, parser: "Parser", token_or_loc: Union[Token, Location], message: str):
        self.parser = parser
        if hasattr(token_or_loc, "location"):
            self.location = token_or_loc.location
        else:
            self.location = token_or_loc
        self.message = message

    def __enter__(self):
        self.parser.hint_stack.append(ParserHint(self.location, self.message))

    def __exit__(self, exc_type, exc_value, traceback):
        self.parser.hint_stack.pop()


class Parser:
    def __init__(self, lexer: Lexer, source: str, filename: str = ""):
        begin_loc = Location(filename, source, 0, 0, 1, 1)
        self.lexer = lexer
        self.source_lexer = lexer.make(source, filename)
        self.prev_token = Token(Begin, begin_loc)
        self.token = self.source_lexer.scan()
        self.hint_stack = []

    def scan(self):
        if self.token.rule is not End:
            self.prev_token = self.token
            self.token = self.source_lexer.scan()
            if self.token.rule is Error:
                self.fail(f"Bad token starting with {self.token.text()!r}")
        return self.prev_token

    def peek(self, rule: Any) -> Optional[Token]:
        if isinstance(rule, list):
            for r in rule:
                tok = self.peek(r)
                if tok:
                    return tok
        elif isinstance(rule, Rule):
            if self.token.rule == rule:
                return self.token
        elif isinstance(rule, str):
            if self.token.rule.literal == rule:
                return self.token
        else:
            raise TypeError(f'Unsupported rule type {type(rule)!r}')

    def accept(self, rule) -> Optional[Token]:
        tok = self.peek(rule)
        if tok:
            self.scan()
            return tok
        else:
            return None

    def fail_at(self, location: Location, message: str):
        raise ParseError(location, message, self.hint_stack)

    def fail(self, message: str):
        self.fail_at(self.token.location, message)

    def fail_prev(self, message: str):
        self.fail_at(self.prev_token.location, message)

    def fail_got(self, message: str):
        self.fail_at(self.token.location, message + f", got {self.token}")

    def fail_prev_got(self, message: str):
        self.fail_at(self.prev_token.location, message + f", got {self.prev_token}")

    def require(self, rule, message: str = "") -> Token:
        tok = self.accept(rule)
        if tok:
            return tok
        fr, fm = format_rule, format_message
        self.fail_got(f"Expected {fr(rule)}{fm(message)}")

    def sep(self, sep, message="") -> Iterable[int]:
        # Yields 0 for the first item, then one new index per separator
        # consumed. (Incrementing before the yield avoids repeating index 0.)
        n = 0
        yield n
        while self.accept(sep):
            n += 1
            yield n

    def until(self, end, message="") -> Iterable[int]:
        n = 0
        while not self.accept(end):
            yield n
            n += 1

    def sep_until(self, sep, end, message="") -> Iterable[int]:
        n = 0
        while not self.accept(end):
            if n > 0 and not self.accept(sep):
                fr, fm = format_rule, format_message
                self.fail_got(f"Expected {fr(sep)} or {fr(end)}{fm(message)}")
            yield n
            n += 1

    def ignore(self, rule) -> int:
        n = 0
        while self.accept(rule):
            n += 1
        return n

    def hint(self, token_or_loc: Union[Token, Location], message: str):
        return ParserHintContext(self, token_or_loc, message)
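# A parsing sketch (illustrative, not part of the library): parse a bracketed,
# comma-separated list of integers such as "[1, 2, 3]" with the combinators
# above. The grammar is an assumption for demonstration only.
def _example_parse_list(source: str) -> List[int]:
    lex = Lexer()
    lex.ignore_whitespace()
    number = lex.rule("number", r"[0-9]+", value=int)
    lex.literals("[", "]", ",")
    parser = Parser(lex, source, "<example>")
    values: List[int] = []
    open_tok = parser.require("[", "to open the list")
    with parser.hint(open_tok, "a number list"):
        for _ in parser.sep_until(",", "]", "after list element"):
            values.append(parser.require(number, "as a list element").value)
    parser.require(End, "after the list")
    return values

# _example_parse_list("[1, 2, 3]") returns [1, 2, 3]; _example_parse_list("[1 2]")
# raises a ParseError whose hint trail cites the opening "[" location.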
get_origin = getattr(typing, "get_origin", lambda o: getattr(o, "__origin__", None))
get_args = getattr(typing, "get_args", lambda o: getattr(o, "__args__", None))


class AstField(NamedTuple):
    name: str
    base: type
    optional: bool
    sequence: bool


def make_ast_field(name, base):
    origin, args = get_origin(base), get_args(base)
    optional = sequence = False
    # Optional[T] is Union[T, None]: unwrap to T and mark the field optional.
    if origin == Union and len(args) == 2 and type(None) in args:
        base = args[args.index(type(None)) ^ 1]
        optional = True
        origin, args = get_origin(base), get_args(base)
    # The origin of List[T] is `list` on Python 3.7+ and `typing.List` before.
    if origin in (list, List):
        base = args[0]
        sequence = True
    elif origin:
        # Other generics cannot be isinstance-checked; fall back to object.
        base = object
    return AstField(name, base, optional, sequence)


class Ast:
    def __init__(self, *args, **kwargs):
        cls = type(self)
        if len(args) > len(cls.fields):
            raise TypeError(f"Too many fields for {cls.__name__}: "
                            f"{len(args)}, expected {len(cls.fields)}")
        for field, arg in zip(cls.fields, args):
            setattr(self, field.name, arg)
        for name, arg in kwargs.items():
            setattr(self, name, arg)
        for field in cls.fields:
            try:
                value = getattr(self, field.name)
                if field.optional and value is None:
                    continue
                if field.sequence:
                    for ix, v in enumerate(value):
                        if not isinstance(v, field.base):
                            raise TypeError(
                                f"Trying to assign '{type(v).__name__}' to "
                                f"'{cls.__name__}' field '{field.name}: "
                                f"{field.base.__name__}' index [{ix}]")
                else:
                    if not isinstance(value, field.base):
                        raise TypeError(
                            f"Trying to assign '{type(value).__name__}' to "
                            f"'{cls.__name__}' field '{field.name}: "
                            f"{field.base.__name__}'")
            except AttributeError:
                raise ValueError(f"'{cls.__name__}' requires field "
                                 f"'{field.name}: {field.base.__name__}'")

    def __init_subclass__(cls, **kwargs):
        # Collect the subclass's annotated fields in declaration order.
        fields = getattr(cls, "__annotations__", {})
        cls.fields = [make_ast_field(k, v) for k, v in fields.items()]
        super().__init_subclass__(**kwargs)

    def _imp_dump(self, result, indent):
        cls = type(self)
        indent_str = " " * indent
        result += (cls.__name__, "(")
        first = True
        num_asts = 0
        # Emit plain (non-Ast) fields inline first.
        for field in cls.fields:
            if issubclass(field.base, Ast):
                num_asts += 1
                continue
            if not first:
                result.append(", ")
            first = False
            result += (field.name, "=", str(getattr(self, field.name, None)))
        # Then emit Ast-typed fields, one per line when there are several.
        for field in cls.fields:
            if not issubclass(field.base, Ast):
                continue
            if num_asts > 1:
                result += ("\n", indent_str, " ")
            else:
                if not first:
                    result.append(", ")
            result.append(field.name)
            result.append("=")
            attr = getattr(self, field.name, None)
            if not attr:
                result.append("None")
                continue
            if field.sequence:
                result.append("[")
                seq_indent = 1 if num_asts == 1 else 2
                for ast in attr:
                    result += ("\n", indent_str, " " * seq_indent)
                    ast._imp_dump(result, indent + seq_indent)
                result += ("\n", indent_str, " ]")
            else:
                attr._imp_dump(result, indent + 1)
        result.append(")")

    def dump(self, indent=0):
        result = []
        self._imp_dump(result, indent)
        return "".join(result)
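# An AST sketch (illustrative, not part of the library): node classes declare
# their fields as class annotations, which __init_subclass__ turns into
# checked AstFields. These node shapes are assumptions for demonstration only.
class _Name(Ast):
    value: str


class _Call(Ast):
    func: _Name
    args: List[Ast]


def _example_dump() -> str:
    # Positional arguments fill fields in declaration order, and field types
    # are validated: _Call(_Name("f"), "oops") would raise TypeError.
    tree = _Call(_Name("f"), [_Name("x"), _Name("y")])
    return tree.dump()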