""" A regular expression based Lexer/tokenizer for TOML. """ from collections import namedtuple import re from prettytoml import tokens from prettytoml.errors import TOMLError TokenSpec = namedtuple('TokenSpec', ('type', 're')) # Specs of all the valid tokens _LEXICAL_SPECS = ( TokenSpec(tokens.TYPE_COMMENT, re.compile(r'^(#.*)\n')), TokenSpec(tokens.TYPE_STRING, re.compile(r'^("(([^"]|\\")+?[^\\]|([^"]|\\")|)")')), # Single line only TokenSpec(tokens.TYPE_MULTILINE_STRING, re.compile(r'^(""".*?""")', re.DOTALL)), TokenSpec(tokens.TYPE_LITERAL_STRING, re.compile(r"^('.*?')")), TokenSpec(tokens.TYPE_MULTILINE_LITERAL_STRING, re.compile(r"^('''.*?''')", re.DOTALL)), TokenSpec(tokens.TYPE_BARE_STRING, re.compile(r'^([A-Za-z0-9_-]+)')), TokenSpec(tokens.TYPE_DATE, re.compile( r'^([0-9]{4}-[0-9]{2}-[0-9]{2}(T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]*)?)?(([zZ])|((\+|-)[0-9]{2}:[0-9]{2}))?)')), TokenSpec(tokens.TYPE_WHITESPACE, re.compile(r'^( |\t)', re.DOTALL)), TokenSpec(tokens.TYPE_INTEGER, re.compile(r'^(((\+|-)[0-9_]+)|([0-9][0-9_]*))')), TokenSpec(tokens.TYPE_FLOAT, re.compile(r'^((((\+|-)[0-9_]+)|([1-9][0-9_]*))(\.[0-9_]+)?([eE](\+|-)?[0-9_]+)?)')), TokenSpec(tokens.TYPE_BOOLEAN, re.compile(r'^(true|false)')), TokenSpec(tokens.TYPE_OP_SQUARE_LEFT_BRACKET, re.compile(r'^(\[)')), TokenSpec(tokens.TYPE_OP_SQUARE_RIGHT_BRACKET, re.compile(r'^(\])')), TokenSpec(tokens.TYPE_OP_CURLY_LEFT_BRACKET, re.compile(r'^(\{)')), TokenSpec(tokens.TYPE_OP_CURLY_RIGHT_BRACKET, re.compile(r'^(\})')), TokenSpec(tokens.TYPE_OP_ASSIGNMENT, re.compile(r'^(=)')), TokenSpec(tokens.TYPE_OP_COMMA, re.compile(r'^(,)')), TokenSpec(tokens.TYPE_OP_DOUBLE_SQUARE_LEFT_BRACKET, re.compile(r'^(\[\[)')), TokenSpec(tokens.TYPE_OP_DOUBLE_SQUARE_RIGHT_BRACKET, re.compile(r'^(\]\])')), TokenSpec(tokens.TYPE_OPT_DOT, re.compile(r'^(\.)')), TokenSpec(tokens.TYPE_NEWLINE, re.compile('^(\n|\r\n)')), ) def _next_token_candidates(source): matches = [] for token_spec in _LEXICAL_SPECS: match = token_spec.re.search(source) if match: matches.append(tokens.Token(token_spec.type, match.group(1))) return matches def _choose_from_next_token_candidates(candidates): if len(candidates) == 1: return candidates[0] elif len(candidates) > 1: # Return the maximal-munch with ties broken by natural order of token type. maximal_munch_length = max(len(token.source_substring) for token in candidates) maximal_munches = [token for token in candidates if len(token.source_substring) == maximal_munch_length] return sorted(maximal_munches)[0] # Return the first in sorting by priority def _munch_a_token(source): """ Munches a single Token instance if it could recognize one at the beginning of the given source text, or None if no token type could be recognized. """ candidates = _next_token_candidates(source) return _choose_from_next_token_candidates(candidates) class LexerError(TOMLError): def __init__(self, message): self._message = message def __repr__(self): return self._message def __str__(self): return self._message def tokenize(source, is_top_level=False): """ Tokenizes the input TOML source into a stream of tokens. If is_top_level is set to True, will make sure that the input source has a trailing newline character before it is tokenized. Raises a LexerError when it fails recognize another token while not at the end of the source. """ # Newlines are going to be normalized to UNIX newlines. 
    source = source.replace('\r\n', '\n')

    if is_top_level and source and source[-1] != '\n':
        source += '\n'

    next_row = 1
    next_col = 1
    next_index = 0

    while next_index < len(source):

        new_token = _munch_a_token(source[next_index:])

        if not new_token:
            raise LexerError("failed to read the next token at ({}, {}): {}".format(
                next_row, next_col, source[next_index:]))

        # Set the col and row on the new token
        new_token = tokens.Token(new_token.type, new_token.source_substring, next_col, next_row)

        # Advance the index, row and col count
        next_index += len(new_token.source_substring)
        for c in new_token.source_substring:
            if c == '\n':
                next_row += 1
                next_col = 1
            else:
                next_col += 1

        yield new_token
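

# A minimal usage sketch, not part of the library itself: it assumes a prettytoml
# installation in which tokens.Token exposes the `type` and `source_substring`
# attributes used above. It simply prints the token stream for a small TOML snippet.
if __name__ == '__main__':
    sample = 'title = "TOML Example"\n\n[owner]\nname = "Tom"\n'
    for token in tokenize(sample, is_top_level=True):
        print(token.type, repr(token.source_substring))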