cscg24-lolpython

CSCG 2024 Challenge 'Can I Haz Lolpython?'
git clone https://git.sinitax.com/sinitax/cscg24-lolpython

GardenSnake.py (19213B)


# GardenSnake - a parser generator demonstration program
#
# This implements a modified version of a subset of Python:
#  - only 'def', 'return' and 'if' statements
#  - 'if' only has 'then' clause (no elif nor else)
#  - single-quoted strings only, content in raw format
#  - numbers are decimal.Decimal instances (not integers or floats)
#  - no print statement; use the built-in 'print' function
#  - only < > == + - / * implemented (and unary + -)
#  - assignment and tuple assignment work
#  - no generators of any sort
#  - no ... well, quite a lot more is missing (see the sample below)
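# A sketch of a program the subset accepts (illustrative only, not part
# of the original distribution; the real test program is at the bottom
# of this file):
#
#   def double(n):
#       return n + n
#   if double(2) == 4:
#       print('looks right')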

# Why?  I'm thinking about a new indentation-based configuration
# language for a project and wanted to figure out how to do it.  Once
# I got that working I needed a way to test it out.  My original AST
# was dumb so I decided to target Python's AST and compile it into
# Python code.  Plus, it's pretty cool that it only took a day or so
# from sitting down with Ply to having working code.

# This uses David Beazley's Ply from http://www.dabeaz.com/ply/

# This work is hereby released into the Public Domain. To view a copy of
# the public domain dedication, visit
# http://creativecommons.org/licenses/publicdomain/ or send a letter to
# Creative Commons, 543 Howard Street, 5th Floor, San Francisco,
# California, 94105, USA.
#
# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
#          Andrew Dalke / Dalke Scientific Software, LLC
#             30 August 2006 / Cape Town, South Africa

# Changelog:
#  30 August - added link to CC license; removed the "swapcase" encoding

# Modifications for inclusion in PLY distribution
import sys
sys.path.insert(0,"../..")
from ply import *

##### Lexer ######
#import lex
import decimal
tokens = (
    'DEF',
    'IF',
    'NAME',
    'NUMBER',  # Python decimals
    'STRING',  # single quoted strings only; syntax of raw strings
    'LPAR',
    'RPAR',
    'COLON',
    'EQ',
    'ASSIGN',
    'LT',
    'GT',
    'PLUS',
    'MINUS',
    'MULT',
    'DIV',
    'RETURN',
    'WS',
    'NEWLINE',
    'COMMA',
    'SEMICOLON',
    'INDENT',
    'DEDENT',
    'ENDMARKER',
    )

#t_NUMBER = r'\d+'
# taken from decimal.py but without the leading sign
def t_NUMBER(t):
    r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
    t.value = decimal.Decimal(t.value)
    return t

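# Note that ply.lex compiles these docstring patterns with re.VERBOSE, so
# the space in "[eE][-+]? \d+" above is ignored rather than matched.
# Roughly, the rule accepts:
#   3     -> Decimal('3')
#   3.14  -> Decimal('3.14')
#   .5    -> Decimal('0.5')
#   1e10  -> Decimal('1E+10')
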
def t_STRING(t):
    r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
    t.value = t.value[1:-1].decode("string-escape")  # .swapcase() # for fun
    return t

t_COLON = r':'
t_EQ = r'=='
t_ASSIGN = r'='
t_LT = r'<'
t_GT = r'>'
t_PLUS = r'\+'
t_MINUS = r'-'
t_MULT = r'\*'
t_DIV = r'/'
t_COMMA = r','
t_SEMICOLON = r';'

# Ply nicely documented how to do this.

RESERVED = {
  "def": "DEF",
  "if": "IF",
  "return": "RETURN",
  }
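# So "def" lexes as DEF while e.g. "define" falls through to NAME; catching
# keywords inside t_NAME instead of writing one rule per keyword is the
# approach the Ply docs recommend.  As a rough sketch, "def f(x): return x"
# tokenizes as DEF NAME LPAR NAME RPAR COLON RETURN NAME.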

def t_NAME(t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    t.type = RESERVED.get(t.value, "NAME")
    return t

# Putting this before t_WS lets it consume lines with only comments in
# them, so the later code never sees the WS part.  It does not consume
# the newline itself; that is needed for "if 1: #comment".
def t_comment(t):
    r"[ ]*\043[^\n]*"  # \043 is '#'
    pass
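# Sketch of the effect: a line holding only "    # note" lexes to just a
# NEWLINE (which the indentation filter below discards as a blank line),
# while "if 1: #comment" still yields IF NUMBER COLON NEWLINE.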


# Whitespace
def t_WS(t):
    r' [ ]+ '
    if t.lexer.at_line_start and t.lexer.paren_count == 0:
        return t

# Don't generate newline tokens when inside of parentheses, e.g.
#   a = (1,
#        2, 3)
def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)
    t.type = "NEWLINE"
    if t.lexer.paren_count == 0:
        return t

def t_LPAR(t):
    r'\('
    t.lexer.paren_count += 1
    return t

def t_RPAR(t):
    r'\)'
    # check for underflow?  should be the job of the parser
    t.lexer.paren_count -= 1
    return t


def t_error(t):
    raise SyntaxError("Unknown symbol %r" % (t.value[0],))
    # The skip-and-continue alternative below is unreachable after the raise:
    #print "Skipping", repr(t.value[0])
    #t.lexer.skip(1)

## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start", which is True for WS
# and for the first non-WS/non-NEWLINE token on a line.  It flags the
# check to see if the new line has changed indentation level.

# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
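# Illustrative examples of the three states:
#   x = 1        -> NO_INDENT   (no colon, nothing may indent)
#   if 1: go()   -> MAY_INDENT  (colon, but the suite stays inline)
#   if 1:        -> MUST_INDENT (colon then newline; the next real
#       go()                     token must be indented)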

# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        yield token
        lexer.at_line_start = at_line_start

def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    return tok

# Synthesize a DEDENT token
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT token
def INDENT(lineno):
    return _new_token("INDENT", lineno)


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
##        if 1:
##            print "Process", token,
##            if token.at_line_start:
##                print "at_line_start",
##            if token.must_indent:
##                print "must_indent",
##            print

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indentation increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

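# A rough sketch of the net effect: the program
#
#   if x:
#       a = 1
#   b = 2
#
# leaves this filter as approximately
#
#   IF NAME COLON NEWLINE INDENT NAME ASSIGN NUMBER NEWLINE DEDENT
#   NAME ASSIGN NUMBER NEWLINE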

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

# Combine Ply and my filters into a new lexer

class IndentLexer(object):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.lexer = lex.lex(debug=debug, optimize=optimize, lextab=lextab, reflags=reflags)
        self.token_stream = None
    def input(self, s, add_endmarker=True):
        self.lexer.paren_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)
    def token(self):
        try:
            return self.token_stream.next()
        except StopIteration:
            return None
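# Minimal usage sketch for the combined lexer:
#
#   lexer = IndentLexer()
#   lexer.input("a = 1\n")
#   tok = lexer.token()
#   while tok is not None:
#       print tok.type,   # NAME ASSIGN NUMBER NEWLINE ENDMARKER
#       tok = lexer.token()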

##########   Parser (tokens -> AST) ######

# also part of Ply
#import yacc

# I use the Python AST
from compiler import ast

# Helper function
def Assign(left, right):
    if isinstance(left, ast.Name):
        # Single assignment on left
        return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right)
    elif isinstance(left, ast.Tuple):
        # List of things - make sure they are Name nodes
        names = []
        for child in left.getChildren():
            if not isinstance(child, ast.Name):
                raise SyntaxError("that assignment not supported")
            names.append(child.name)
        ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names]
        return ast.Assign([ast.AssTuple(ass_list)], right)
    else:
        raise SyntaxError("Can't do that yet")
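# Sketch: 'a = 1' becomes Assign([AssName('a', 'OP_ASSIGN')], Const(Decimal('1'))),
# while 'a, b = b, a' wraps both targets in a single AssTuple node.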


# The grammar comments come from Python's Grammar/Grammar file

## NB: compound_stmt in single_input is followed by extra NEWLINE!
# file_input: (NEWLINE | stmt)* ENDMARKER
def p_file_input_end(p):
    """file_input_end : file_input ENDMARKER"""
    p[0] = ast.Stmt(p[1])
def p_file_input(p):
    """file_input : file_input NEWLINE
                  | file_input stmt
                  | NEWLINE
                  | stmt"""
    if isinstance(p[len(p)-1], basestring):
        if len(p) == 3:
            p[0] = p[1]
        else:
            p[0] = [] # len(p) == 2 --> only a blank line
    else:
        if len(p) == 3:
            p[0] = p[1] + p[2]
        else:
            p[0] = p[1]


# funcdef: [decorators] 'def' NAME parameters ':' suite
# ignoring decorators
def p_funcdef(p):
    "funcdef : DEF NAME parameters COLON suite"
    p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5])

# parameters: '(' [varargslist] ')'
def p_parameters(p):
    """parameters : LPAR RPAR
                  | LPAR varargslist RPAR"""
    if len(p) == 3:
        p[0] = []
    else:
        p[0] = p[2]


# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) |
#              fpdef ['=' test] (',' fpdef ['=' test])* [',']
# highly simplified
def p_varargslist(p):
    """varargslist : varargslist COMMA NAME
                   | NAME"""
    if len(p) == 4:
        p[0] = p[1] + [p[3]]  # p[3] is a bare NAME string; wrap it in a list
    else:
        p[0] = [p[1]]

# stmt: simple_stmt | compound_stmt
def p_stmt_simple(p):
    """stmt : simple_stmt"""
    # simple_stmt is a list
    p[0] = p[1]

def p_stmt_compound(p):
    """stmt : compound_stmt"""
    p[0] = [p[1]]

# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
def p_simple_stmt(p):
    """simple_stmt : small_stmts NEWLINE
                   | small_stmts SEMICOLON NEWLINE"""
    p[0] = p[1]

def p_small_stmts(p):
    """small_stmts : small_stmts SEMICOLON small_stmt
                   | small_stmt"""
    if len(p) == 4:
        p[0] = p[1] + [p[3]]
    else:
        p[0] = [p[1]]

# small_stmt: expr_stmt | print_stmt  | del_stmt | pass_stmt | flow_stmt |
#    import_stmt | global_stmt | exec_stmt | assert_stmt
def p_small_stmt(p):
    """small_stmt : flow_stmt
                  | expr_stmt"""
    p[0] = p[1]

# expr_stmt: testlist (augassign (yield_expr|testlist) |
#                      ('=' (yield_expr|testlist))*)
# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
#             '<<=' | '>>=' | '**=' | '//=')
def p_expr_stmt(p):
    """expr_stmt : testlist ASSIGN testlist
                 | testlist """
    if len(p) == 2:
        # a list of expressions
        p[0] = ast.Discard(p[1])
    else:
        p[0] = Assign(p[1], p[3])

def p_flow_stmt(p):
    "flow_stmt : return_stmt"
    p[0] = p[1]

# return_stmt: 'return' [testlist]
def p_return_stmt(p):
    "return_stmt : RETURN testlist"
    p[0] = ast.Return(p[2])


def p_compound_stmt(p):
    """compound_stmt : if_stmt
                     | funcdef"""
    p[0] = p[1]

def p_if_stmt(p):
    'if_stmt : IF test COLON suite'
    p[0] = ast.If([(p[2], p[4])], None)

def p_suite(p):
    """suite : simple_stmt
             | NEWLINE INDENT stmts DEDENT"""
    if len(p) == 2:
        p[0] = ast.Stmt(p[1])
    else:
        p[0] = ast.Stmt(p[3])


def p_stmts(p):
    """stmts : stmts stmt
             | stmt"""
    if len(p) == 3:
        p[0] = p[1] + p[2]
    else:
        p[0] = p[1]

## Not using Python's approach because Ply supports precedence

# comparison: expr (comp_op expr)*
# arith_expr: term (('+'|'-') term)*
# term: factor (('*'|'/'|'%'|'//') factor)*
# factor: ('+'|'-'|'~') factor | power
# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'

def make_lt_compare((left, right)):
    return ast.Compare(left, [('<', right),])
def make_gt_compare((left, right)):
    return ast.Compare(left, [('>', right),])
def make_eq_compare((left, right)):
    return ast.Compare(left, [('==', right),])


binary_ops = {
    "+": ast.Add,
    "-": ast.Sub,
    "*": ast.Mul,
    "/": ast.Div,
    "<": make_lt_compare,
    ">": make_gt_compare,
    "==": make_eq_compare,
}
unary_ops = {
    "+": ast.UnaryAdd,
    "-": ast.UnarySub,
    }
precedence = (
    ("left", "EQ", "GT", "LT"),
    ("left", "PLUS", "MINUS"),
    ("left", "MULT", "DIV"),
    )
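# This table is what keeps the deliberately flat 'comparison' rule below
# unambiguous: e.g. 1 + 2 * 3 groups as 1 + (2 * 3) and a < b + c as
# a < (b + c), because MULT/DIV bind tighter than PLUS/MINUS, which in
# turn bind tighter than the comparison operators.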

def p_comparison(p):
    """comparison : comparison PLUS comparison
                  | comparison MINUS comparison
                  | comparison MULT comparison
                  | comparison DIV comparison
                  | comparison LT comparison
                  | comparison EQ comparison
                  | comparison GT comparison
                  | PLUS comparison
                  | MINUS comparison
                  | power"""
    if len(p) == 4:
        p[0] = binary_ops[p[2]]((p[1], p[3]))
    elif len(p) == 3:
        p[0] = unary_ops[p[1]](p[2])
    else:
        p[0] = p[1]

# power: atom trailer* ['**' factor]
# trailers enable function calls.  I only allow one level of calls
# so this is 'trailer'
def p_power(p):
    """power : atom
             | atom trailer"""
    if len(p) == 2:
        p[0] = p[1]
    else:
        if p[2][0] == "CALL":
            p[0] = ast.CallFunc(p[1], p[2][1], None, None)
        else:
            raise AssertionError("not implemented")

def p_atom_name(p):
    """atom : NAME"""
    p[0] = ast.Name(p[1])

def p_atom_number(p):
    """atom : NUMBER
            | STRING"""
    p[0] = ast.Const(p[1])

def p_atom_tuple(p):
    """atom : LPAR testlist RPAR"""
    p[0] = p[2]

# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
def p_trailer(p):
    "trailer : LPAR arglist RPAR"
    p[0] = ("CALL", p[2])

# testlist: test (',' test)* [',']
# Contains shift/reduce error
def p_testlist(p):
    """testlist : testlist_multi COMMA
                | testlist_multi """
    if len(p) == 2:
        p[0] = p[1]
    else:
        # May need to promote singleton to tuple
        if isinstance(p[1], list):
            p[0] = p[1]
        else:
            p[0] = [p[1]]
    # Convert into a tuple?
    if isinstance(p[0], list):
        p[0] = ast.Tuple(p[0])

def p_testlist_multi(p):
    """testlist_multi : testlist_multi COMMA test
                      | test"""
    if len(p) == 2:
        # singleton
        p[0] = p[1]
    else:
        if isinstance(p[1], list):
            p[0] = p[1] + [p[3]]
        else:
            # singleton -> tuple
            p[0] = [p[1], p[3]]


# test: or_test ['if' or_test 'else' test] | lambdef
#  as I don't support 'and', 'or', and 'not' this works down to 'comparison'
def p_test(p):
    "test : comparison"
    p[0] = p[1]



# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test)
# XXX INCOMPLETE: this doesn't allow the trailing comma
def p_arglist(p):
    """arglist : arglist COMMA argument
               | argument"""
    if len(p) == 4:
        p[0] = p[1] + [p[3]]
    else:
        p[0] = [p[1]]

# argument: test [gen_for] | test '=' test  # Really [keyword '='] test
def p_argument(p):
    "argument : test"
    p[0] = p[1]

def p_error(p):
    #print "Error!", repr(p)
    raise SyntaxError(p)


class GardenSnakeParser(object):
    def __init__(self, lexer=None):
        if lexer is None:
            lexer = IndentLexer()
        self.lexer = lexer
        self.parser = yacc.yacc(start="file_input_end")

    def parse(self, code):
        self.lexer.input(code)
        result = self.parser.parse(lexer=self.lexer)
        return ast.Module(None, result)
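# Usage sketch: GardenSnakeParser().parse("a = 1\n") returns a
# compiler.ast.Module node wrapping the parsed statement list.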


###### Code generation ######

from compiler import misc, syntax, pycodegen

class GardenSnakeCompiler(object):
    def __init__(self):
        self.parser = GardenSnakeParser()
    def compile(self, code, filename="<string>"):
        tree = self.parser.parse(code)
        #print tree
        misc.set_filename(filename, tree)
        syntax.check(tree)
        gen = pycodegen.ModuleCodeGenerator(tree)
        code = gen.getCode()
        return code
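
# Usage sketch: GardenSnakeCompiler().compile("print('hi')\n") returns a
# real Python code object, ready for exec (the test harness below does
# exactly this through the 'compile' alias).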

####### Test code #######

compile = GardenSnakeCompiler().compile

code = r"""

print('LET\'S TRY THIS \\OUT')
  
#Comment here
def x(a):
    print('called with',a)
    if a == 1:
        return 2
    if a*2 > 10: return 999 / 4
        # Another comment here

    return a+2*3

ints = (1, 2,
   3, 4,
5)
print('multiline-expression', ints)

t = 4+1/3*2+6*(9-5+1)
print('precedence test; should be 34+2/3:', t, t==(34+2/3))

print('numbers', 1,2,3,4,5)
if 1:
 8
 a=9
 print(x(a))

print(x(1))
print(x(2))
print(x(8),'3')
print('this is decimal', 1/5)
print('BIG DECIMAL', 1.234567891234567e12345)

"""

# Set up the GardenSnake run-time environment
def print_(*args):
    print "-->", " ".join(map(str,args))

globals()["print"] = print_

compiled_code = compile(code)

exec compiled_code in globals()
print "Done"