lolpython.py (21148B)
#!/usr/bin/env python
# Implementation of the LOLPython language.
# Converts from LOLPython to Python then optionally runs the Python.

# This package depends on PLY -- http://www.dabeaz.com/ply/

# Written by Andrew Dalke <dalke@dalkescientific.com>
# Dalke Scientific Software, LLC
# 1 June 2007, Gothenburg, Sweden
#
# This software is in the public domain. For details see:
# http://creativecommons.org/licenses/publicdomain/


import sys
import keyword
import os
import types
from cStringIO import StringIO
from ply import *


__NAME__ = "lolpython"
__VERSION__ = "1.0"

# Translating LOLPython tokens to Python tokens
# This could be cleaned up. For example, some of
# these tokens could be merged into one.
tokens = (
    "NAME",      # variable names
    "RESERVED",  # Used for Python reserved names
    "NUMBER",    # Integers and floats
    "STRING",
    "OP",        # Like the Python OP
    "CLOSE",     # Don't really need this..

    "COMMENT",
    "AUTOCALL",  # write t.value then add '('
    "INLINE",    # write t.value directly
    "FUTURE",    # for the "I FUTURE CAT WITH" statement
    "PRINT",     # VISIBLE -> stdout or COMPLAIN -> stderr

    "ENDMARKER",
    "COLON",
    "WS",
    "NEWLINE",
)

# Helper functions for making given token types.
# Each one rewrites the PLY token 't' in place and returns it.

def OP(t, value):
    # Tag the token as a Python operator with the given spelling.
    t.type = "OP"
    t.value = value
    return t

def RESERVED(t, value):
    # Tag the token as a Python reserved word (if/else/def/...).
    t.type = "RESERVED"
    t.value = value
    return t

def AUTOCALL(t, value):
    # Tag the token as an "auto-called" name: to_python() emits the
    # value followed by "(", so push the implicit close paren that a
    # later CLOSE token ("OK"/"!") will pop.
    t.type = "AUTOCALL"
    # BUG FIX: t.value was hard-coded to "tuple", silently ignoring the
    # 'value' argument. That was masked because the only current caller
    # (t_list) happens to pass "tuple", but any other caller would have
    # emitted the wrong function name.
    t.value = value
    t.lexer.paren_stack.append(")")
    return t

def INLINE(t, value):
    # Tag the token as literal Python text, written out verbatim.
    t.type = "INLINE"
    t.value = value
    return t

#####

# ply uses a large regex for token detection, and sre is limited to
# 100 groups. This grammar pushes the limit. I use (?:non-grouping)
# parens to keep the count down.
# PLY token rules. PLY tries function rules in definition order and
# compiles the patterns with re.VERBOSE, so literal spaces inside the
# docstring patterns (outside character classes) are ignored.

def t_ASSIGN(t):              # cannot be a simple pattern because it must
    r'CAN[ ]+HA[SZ]\b'        # come before the t_NAME definition
    return OP(t, "=")

def t_SINGLE_QUOTE_STRING(t):
    r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
    t.type = "STRING"
    t.value = t.value[1:-1].decode("string-escape")
    return t

def t_DOUBLE_QUOTE_STRING(t):
    r'"([^\\"]+|\\"|\\\\)*"'
    t.type = "STRING"
    t.value = t.value[1:-1].decode("string-escape")
    # BUG FIX: removed a stray debug print(t.value) that wrote every
    # double-quoted string literal to stdout during lexing.
    return t

# and LOL quoted strings! They end with /LOL
# No way to have "/LOL" in the string.
def t_LOL_STRING(t):
    r"LOL[ ]*((?!/LOL).|\n)*[ ]*/LOL"
    t.type = "STRING"
    # strip the "LOL"/"/LOL" delimiters and surrounding spaces
    t.value = t.value[3:-4].strip(" ")
    return t

# Aliases for the same thing - for extra cuteness
def t_LSQUARE(t):
    r"(?:SOME|LOOK[ ]AT|LET[ ]+THE)\b"
    t.lexer.paren_stack.append(']')
    return OP(t, "[")

def t_LPAREN(t):
    r"(?:WIT|THEZ)\b"
    t.lexer.paren_stack.append(')')
    return OP(t, "(")

def t_LBRACE(t):
    r"BUCKET\b"
    t.lexer.paren_stack.append("}")
    return OP(t, "{")

def t_CLOSE(t):
    r"(?:OK(!+|\b)|!+)"
    stack = t.lexer.paren_stack
    if t.value.startswith("OK"):
        num_closes = len(t.value)-1  # OK -> 1, OK! -> 2, OK!!->3
    else:
        num_closes = len(t.value)    # ! -> 1, !! -> 2
    # Which close is this?  I use "OK" to match (, [ and {
    if len(stack) < num_closes:
        raise AssertionError("not enough opens on the stack: line %d"
                             % (t.lineno,))
    # pop the innermost num_closes closers, innermost first
    t.value = "".join(stack[-num_closes:][::-1])
    del stack[-num_closes:]
    return t

def t_EQ(t):
    r"KINDA[ ]+LIKE\b"
    return OP(t, "==")

def t_NE(t):
    r"(?:KINDA[ ]+)?NOT[ ]+LIKE\b"
    return OP(t, "!=")

def t_is(t):
    r"KINDA[ ]+IS\b"
    return RESERVED(t, "is")

def t_GT(t):
    r"ATE[ ]+MORE[ ]+CHEEZBURGERS?[ ]+THAN\b"
    return OP(t, ">")

def t_LT(t):
    r"ATE[ ]+FEWER[ ]+CHEEZBURGERS?[ ]+THAN\b"
    return OP(t, "<")

def t_GTE(t):
    r"BIG[ ]+LIKE\b"
    return OP(t, ">=")

def t_LTE(t):
    r"SMALL[ ]+LIKE\b"
    return OP(t, "<=")

def t_RETURN(t):
    r"U[ ]+TAKE\b"
    return RESERVED(t, "return")

def t_yield(t):
    r"U[ ]+BORROW\b"
    return RESERVED(t, "yield")

def t_ELIF(t):
    r"OR[ ]+IZ\b"
    return RESERVED(t, "elif")

def t_ELSE(t):
    r"(?:(?:I[ ]+GIVE[ ]+UP|IZ[ ]+KEWL|ALL[ ]+DONE)|NOPE)\b"
    return RESERVED(t, "else")

def t_COLON(t):
    r"\?"
    t.value = ":"
    return t

def t_FROM(t):
    r"IN[ ]+MAI\b"
    return RESERVED(t, "from")

def t_EXCEPT(t):
    r"O[ ]+NOES\b"
    return RESERVED(t, "except")

def t_PLUS(t):
    r"ALONG[ ]+WITH\b"
    return OP(t, "+")

def t_MINUS(t):
    r"TAKE[ ]+AWAY\b"
    return OP(t, "-")

def t_PLUS_EQUAL(t):
    r"GETZ[ ]+ANOTHR\b"
    return OP(t, "+=")

def t_MINUS_EQUAL(t):
    r"THROW[SZ]?[ ]+AWAY\b"
    return OP(t, "-=")

def t_DIV(t):
    r"SMASHES[ ]+INTO\b"
    return OP(t, "/")

def t_DIV_EQUAL(t):
    r"SMASHES[ ]+INTO[ ]+HAS\b"
    return OP(t, "/=")

def t_TRUEDIV(t):
    # NOTE(review): the name says TRUEDIV but "//" is floor division;
    # keeping the original mapping.
    r"SMASHES[ ]+NICELY[ ]+INTO\b"
    return OP(t, "//")

def t_MUL(t):
    r"OF[ ]THOSE\b"
    return OP(t, "*")

def t_MUL_EQUAL(t):
    r"COPIES[ ]+(?:HIM|HER|IT)SELF[ ]+BY\b"
    return OP(t, "*=")

def t_POW(t):
    r"BY[ ]+GRAYSKULL[ ]+POWER"
    return OP(t, "**")

def t_IN(t):
    r"IN[ ]+(?:UR|THE|THIS)\b"
    return OP(t, "in")

def t_del(t):
    r"DO[ ]+NOT[ ]+WANT\b"
    return RESERVED(t, "del")

def t_and(t):
    r"\&"
    return RESERVED(t, "and")

def t_or(t):
    r"OR[ ]+MABEE\b"
    return RESERVED(t, "or")

def t_pass(t):
    r"I[ ]+IZ[ ]+CUTE\b"
    return RESERVED(t, "pass")

def t_forever(t):
    r"WHILE[ ]+I[ ]+CUTE\b"
    return INLINE(t, "while 1")

def t_def(t):
    r"SO[ ]+IM[ ]+LIKE\b"
    return RESERVED(t, "def")

def t_class(t):
    r"ME[ ]+MAKE[ ]\b"
    return RESERVED(t, "class")

def t_future(t):
    r"I[ ]+FUTURE[ ]+CAT[ ]+WITH\b"
    t.type = "FUTURE"
    return t

def t_assert(t):
    r"SO[ ]+GOOD\b"
    return RESERVED(t, "assert")

def t_assert_not(t):
    r"AINT[ ]+GOOD\b"
    return INLINE(t, "assert not ")

def t_for(t):
    r"GIMME[ ]+EACH\b"
    return RESERVED(t, "for")

def t_list(t):
    r"ALL[ ]+OF\b"
    return AUTOCALL(t, "tuple")

# Keyword -> (token type, Python text) table used by t_NAME for the
# words that don't need their own regex rule.
RESERVED_VALUES = {
    "EASTERBUNNY": ("NUMBER", "0"),
    "CHEEZBURGER": ("NUMBER", "1"),
    "CHOKOLET": ("NUMBER", "-1"),
    "TWIN": ("NUMBER", "2"),
    "TWINZ": ("NUMBER", "2"),
    "TWINS": ("NUMBER", "2"),
    "EVILTWIN": ("NUMBER", "-2"),
    "EVILTWINZ": ("NUMBER", "-2"),
    "EVILTWINS": ("NUMBER", "-2"),
    "ALLFINGERZ": ("NUMBER", "10"),
    "TOEZ": ("NUMBER", "-10"),
    "ONE": ("NUMBER", "1"),
    "ONCE": ("NUMBER", "1"),
    "TWO": ("NUMBER", "2"),
    "TWICE": ("NUMBER", "2"),
    "THR33": ("NUMBER", "3"),
    "FOUR": ("NUMBER", "4"),
    "FIV": ("NUMBER", "5"),
    "SIKS": ("NUMBER", "6"),
    "SEVN": ("NUMBER", "7"),
    "ATE": ("NUMBER", "8"),
    "NINE": ("NUMBER", "9"),
    "MEH": ("NAME", "False"),
    "YEAH": ("NAME", "True"),
    "VISIBLE": ("PRINT", "stdout"),
    "COMPLAIN": ("PRINT", "stderr"),
    "AND": ("OP", ","),
    "BLACKHOLE": ("RESERVED", "ZeroDivisionError"),
    "DONOTLIKE": ("AUTOCALL", "AssertionError"),

    "ANTI": ("OP", "-"),
    "IZ": ("RESERVED", "if"),
    "GIMME": ("RESERVED", "import"),
    "LIKE": ("RESERVED", "as"),
    "OWN": ("OP", "."),

    "PLZ": ("RESERVED", "try"),
    "HALP": ("RESERVED", "raise"),
    "WHATEVER": ("RESERVED", "finally"),
    "KTHX": ("RESERVED", "continue"),
    "KTHXBYE": ("RESERVED", "break"),

    "OVER": ("OP", "/"),

    "AINT": ("RESERVED", "not"),
    "ME": ("RESERVED", "self"),

    "STRING": ("AUTOCALL", "str"),
    "NUMBR": ("AUTOCALL", "int"),
    "BIGNESS": ("AUTOCALL", "len"),
    "NUMBRZ": ("AUTOCALL", "range"),
    "ADDED": ("AUTOCALL", ".append"),

    "ARGZ": ("INLINE", "_lol_sys.argv"),
    "THINGZ": ("INLINE", "()"),  # invisible tuple didn't sound right
    "THING": ("INLINE", "()"),   # sometimes it's better in singular form
    "MY": ("INLINE", "self."),
    "MYSELF": ("INLINE", "(self)"),

    "EVEN": ("INLINE", "% 2 == 0"),
    "ODD": ("INLINE", "% 2 == 1"),
    "WIF": ("RESERVED", "with"),
    }

def t_FLOAT(t):
    r"""(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"""
    # keep the literal text; Python's own parser handles the value
    t.type = "NUMBER"
    return t

def t_INT(t):
    r"\d+"
    t.type = "NUMBER"
    return t

def t_INVISIBLE(t):
    r"INVISIBLE([ ]+(LIST|STRING|BUCKET))?\b"
    if "LIST" in t.value:
        t.type = "INLINE"
        t.value = "[]"
    elif "STRING" in t.value:
        t.type = "INLINE"
        t.value = '""'
    elif "BUCKET" in t.value:
        t.type = "INLINE"
        t.value = "{}"
    else:
        # plain INVISIBLE is None
        RESERVED(t, "None")
    return t

# Not consuming the newline. Needed for "IZ EASTERBUNNY? BTW comment"
def t_COMMENT(t):
    r"[ ]*(?:BTW|WTF)[^\n]*"
    return t

def t_NAME(t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    if t.value in RESERVED_VALUES:
        type, value = RESERVED_VALUES[t.value]
        t.type = type
        t.value = value
        if t.type == "AUTOCALL":
            # table-driven autocalls also open an implicit paren
            t.lexer.paren_stack.append(")")
    return t

def t_WS(t):
    r' [ ]+ '
    # Whitespace is only significant at the start of a line, and never
    # inside parens.
    if t.lexer.at_line_start and not t.lexer.paren_stack:
        return t


# Don't generate newline tokens when inside of parens
def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)
    t.type = "NEWLINE"
    if not t.lexer.paren_stack:
        return t


def t_error(t):
    # BUG FIX: removed unreachable skip-and-continue code that followed
    # this raise (dead statements left over from development).
    raise SyntaxError("Unknown symbol %r" % (t.value[0],))


## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line. It flags the check to
# see if the new line has changed indentation level.
# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    # Tag every token with "at_line_start" and "must_indent" so the
    # downstream indentation filter can do its job.  Also mirrors the
    # final line-start state back onto the lexer.
    lexer.at_line_start = line_start = True
    state = NO_INDENT
    for tok in tokens:
        tok.at_line_start = line_start
        kind = tok.type

        if kind == "COLON":
            # a colon arms the "maybe an indented block follows" state
            line_start = False
            state = MAY_INDENT
            tok.must_indent = False
        elif kind == "NEWLINE":
            # COLON followed by NEWLINE means the next real token
            # must start an indented block
            line_start = True
            if state == MAY_INDENT:
                state = MUST_INDENT
            tok.must_indent = False
        elif kind == "WS":
            # whitespace only appears at the start of a line
            assert tok.at_line_start
            line_start = True
            tok.must_indent = False
        elif kind == "COMMENT":
            # comments never affect indentation state
            pass
        else:
            # a real token; only indent after COLON NEWLINE
            tok.must_indent = (state == MUST_INDENT)
            line_start = False
            state = NO_INDENT

        yield tok
    lexer.at_line_start = line_start

def _new_token(token_type, lineno):
    # Build a synthetic PLY token with no source position.
    tok = lex.LexToken()
    tok.type = token_type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # Consume the tagged token stream and turn depth changes at the
    # start of each logical line into synthetic INDENT/DEDENT tokens,
    # the way Python's own tokenizer does.  Raises IndentationError on
    # inconsistent indentation.
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # Don't forward WS to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        if token.type == "COMMENT":
            # comments are forwarded untouched; they don't change depth
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indentation increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                # emit one DEDENT per abandoned level
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def token_filter(lexer, add_endmarker = True):
    # Chain the raw lexer through the tagging and indentation filters;
    # iter(lexer.token, None) pulls tokens until the lexer is exhausted.
    token = None
    tokens = iter(lexer.token, None)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

class LOLLexer(object):
    # Wraps the PLY lexer so that token() yields the fully filtered
    # stream (tagged, INDENT/DEDENT'ed, ENDMARKER'ed).
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.lexer = lex.lex(debug=debug, optimize=optimize,
                             lextab=lextab, reflags=reflags)
        self.token_stream = None
    def input(self, s, add_endmarker=True):
        # paren_stack tracks open (/[/{ so WS/NEWLINE can be suppressed
        # inside brackets; the t_* rules push and t_CLOSE pops.
        self.lexer.paren_stack = []
        self.lexer.input(s)
        self.token_stream = token_filter(self.lexer, add_endmarker)
    def token(self):
        # Python 2 generator protocol (.next()); returns None at EOF
        try:
            return self.token_stream.next()
        except StopIteration:
            return None

# Helper class to generate logically correct indented Python code
class IndentWriter(object):
    # at_first_column: True when the next write starts a fresh line and
    # must be preceded by the current indentation (one space per level).
    def __init__(self, outfile):
        self.outfile = outfile
        self.at_first_column = True
        self.indent = 0
    def write(self, text):
        if self.at_first_column:
            self.outfile.write(" "*self.indent)
            self.at_first_column = False
        self.outfile.write(text)

# Split things up because the from __future__ statements must
# go before any other code.
HEADER = """# LOLPython to Python converter version 1.0
# Written by Andrew Dalke, who should have been working on better things.

"""

BODY = """
# sys is used for COMPLAIN and ARGZ
import sys as _lol_sys

"""

def to_python(s):
    # Translate LOLPython source text 's' into Python source text.
    # Emits into two buffers: "header" (only "from __future__ import"
    # lines, via the FUTURE token) and "body" (everything else), then
    # concatenates them so the __future__ imports come first.
    L = LOLLexer()
    L.input(s)

    header = StringIO()
    header.write(HEADER)
    header_output = IndentWriter(header)

    body = StringIO()
    body.write(BODY)
    body_output = IndentWriter(body)

    write = body_output.write
    output = body_output

    for t in iter(L.token_stream):
        if t.type == "NAME":
            # Need to escape names which are Python variables Do that
            # by appending an "_". But then I also need to make sure
            # that "yield_" does not collide with "yield". And you
            # thought you were being clever trying to use a Python
            # variable. :)
            name = t.value.rstrip("_")
            if name in keyword.kwlist:
                write(t.value + "_ ")
            else:
                write(t.value + " ")

        elif t.type in ("RESERVED", "OP", "NUMBER", "CLOSE"):
            # While not pretty, I'll put a space after each
            # term because it's the simplest solution. Otherwise
            # I'll need to track the amount of whitespace between
            # the tokens in the original text.
            write(t.value+" ")

        # XXX escape names which are special in Python!
        elif t.type == "STRING":
            write(repr(t.value) + " ")

        elif t.type == "COMMENT":
            # Not enough information to keep comments on the correct
            # indentation level. This is good enough. Ugly though.
            # Maybe I need to fix the tokenizer.
            # NOTE(review): [3:] assumes the token starts directly with
            # "BTW"/"WTF"; leading spaces would shift the slice.
            write("#"+ t.value[3:]+"\n")
            output.at_first_column = True

        elif t.type == "COLON":
            write(":")

        elif t.type == "INDENT":
            output.indent += 1
            pass
        elif t.type == "DEDENT":
            output.indent -= 1
            pass
        elif t.type == "NEWLINE":
            write(t.value)
            output.at_first_column = True
            # any FUTURE redirection ends at the end of the line
            output = body_output
            write = output.write
        elif t.type == "PRINT":
            # Python 2 print statement forms for VISIBLE / COMPLAIN
            if t.value == "stdout":
                write("print ")
            elif t.value == "stderr":
                write("print >>_lol_sys.stderr, ")
            else:
                raise AssertionError(t.value)
        elif t.type == "AUTOCALL":
            write(t.value + "(")
        elif t.type == "INLINE":
            write(t.value)
        elif t.type == "ENDMARKER":
            write("\n# The end.\n")
        elif t.type == "WS":
            # NOTE(review): the filters never forward WS, so this branch
            # looks unreachable; kept for safety.
            output.leading_ws = t.value
        elif t.type == "FUTURE":
            # Write to the header. This is a hack. Err, a hairball.
            output = header_output
            write = output.write
            write("from __future__ import ")

        else:
            raise AssertionError(t.type)

    return header.getvalue() + body.getvalue()


# API code for doing the translation and exec'ing the result

def execfile(infile, module_name="__lolmain__"):
    "file, module_name -- exec the lolpython file in a newly created module"
    # accepts either a file-like object or a filename
    if not hasattr(infile, "read"):
        s = open(infile).read()
    else:
        s = infile.read()
    return execstring(s, module_name)

def execstring(s, module_name="__lolmain__"):
    "s, module_name -- exec the lolpython string in a newly created module"
    python_s = to_python(s)
    # Doing this bit of trickiness so I can have LOLPython code act
    # like __main__. This fix is enough to fool unittest.
    m = types.ModuleType(module_name)
    sys.modules[module_name] = m
    exec python_s in m.__dict__
    return m

def convert_file(infile, outfile):
    "read LOLPython code from infile, write converted Python code to outfile"
    if not hasattr(outfile, "write"):
        outfile = open(outfile, "w")
    outfile.write(to_python(infile.read()))

def convert(filenames):
    "convert LOLPython filenames into corresponding Python '.py' files"
    if not filenames:
        # no filenames: act as a stdin->stdout filter
        convert_file(sys.stdin, sys.stdout)
    else:
        for filename in filenames:
            base, ext = os.path.splitext(filename)
            convert_file(open(filename), open(base+".py", "w"))

def help():
    # print the command-line usage text
    print """convert and run a lolpython program
Commands are:
  lolpython             Read a lolpython program from stdin and execute it
  lolpython --convert   Convert a lolpython program from stdin
                            and generate python to stdout
  lolpython --convert filename1 [filename....]
                        Convert a list of lolpython files into Python files
  lolpython filename [arg1 [arg2 ...]]
                        Run a lolpython program using optional arguments
"""

def main(argv):
    # command-line entry point: dispatch on the first argument
    if len(argv) >= 2:
        if argv[1] == "--convert":
            convert(argv[2:])
            return
        if argv[1] == "--help":
            help()
            return
        if argv[1] == "--version":
            print __NAME__ + " " + __VERSION__
            return

        # otherwise, run the lolpython program; shift argv so the
        # LOLPython program sees its own arguments
        sys.argv = sys.argv[1:]
        filename = sys.argv[0]
        execfile(filename, "__main__")
    else:
        # commands from stdin
        execfile(sys.stdin)



if __name__ == "__main__":
    main(sys.argv)