#-----------------------------------------------------------------------------
# ply: lex.py
#
# Author: David M. Beazley (dave@dabeaz.com)
#
# Copyright (C) 2001-2006, David M. Beazley
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# See the file COPYING for a complete copy of the LGPL.
#-----------------------------------------------------------------------------

__version__ = "2.2"

import re, sys, types

# Regular expression used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# Available instance types.  This is used when lexers are defined by a class.
# It's a little funky because I want to preserve backwards compatibility
# with Python 2.0 where types.ObjectType is undefined.

try:
    _INSTANCETYPE = (types.InstanceType, types.ObjectType)
except AttributeError:
    _INSTANCETYPE = types.InstanceType
    class object: pass       # Note: needed if no new-style classes present

# Exception thrown when invalid token encountered and no default error
# handler is defined.
class LexError(Exception):
    def __init__(self,message,s):
        self.args = (message,)
        self.text = s

# Token class
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
    def __repr__(self):
        return str(self)
    def skip(self,n):
        self.lexer.skip(n)

# -----------------------------------------------------------------------------
# Lexer class
#
# This class encapsulates all of the methods and data associated with a lexer.
#
#    input()   -  Store a new string in the lexer
#    token()   -  Get the next token
# -----------------------------------------------------------------------------

class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re,findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexdebug = 0             # Debugging mode
        self.lexoptimize = 0          # Optimized mode

    def clone(self,object=None):
        c = Lexer()
        c.lexstatere = self.lexstatere
        c.lexstateinfo = self.lexstateinfo
        c.lexstateretext = self.lexstateretext
        c.lexstate = self.lexstate
        c.lexstatestack = self.lexstatestack
        c.lexstateignore = self.lexstateignore
        c.lexstateerrorf = self.lexstateerrorf
        c.lexreflags = self.lexreflags
        c.lexdata = self.lexdata
        c.lexpos = self.lexpos
        c.lexlen = self.lexlen
        c.lextokens = self.lextokens
        c.lexdebug = self.lexdebug
        c.lineno = self.lineno
        c.lexoptimize = self.lexoptimize
        c.lexliterals = self.lexliterals
        c.lexmodule = self.lexmodule

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object,f[0].__name__),f[1]))
                    newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object

        # Set up other attributes
        c.begin(c.lexstate)
        return c
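
    # Example (illustrative only, not part of PLY's own code): clone() produces a
    # copy of this lexer that shares the compiled rules but keeps its own input,
    # position, and state, which is useful for lexing a second string reentrantly.
    # The names below (lexer, sub, other_text) are hypothetical.
    #
    #   lexer = lex.lex()              # build a lexer from the calling module
    #   sub = lexer.clone()            # independent position/state, shared rules
    #   sub.input(other_text)
    #   tok = sub.token()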

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile):
        tf = open(tabfile+".py","w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        tf.write("_lextokens = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1])))
            tabre[key] = titem

        tf.write("_lexstatere = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        exec "import %s as lextab" % tabfile
        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        if not (isinstance(s,types.StringType) or isinstance(s,types.UnicodeType)):
            raise ValueError, "Expected a string"
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        if not self.lexstatere.has_key(state):
            raise ValueError, "Undefined state"
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate
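
    # Example (illustrative only): begin(), push_state() and pop_state() implement
    # conditional lexing states.  A rule function built by lex() normally switches
    # state through the token's lexer attribute; the state name 'comment' below is
    # hypothetical and would have to be declared in the module's 'states' tuple.
    #
    #   def t_begin_comment(t):
    #       r'/\*'
    #       t.lexer.push_state('comment')      # enter the 'comment' state
    #
    #   def t_comment_end(t):
    #       r'\*/'
    #       t.lexer.pop_state()                # return to the previous state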

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        self.lexpos += n

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos    = self.lexpos
        lexlen    = self.lexlen
        lexignore = self.lexignore
        lexdata   = self.lexdata

        while lexpos < lexlen:
            # Short-circuit for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Set last match in lexer so that rules can access it if they want
                self.lexmatch = m

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos
                tok.lexer = self

                lexpos = m.end()
                i = m.lastindex
                func,tok.type = lexindexfunc[i]
                self.lexpos = lexpos

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type: return tok
                    break

                # If func is not callable, it means it's an ignored token
                if not callable(func):
                    break

                # If token is processed by a function, call it
                newtok = func(tok)

                # Every function must return a token.  If it returns nothing,
                # we just move on to the next token.
                if not newtok:
                    lexpos = self.lexpos        # This is here in case user has updated lexpos.
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if not self.lextokens.has_key(newtok.type):
                        raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func.func_code.co_filename, func.func_code.co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.lexer = self
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError, ("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError, "No input string given with input()"
        return None
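
# Example (illustrative only, not part of PLY itself): once a lexer has been
# built with lex() below, it is driven through input() and token().  The
# variable names are hypothetical.
#
#   lexer = lex.lex()                  # from a module defining tokens and t_ rules
#   lexer.input("some source text")
#   while 1:
#       tok = lexer.token()
#       if not tok: break              # token() returns None at end of input
#       print tok.type, tok.value, tok.lineno, tok.lexpos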

# -----------------------------------------------------------------------------
# _validate_file()
#
# This checks to see if there are duplicated t_rulename() functions or strings
# in the parser input file.  This is done using a simple regular expression
# match on each line in the filename.
# -----------------------------------------------------------------------------

def _validate_file(filename):
    import os.path
    base,ext = os.path.splitext(filename)
    if ext != '.py': return 1          # No idea what the file is. Return OK

    try:
        f = open(filename)
        lines = f.readlines()
        f.close()
    except IOError:
        return 1                       # Oh well

    fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
    sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
    counthash = { }
    linen = 1
    noerror = 1
    for l in lines:
        m = fre.match(l)
        if not m:
            m = sre.match(l)
        if m:
            name = m.group(1)
            prev = counthash.get(name)
            if not prev:
                counthash[name] = linen
            else:
                print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev)
                noerror = 0
        linen += 1
    return noerror

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------

def _funcs_to_names(funclist):
    result = []
    for f in funclist:
        if f and f[0]:
            result.append((f[0].__name__,f[1]))
        else:
            result.append(f)
    return result

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------

def _names_to_funcs(namelist,fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]],n[1]))
        else:
            result.append(n)
    return result

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression.  Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------

def _form_master_re(relist,reflags,ldict):
    if not relist: return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex,re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)
        for f,i in lexre.groupindex.items():
            handle = ldict.get(f,None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle,handle.__name__[2:])
            elif handle is not None:
                # If rule was specified as a string, we build an anonymous
                # callback function to carry out the action
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None,None)
                    print "IGNORE", f
                else:
                    lexindexfunc[i] = (None, f[2:])

        return [(lexre,lexindexfunc)],[regex]
    except Exception,e:
        m = int(len(relist)/2)
        if m == 0: m = 1
        llist, lre = _form_master_re(relist[:m],reflags,ldict)
        rlist, rre = _form_master_re(relist[m:],reflags,ldict)
        return llist+rlist, lre+rre
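
# Example (illustrative sketch, not part of PLY): the dispatch technique used by
# _form_master_re().  Each rule becomes a named group in one big alternation;
# after a match, m.lastindex identifies which group (and hence which rule) fired.
# The rule names t_NUMBER and t_PLUS below are hypothetical.
#
#   master = re.compile("(?P<t_NUMBER>\d+)|(?P<t_PLUS>\+)", re.VERBOSE)
#   index = [None] * (max(master.groupindex.values())+1)
#   for name, i in master.groupindex.items():
#       index[i] = name[2:]            # strip the "t_" prefix
#   m = master.match("42")
#   print index[m.lastindex]           # prints: NUMBER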
423# ----------------------------------------------------------------------------- 424 425def _form_master_re(relist,reflags,ldict): 426 if not relist: return [] 427 regex = "|".join(relist) 428 try: 429 lexre = re.compile(regex,re.VERBOSE | reflags) 430 431 # Build the index to function map for the matching engine 432 lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1) 433 for f,i in lexre.groupindex.items(): 434 handle = ldict.get(f,None) 435 if type(handle) in (types.FunctionType, types.MethodType): 436 lexindexfunc[i] = (handle,handle.__name__[2:]) 437 elif handle is not None: 438 # If rule was specified as a string, we build an anonymous 439 # callback function to carry out the action 440 if f.find("ignore_") > 0: 441 lexindexfunc[i] = (None,None) 442 print "IGNORE", f 443 else: 444 lexindexfunc[i] = (None, f[2:]) 445 446 return [(lexre,lexindexfunc)],[regex] 447 except Exception,e: 448 m = int(len(relist)/2) 449 if m == 0: m = 1 450 llist, lre = _form_master_re(relist[:m],reflags,ldict) 451 rlist, rre = _form_master_re(relist[m:],reflags,ldict) 452 return llist+rlist, lre+rre 453 454# ----------------------------------------------------------------------------- 455# def _statetoken(s,names) 456# 457# Given a declaration name s of the form "t_" and a dictionary whose keys are 458# state names, this function returns a tuple (states,tokenname) where states 459# is a tuple of state names and tokenname is the name of the token. For example, 460# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM') 461# ----------------------------------------------------------------------------- 462 463def _statetoken(s,names): 464 nonstate = 1 465 parts = s.split("_") 466 for i in range(1,len(parts)): 467 if not names.has_key(parts[i]) and parts[i] != 'ANY': break 468 if i > 1: 469 states = tuple(parts[1:i]) 470 else: 471 states = ('INITIAL',) 472 473 if 'ANY' in states: 474 states = tuple(names.keys()) 475 476 tokenname = "_".join(parts[i:]) 477 return (states,tokenname) 478 479# ----------------------------------------------------------------------------- 480# lex(module) 481# 482# Build all of the regular expression rules from definitions in the supplied module 483# ----------------------------------------------------------------------------- 484def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0): 485 global lexer 486 ldict = None 487 stateinfo = { 'INITIAL' : 'inclusive'} 488 error = 0 489 files = { } 490 lexobj = Lexer() 491 lexobj.lexdebug = debug 492 lexobj.lexoptimize = optimize 493 global token,input 494 495 if nowarn: warn = 0 496 else: warn = 1 497 498 if object: module = object 499 500 if module: 501 # User supplied a module object. 502 if isinstance(module, types.ModuleType): 503 ldict = module.__dict__ 504 elif isinstance(module, _INSTANCETYPE): 505 _items = [(k,getattr(module,k)) for k in dir(module)] 506 ldict = { } 507 for (i,v) in _items: 508 ldict[i] = v 509 else: 510 raise ValueError,"Expected a module or instance" 511 lexobj.lexmodule = module 512 513 else: 514 # No module given. We might be able to get information from the caller. 
        try:
            raise RuntimeError
        except RuntimeError:
            e,b,t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back            # Walk out to our calling function
            ldict = f.f_globals     # Grab its globals dictionary

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Get the tokens, states, and literals variables (if any)
    if (module and isinstance(module,_INSTANCETYPE)):
        tokens = getattr(module,"tokens",None)
        states = getattr(module,"states",None)
        literals = getattr(module,"literals","")
    else:
        tokens = ldict.get("tokens",None)
        states = ldict.get("states",None)
        literals = ldict.get("literals","")

    if not tokens:
        raise SyntaxError,"lex: module does not define 'tokens'"
    if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
        raise SyntaxError,"lex: tokens must be a list or tuple."

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    if not optimize:
        for n in tokens:
            if not _is_identifier.match(n):
                print "lex: Bad token name '%s'" % n
                error = 1
            if warn and lexobj.lextokens.has_key(n):
                print "lex: Warning. Token '%s' multiply defined." % n
            lexobj.lextokens[n] = None
    else:
        for n in tokens: lexobj.lextokens[n] = None

    if debug:
        print "lex: tokens = '%s'" % lexobj.lextokens.keys()

    try:
        for c in literals:
            if not (isinstance(c,types.StringType) or isinstance(c,types.UnicodeType)) or len(c) > 1:
                print "lex: Invalid literal %s. Must be a single character" % repr(c)
                error = 1
                continue

    except TypeError:
        print "lex: Invalid literals specification. literals must be a sequence of characters."
        error = 1

    lexobj.lexliterals = literals

    # Build statemap
    if states:
        if not (isinstance(states,types.TupleType) or isinstance(states,types.ListType)):
            print "lex: states must be defined as a tuple or list."
            error = 1
        else:
            for s in states:
                if not isinstance(s,types.TupleType) or len(s) != 2:
                    print "lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s)
                    error = 1
                    continue
                name, statetype = s
                if not isinstance(name,types.StringType):
                    print "lex: state name %s must be a string" % repr(name)
                    error = 1
                    continue
                if not (statetype == 'inclusive' or statetype == 'exclusive'):
                    print "lex: state type for state %s must be 'inclusive' or 'exclusive'" % name
                    error = 1
                    continue
                if stateinfo.has_key(name):
                    print "lex: state '%s' already defined." % name
                    error = 1
                    continue
                stateinfo[name] = statetype

    # Get a list of symbols with the t_ prefix
    tsymbols = [f for f in ldict.keys() if f[:2] == 't_']

    # Now build up a list of functions and a list of strings

    funcsym = { }        # Symbols defined as functions
    strsym = { }         # Symbols defined as strings
    toknames = { }       # Mapping of symbols to token names

    for s in stateinfo.keys():
        funcsym[s] = []
        strsym[s] = []

    ignore = { }         # Ignore strings by state
    errorf = { }         # Error functions by state

    if len(tsymbols) == 0:
        raise SyntaxError,"lex: no rules of the form t_rulename are defined."

    for f in tsymbols:
        t = ldict[f]
        states, tokname = _statetoken(f,stateinfo)
        toknames[f] = tokname

        if callable(t):
            for s in states: funcsym[s].append((f,t))
        elif (isinstance(t, types.StringType) or isinstance(t,types.UnicodeType)):
            for s in states: strsym[s].append((f,t))
        else:
            print "lex: %s not defined as a function or string" % f
            error = 1

    # Sort the functions by line number
    for f in funcsym.values():
        f.sort(lambda x,y: cmp(x[1].func_code.co_firstlineno,y[1].func_code.co_firstlineno))

    # Sort the strings by regular expression length
    for s in strsym.values():
        s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))

    regexs = { }

    # Build the master regular expressions
    for state in stateinfo.keys():
        regex_list = []

        # Add rules defined by functions first
        for fname, f in funcsym[state]:
            line = f.func_code.co_firstlineno
            file = f.func_code.co_filename
            files[file] = None
            tokname = toknames[fname]

            ismethod = isinstance(f, types.MethodType)

            if not optimize:
                nargs = f.func_code.co_argcount
                if ismethod:
                    reqargs = 2
                else:
                    reqargs = 1
                if nargs > reqargs:
                    print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__)
                    error = 1
                    continue

                if nargs < reqargs:
                    print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__)
                    error = 1
                    continue

                if tokname == 'ignore':
                    print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__)
                    error = 1
                    continue

            if tokname == 'error':
                errorf[state] = f
                continue

            if f.__doc__:
                if not optimize:
                    try:
                        c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags)
                        if c.match(""):
                            print "%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__)
                            error = 1
                            continue
                    except re.error,e:
                        print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e)
                        if '#' in f.__doc__:
                            print "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line,f.__name__)
                        error = 1
                        continue

                if debug:
                    print "lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__,f.__doc__, state)

                # Okay. The regular expression seemed okay.  Let's append it to the master regular
                # expression we're building

                regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__))
            else:
                print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__)

        # Now add all of the simple rules
        for name,r in strsym[state]:
            tokname = toknames[name]

            if tokname == 'ignore':
                ignore[state] = r
                continue

            if not optimize:
                if tokname == 'error':
                    raise SyntaxError,"lex: Rule '%s' must be defined as a function" % name
                    error = 1
                    continue

                if not lexobj.lextokens.has_key(tokname) and tokname.find("ignore_") < 0:
                    print "lex: Rule '%s' defined for an unspecified token %s." % (name,tokname)
                    error = 1
                    continue
                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags)
                    if (c.match("")):
                        print "lex: Regular expression for rule '%s' matches empty string." % name
                        error = 1
                        continue
                except re.error,e:
                    print "lex: Invalid regular expression for rule '%s'. %s" % (name,e)
                    if '#' in r:
                        print "lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name

                    error = 1
                    continue
            if debug:
                print "lex: Adding rule %s -> '%s' (state '%s')" % (name,r,state)

            regex_list.append("(?P<%s>%s)" % (name,r))

        if not regex_list:
            print "lex: No rules defined for state '%s'" % state
            error = 1

        regexs[state] = regex_list


    if not optimize:
        for f in files.keys():
            if not _validate_file(f):
                error = 1

    if error:
        raise SyntaxError,"lex: Unable to build lexer."

    # From this point forward, we're reasonably confident that we can build the lexer.
    # No more errors will be generated, but there might be some warning messages.

    # Build the master regular expressions

    for state in regexs.keys():
        lexre, re_text = _form_master_re(regexs[state],reflags,ldict)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        if debug:
            for i in range(len(re_text)):
                print "lex: state '%s'. regex[%d] = '%s'" % (state, i, re_text[i])

    # For inclusive states, we need to add the INITIAL state
    for state,type in stateinfo.items():
        if state != "INITIAL" and type == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]

    # Set up ignore variables
    lexobj.lexstateignore = ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = errorf
    lexobj.lexerrorf = errorf.get("INITIAL",None)
    if warn and not lexobj.lexerrorf:
        print "lex: Warning. no t_error rule is defined."

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if warn and not errorf.has_key(s):
                print "lex: Warning. no error rule is defined for exclusive state '%s'" % s
            if warn and not ignore.has_key(s) and lexobj.lexignore:
                print "lex: Warning. no ignore rule is defined for exclusive state '%s'" % s
        elif stype == 'inclusive':
            if not errorf.has_key(s):
                errorf[s] = errorf.get("INITIAL",None)
            if not ignore.has_key(s):
                ignore[s] = ignore.get("INITIAL","")


    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab)

    return lexobj
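
# Example (illustrative only): a minimal set of definitions that lex() above can
# build a lexer from.  Everything other than the required 'tokens', 't_ignore',
# and 't_error' conventions is hypothetical.  Passing optimize=1 would also write
# the generated tables out with writetab().
#
#   tokens = ('NUMBER', 'PLUS')
#
#   t_PLUS = r'\+'                     # simple rules may be plain strings
#
#   def t_NUMBER(t):                   # rules needing an action are functions;
#       r'\d+'                         # the docstring holds the regular expression
#       t.value = int(t.value)
#       return t
#
#   t_ignore = " \t"                   # characters skipped between tokens
#
#   def t_error(t):
#       print "Illegal character '%s'" % t.value[0]
#       t.lexer.skip(1)
#
#   lexer = lex.lex()                  # collects the t_ definitions from this module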

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            print "Reading from standard input (type EOF to end):"
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        print "(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno,tok.lexpos)


# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator can be used to attach a regular expression to a rule function
# when its docstring needs to be supplied in an alternative way (for example,
# built at run time).
# -----------------------------------------------------------------------------

def TOKEN(r):
    def set_doc(f):
        f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN
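
# Example (illustrative only): using the TOKEN decorator above to attach a regex
# built from another variable instead of writing it as a literal docstring.  The
# names 'identifier' and 't_ID' are hypothetical.
#
#   identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#   @TOKEN(identifier)
#   def t_ID(t):
#       return t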