diff options
Diffstat (limited to 'bitbake/lib/bb/pysh/pyshlex.py')
| -rw-r--r-- | bitbake/lib/bb/pysh/pyshlex.py | 883 |
1 files changed, 0 insertions, 883 deletions
diff --git a/bitbake/lib/bb/pysh/pyshlex.py b/bitbake/lib/bb/pysh/pyshlex.py deleted file mode 100644 index a42c294464..0000000000 --- a/bitbake/lib/bb/pysh/pyshlex.py +++ /dev/null | |||
| @@ -1,883 +0,0 @@ | |||
| 1 | # pyshlex.py - PLY compatible lexer for pysh. | ||
| 2 | # | ||
| 3 | # Copyright 2007 Patrick Mezard | ||
| 4 | # | ||
| 5 | # This software may be used and distributed according to the terms | ||
| 6 | # of the GNU General Public License, incorporated herein by reference. | ||
| 7 | |||
| 8 | # TODO: | ||
| 9 | # - review all "char in 'abc'" snippets: the empty string can be matched | ||
| 10 | # - test line continuations within quoted/expansion strings | ||
| 11 | # - eof is buggy wrt sublexers | ||
| 12 | # - the lexer cannot really work in pull mode as it would be required to run | ||
| 13 | # PLY in pull mode. It was designed to work incrementally and it would not be | ||
| 14 | # that hard to enable pull mode. | ||
| 15 | import re | ||
| 16 | |||
| 17 | from ply import lex | ||
| 18 | from bb.pysh.sherrors import * | ||
| 19 | |||
| 20 | class NeedMore(Exception): | ||
| 21 | pass | ||
| 22 | |||
def is_blank(c):
    """Return True when c is a blank, i.e. a space or a horizontal tab."""
    return c == ' ' or c == '\t'
| 25 | |||
# One or more digits and nothing else. \Z anchors at the very end of the
# string, whereas $ would also match just before a trailing newline.
_RE_DIGITS = re.compile(r'^\d+\Z')

def are_digits(s):
    """Return True if s is a non-empty string made only of digits.

    A trailing newline is not tolerated: are_digits('12\\n') is False.
    """
    return _RE_DIGITS.search(s) is not None
| 30 | |||
# Maps every shell operator string to its token type identifier.
_OPERATORS = {
    '&&': 'AND_IF',
    '||': 'OR_IF',
    ';;': 'DSEMI',
    '<<': 'DLESS',
    '>>': 'DGREAT',
    '<&': 'LESSAND',
    '>&': 'GREATAND',
    '<>': 'LESSGREAT',
    '<<-': 'DLESSDASH',
    '>|': 'CLOBBER',
    '&': 'AMP',
    ';': 'COMMA',
    '<': 'LESS',
    '>': 'GREATER',
    '(': 'LPARENS',
    ')': 'RPARENS',
}
| 49 | |||
#Built inside a function to silence pychecker "Local variable shadows global"
def make_partial_ops():
    """Return a dict whose keys are all the non-empty prefixes of the
    operators listed in _OPERATORS. Values are unused and set to None.
    """
    prefixes = {}
    for operator in _OPERATORS:
        for last in range(len(operator)):
            prefixes[operator[:last + 1]] = None
    return prefixes

_PARTIAL_OPERATORS = make_partial_ops()
| 59 | |||
def is_partial_op(s):
    """Tell whether s is a non-empty leading part of some operator, the
    complete operator included.
    """
    known_prefix = s in _PARTIAL_OPERATORS
    return known_prefix
| 65 | |||
def is_op(s):
    """Return the operator identifier matching s, or None when s is not an
    operator.
    """
    try:
        return _OPERATORS[s]
    except KeyError:
        return None
| 71 | |||
# Maps reserved words (and the pipe symbol) to their token type identifier.
_RESERVEDS = {
    'if': 'If',
    'then': 'Then',
    'else': 'Else',
    'elif': 'Elif',
    'fi': 'Fi',
    'do': 'Do',
    'done': 'Done',
    'case': 'Case',
    'esac': 'Esac',
    'while': 'While',
    'until': 'Until',
    'for': 'For',
    '{': 'Lbrace',
    '}': 'Rbrace',
    '!': 'Bang',
    'in': 'In',
    '|': 'PIPE',
}
| 91 | |||
def get_reserved(s):
    """Return the reserved word identifier matching s, or None if s is not
    reserved.
    """
    try:
        return _RESERVEDS[s]
    except KeyError:
        return None
| 94 | |||
# One or more of [0-9a-zA-Z_] and nothing else. \Z anchors at the very end of
# the string, whereas $ would also match just before a trailing newline.
# NOTE(review): a leading digit is accepted here; left unchanged so that
# token classification keeps behaving as before.
_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+\Z')

def is_name(s):
    """Return True if s is non-empty and only made of ASCII letters, digits
    and underscores.
    """
    return _RE_NAME.search(s) is not None
| 99 | |||
def find_chars(seq, chars):
    """Locate the first element of seq which belongs to chars.

    Return a tuple (index, element), or (-1, None) when no element matches.
    """
    index = 0
    for item in seq:
        if item in chars:
            return index, item
        index += 1
    return -1, None
| 105 | |||
class WordLexer:
    """Incremental lexer for one quoted or expansion expression.

    The input must begin with a quoting or expansion character (single quote,
    double quote, backslash, backquote or dollar). Embedded expressions are
    handled recursively. The outcome is an expression tree made of lists and
    strings: a list stands for a quoted or expansion expression, its first
    element being the opening separator and its last one the closing
    separator, with any number of strings or sub-lists in between. Non
    quoted/expansion expressions can be written as strings or as lists using
    empty strings as opening and closing delimiters.
    """

    NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    # Turned into a dict for quick membership tests
    NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))

    # Single characters accepted right after a '$'
    SPECIAL_CHARSET = '@*#?-$!0'

    # Which characters a backslash can escape depends on the enclosing
    # delimiter. Delimiters missing from this table allow any character.
    ESCAPABLE = {
        '`': set('$\\`'),
        '"': set('$\\`"'),
        "'": set(),
    }

    def __init__(self, heredoc = False):
        # True when parsing unquoted here documents
        self._heredoc = heredoc
        # Input characters not processed yet
        self._buffer = []
        # Either empty or the DFS path of the lists leading to the
        # expression being evaluated
        self._stack = []
        self._escapable = None

    def add(self, data, eof=False):
        """Feed the lexer with more data.

        When the expression can be delimited, return a tuple
        (expr, remaining) holding the expression tree and the unconsumed
        data, and reset the lexer. Raise NeedMore otherwise.
        """
        self._buffer.extend(data)
        self._parse(eof)

        tree = self._stack[0]
        leftover = ''.join(self._buffer)
        self._buffer = []
        self._stack = []
        return tree, leftover

    def _is_escapable(self, c, delim=None):
        """Tell whether a backslash followed by c is an escape sequence
        within delim. Without delim, the enclosing expression is used.
        """
        if delim is None:
            if self._heredoc:
                # Backslashes work as if they were double quoted in
                # unquoted here-documents
                delim = '"'
            elif len(self._stack) <= 1:
                # Not enclosed in anything, everything can be escaped
                return True
            else:
                delim = self._stack[-2][0]

        allowed = self.ESCAPABLE.get(delim)
        if allowed is None:
            return True
        return c in allowed

    def _parse_squote(self, buf, result, eof):
        """Consume a single quoted string up to its closing quote."""
        if not buf:
            raise NeedMore()
        if "'" not in buf:
            raise NeedMore()
        end = buf.index("'")
        result[-1] += ''.join(buf[:end])
        result += ["'"]
        return end + 1, True

    def _parse_bquote(self, buf, result, eof):
        """Consume the character following a backslash."""
        if not buf:
            raise NeedMore()

        following = buf[0]
        if following == '\n':
            #Line continuation, drop it
            result[:] = ['', '', '']
        elif self._is_escapable(following):
            result[-1] += following
            result += ['']
        else:
            #Not an escape sequence, keep the backslash as such
            result[:] = ['', '\\' + following, '']

        return 1, True

    def _parse_dquote(self, buf, result, eof):
        """Consume a double quoted string up to its closing quote or to the
        next embedded expression.
        """
        if not buf:
            raise NeedMore()
        stop, found = find_chars(buf, '$\\`"')
        if stop == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:stop])
        if found != '"':
            #Keep everything until the separator and defer processing
            return stop, False
        result += ['"']
        return stop + 1, True

    def _parse_command(self, buf, result, eof):
        """Consume a command substitution, either `...` or $(...), up to its
        closing separator or to the next embedded expression.
        """
        if not buf:
            raise NeedMore()

        opener = result[0]
        stops = '$\\`"\''
        if opener == '$(':
            stops += ')'
        stop, found = find_chars(buf, stops)
        if stop == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:stop])
        closing = (opener == '$(' and found == ')') or (opener == '`' and found == '`')
        if not closing:
            return stop, False
        result += [found]
        return stop + 1, True

    def _parse_parameter(self, buf, result, eof):
        """Consume a ${...} expansion up to its closing brace or to the next
        embedded expression.
        """
        if not buf:
            raise NeedMore()

        stop, found = find_chars(buf, '$\\`"\'}')
        if stop == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:stop])
        if found != '}':
            return stop, False
        result += [found]
        return stop + 1, True

    def _parse_dollar(self, buf, result, eof):
        """Consume an expression starting with a dollar.

        A bare '$' is first refined into '$(', '$((' or '${' when followed
        by the matching character, otherwise a special parameter or a name
        is read and the expression is closed. '$(' and '${' are delegated to
        _parse_command and _parse_parameter, anything else ('$((' included)
        raises NotImplementedError.
        """
        if result[0] == '$':
            if not buf:
                #TODO: handle empty $
                raise NeedMore()

            first = buf[0]
            if first == '(':
                if len(buf) == 1:
                    # Cannot tell '$(' from '$((' yet
                    raise NeedMore()

                if buf[1] == '(':
                    result[0] = '$(('
                    del buf[:2]
                else:
                    result[0] = '$('
                    del buf[:1]

            elif first == '{':
                result[0] = '${'
                del buf[:1]
            else:
                if first in self.SPECIAL_CHARSET:
                    result[-1] = first
                    read = 1
                else:
                    for read, c in enumerate(buf):
                        if c not in self.NAME_CHARSET:
                            break
                    else:
                        # The whole buffer is a name, it may continue in
                        # the next chunk unless the input is over
                        if not eof:
                            raise NeedMore()
                        read += 1

                    result[-1] += ''.join(buf[0:read])

                if result[-1]:
                    result += ['']
                else:
                    # Lonely dollar, turn it into a literal
                    result[:] = ['', result[0], '']
                return read, True

        opener = result[0]
        if opener == '$(':
            handler = self._parse_command
        elif opener == '${':
            handler = self._parse_parameter
        else:
            raise NotImplementedError(opener)

        return handler(buf, result, eof)

    def _parse(self, eof):
        """Process buffered characters until the outermost expression is
        closed. Raise NeedMore when the buffer runs out before that.
        """
        buf = self._buffer
        stack = self._stack
        descend = False

        while True:
            if descend or not stack:
                # A new (sub-)expression starts at the head of the buffer
                if not buf:
                    raise NeedMore()
                if buf[0] not in '"\\`$\'':
                    raise ShellSyntaxError('Invalid quoted string sequence')
                stack.append([buf.pop(0), ''])
                descend = False

            current = stack[-1]
            opener = current[0]
            if opener == "'":
                handler = self._parse_squote
            elif opener == '\\':
                handler = self._parse_bquote
            elif opener == '"':
                handler = self._parse_dquote
            elif opener == '`':
                handler = self._parse_command
            elif opener[0] == '$':
                handler = self._parse_dollar
            else:
                raise NotImplementedError()

            consumed, closed = handler(buf, current, eof)
            del buf[:consumed]

            if not closed:
                # The handler stopped on an embedded expression
                descend = True
                continue
            if len(stack) <= 1:
                break
            #Merge in parent expression
            finished = stack.pop()
            stack[-1].extend([finished, ''])
| 337 | |||
def normalize_wordtree(wtree):
    """Return a copy of wtree where every literal sequence (a list delimited
    with empty strings) is folded back into its parent sequence and where
    empty strings are dropped. A sequence left without content gets a single
    empty string between its delimiters.
    """
    folded = []
    for node in wtree[1:-1]:
        if isinstance(node, list):
            node = normalize_wordtree(node)
            if node[0] == '':
                #Move the literal content back at the current level
                folded.extend(node[1:-1])
            else:
                folded.append(node)
        elif node:
            #Empty strings are skipped
            folded.append(node)
    return [wtree[0]] + (folded or ['']) + [wtree[-1]]
| 360 | |||
| 361 | |||
def make_wordtree(token, here_document=False):
    """Parse a delimited token and return a tree similar to the ones returned
    by WordLexer. token may contain any combination of expansion/quoted
    fields and plain ones.

    When here_document is true, single and double quotes are not treated as
    delimiters.

    Raise ShellSyntaxError if a quoted or expansion field is left unclosed.
    """
    tree = ['']
    remaining = token
    delimiters = '\\$`'
    if not here_document:
        delimiters += '\'"'

    while 1:
        pos, sep = find_chars(remaining, delimiters)
        if pos == -1:
            # Only literal text is left
            tree += [remaining, '']
            return normalize_wordtree(tree)
        tree.append(remaining[:pos])
        remaining = remaining[pos:]

        try:
            # eof is set: the token is complete, nothing more will come
            result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
        except NeedMore:
            # The message used to be raised with its "%s" left unformatted
            raise ShellSyntaxError('Invalid token "%s"' % token)
        tree.append(result)
| 386 | |||
| 387 | |||
def wordtree_as_string(wtree):
    """Turn an expression tree, as generated by make_wordtree, back into a
    string by concatenating all its leaves in depth-first order.
    """
    def leaves(node):
        for child in node:
            if isinstance(child, list):
                for leaf in leaves(child):
                    yield leaf
            else:
                yield child

    return ''.join(leaves(wtree))
| 400 | |||
| 401 | |||
def unquote_wordtree(wtree):
    """Flatten the word tree into a string, dropping the delimiters of
    literal, single quoted, double quoted and backslash sequences. Other
    expansion sequences are joined as such, delimiters included.
    """
    def unquote(node):
        if node[0] in ('', "'", '"', '\\'):
            children = node[1:-1]
        else:
            children = node
        return ''.join([unquote(child) if isinstance(child, list) else child
                        for child in children])

    return unquote(wtree)
| 418 | |||
| 419 | |||
class HereDocLexer:
    """Delimit the content of a here-document.

    The input starts right after the newline opening the here-document. The
    content runs up to the closing delimiter line, which is consumed but is
    not part of the returned content.
    """
    def __init__(self, op, delim):
        """op is the here-document operator, '<<' or '<<-'. delim is the
        closing delimiter; ShellSyntaxError is raised if it is empty.
        """
        assert op in ('<<', '<<-')
        if not delim:
            raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))

        self._op = op
        self._delim = delim
        # Input characters not processed yet
        self._buffer = []
        # Content lines and line endings collected so far
        self._token = []

    def add(self, data, eof):
        """If the here-document was delimited, return a tuple
        (content, remaining) and reset the lexer. Raise NeedMore() otherwise.
        """
        self._buffer += list(data)
        self._parse(eof)
        token = ''.join(self._token)
        remaining = ''.join(self._buffer)
        # The buffer itself must be cleared (a stray "_remaining" attribute
        # used to be set instead), otherwise the characters handed back as
        # "remaining" would be parsed again on the next call.
        self._token, self._buffer = [], []
        return token, remaining

    def _parse(self, eof):
        """Move complete lines from the buffer to the content until the
        closing delimiter line is met, or until the input is over when eof
        is set.
        """
        while 1:
            #Look for first unescaped newline. Quotes may be ignored
            escaped = False
            for i, c in enumerate(self._buffer):
                if escaped:
                    escaped = False
                elif c == '\\':
                    escaped = True
                elif c == '\n':
                    break
            else:
                # No unescaped newline in the buffer
                i = -1

            if i == -1:
                if not eof:
                    raise NeedMore()
                #No more data, maybe the last line is closing delimiter
                line = ''.join(self._buffer)
                eol = ''
                self._buffer[:] = []
            else:
                line = ''.join(self._buffer[:i])
                eol = self._buffer[i]
                self._buffer[:i+1] = []

            if self._op == '<<-':
                # This operator strips the leading tabs of every line,
                # delimiter line included
                line = line.lstrip('\t')

            if line == self._delim:
                break

            self._token += [line, eol]
            if i == -1:
                # Input is over and the delimiter was not found, return
                # what was collected
                break
| 480 | |||
class Token:
    """Token holding a type and a value, indexable like a (type, value)
    pair.
    """
    #TODO: check this is still in use
    OPERATOR = 'OPERATOR'
    WORD = 'WORD'

    def __init__(self):
        self.type = None
        self.value = ''

    def __getitem__(self, key):
        """Return the type for index 0 and the value for index 1. Raise
        IndexError for any other key.
        """
        if key == 0:
            item = self.type
        elif key == 1:
            item = self.value
        else:
            raise IndexError(key)
        return item
| 497 | |||
| 498 | |||
class HereDoc:
    """State of a here-document being lexed: its operator, its name (the
    closing delimiter) and the pending entries recorded for it.
    """
    def __init__(self, op, name=None):
        self.pendings = []
        self.name = name
        self.op = op
| 504 | |||
# Token type identifiers
TK_TOKEN = 'TOKEN'
TK_OP = 'OP'
TK_COMMENT = 'COMMENT'
TK_NEWLINE = 'NEWLINE'
TK_COMMA = 'COMMA'
TK_AMPERSAND = 'AMP'
TK_IONUMBER = 'IO_NUMBER'
TK_ASSIGNMENT = 'ASSIGNMENT_WORD'
TK_HERENAME = 'HERENAME'
| 514 | |||
class Lexer:
    """Main lexer.

    Feed the input with add(). Every delimited token is handed to on_token()
    as a (value, type) tuple; on_token() raises NotImplementedError here and
    is meant to be overridden.
    """
    # Here-document handling makes the whole thing more complex because they basically
    # force tokens to be reordered: here-content must come right after the operator
    # and the here-document name, while some other tokens might be following the
    # here-document expression on the same line.
    #
    # So, here-doc states are basically:
    #   *self._state==ST_NORMAL
    #       - self._heredoc.op is None: no here-document
    #       - self._heredoc.op is not None but name is: here-document operator matched,
    #           waiting for the document name/delimiter
    #       - self._heredoc.op and name are not None: here-document is ready, following
    #           tokens are being stored and will be pushed again when the document is
    #           completely parsed.
    #   *self._state==ST_HEREDOC
    #       - The here-document is being delimited by self._herelexer. Once it is done
    #           the content is pushed in front of the pending token list then all these
    #           tokens are pushed once again.
    ST_NORMAL = 'ST_NORMAL'
    ST_OP = 'ST_OP'
    ST_BACKSLASH = 'ST_BACKSLASH'
    ST_QUOTED = 'ST_QUOTED'
    ST_COMMENT = 'ST_COMMENT'
    ST_HEREDOC = 'ST_HEREDOC'

    #Match end of backquote strings
    RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')

    def __init__(self, parent_state = None):
        # Input characters and position of the next one to process
        self._input = []
        self._pos = 0

        # Token being delimited and its type
        self._token = ''
        self._type = TK_TOKEN

        self._state = self.ST_NORMAL
        self._parent_state = parent_state
        self._wordlexer = None

        self._heredoc = HereDoc(None)
        self._herelexer = None

        ### Following attributes are not used for delimiting token and can safely
        ### be changed after here-document detection (see _push_token)

        # Count the number of tokens following a 'For' reserved word. Needed to
        # return an 'In' reserved word if it comes in third place.
        self._for_count = None

    def add(self, data, eof=False):
        """Feed the lexer with data.

        When eof is set to True, returns unconsumed data or raise if the lexer
        is in the middle of a delimiting operation.
        Raise NeedMore otherwise.
        """
        self._input += list(data)
        self._parse(eof)
        self._input[:self._pos] = []
        return ''.join(self._input)

    def _parse(self, eof):
        """Run the state machine over the buffered input.

        Raise NeedMore when the input is exhausted and eof is not set, and
        ShellSyntaxError if a here-document is still open once done.
        """
        while self._state:
            if self._pos >= len(self._input):
                if not eof:
                    raise NeedMore()
                elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
                    #Delimit the current token and leave cleanly
                    self._push_token('')
                    break
                else:
                    #Let the sublexer handle the eof themselves
                    pass

            if self._state == self.ST_NORMAL:
                self._parse_normal()
            elif self._state == self.ST_COMMENT:
                self._parse_comment()
            elif self._state == self.ST_OP:
                self._parse_op(eof)
            elif self._state == self.ST_QUOTED:
                self._parse_quoted(eof)
            elif self._state == self.ST_HEREDOC:
                self._parse_heredoc(eof)
            else:
                assert False, "Unknown state " + str(self._state)

        if self._heredoc.op is not None:
            raise ShellSyntaxError('missing here-document delimiter')

    def _parse_normal(self):
        """Process one character outside of any operator, quoted sequence,
        comment or here-document.
        """
        c = self._input[self._pos]
        if c == '\n':
            # Delimit the current token then emit the newline itself
            self._push_token(c)
            self._token = c
            self._type = TK_NEWLINE
            self._push_token('')
            self._pos += 1
        elif c in ('\\', '\'', '"', '`', '$'):
            # Not consumed: the word lexer needs the opening character
            self._state = self.ST_QUOTED
        elif is_partial_op(c):
            self._push_token(c)

            self._type = TK_OP
            self._token += c
            self._pos += 1
            self._state = self.ST_OP
        elif is_blank(c):
            self._push_token(c)

            #Discard blanks
            self._pos += 1
        elif self._token:
            self._token += c
            self._pos += 1
        elif c == '#':
            # A '#' starts a comment only at the beginning of a token
            self._state = self.ST_COMMENT
            self._type = TK_COMMENT
            self._pos += 1
        else:
            self._pos += 1
            self._token += c

    def _parse_op(self, eof):
        """Extend the current operator as long as the result is still the
        beginning of an operator, then delimit it.
        """
        assert self._token

        while 1:
            if self._pos >= len(self._input):
                if not eof:
                    raise NeedMore()
                c = ''
            else:
                c = self._input[self._pos]

            op = self._token + c
            if c and is_partial_op(op):
                #Still parsing an operator
                self._token = op
                self._pos += 1
            else:
                #End of operator
                self._push_token(c)
                self._state = self.ST_NORMAL
                break

    def _parse_comment(self):
        """Accumulate comment characters up to the end of the line.

        The newline is not consumed. When the input runs out first, the
        state is left unchanged and _parse() decides what to do: it raises
        NeedMore if more data may come, or delimits (and discards) the
        comment at eof. NeedMore used to be raised here unconditionally,
        which made scripts ending with a comment and no newline fail.
        """
        while self._pos < len(self._input):
            c = self._input[self._pos]
            if c == '\n':
                #End of comment, do not consume the end of line
                self._state = self.ST_NORMAL
                break
            self._token += c
            self._pos += 1

    def _parse_quoted(self, eof):
        """Precondition: the starting backquote/dollar is still in the input queue."""
        if not self._wordlexer:
            self._wordlexer = WordLexer()

        # Nothing to transfer when called again at eof with an exhausted
        # input; an empty chunk avoids reading an unbound local
        chunk = []
        if self._pos < len(self._input):
            #Transfer input queue character into the subparser
            chunk = self._input[self._pos:]
            self._pos += len(chunk)

        wtree, remaining = self._wordlexer.add(chunk, eof)
        self._wordlexer = None
        self._token += wordtree_as_string(wtree)

        #Put unparsed character back in the input queue
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

    def _parse_heredoc(self, eof):
        """Delimit the here-document content, then push it followed by the
        tokens which were put on hold after the here-document name.
        """
        assert not self._token

        if self._herelexer is None:
            self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)

        # Nothing to transfer when the input ends right after the line
        # opening the here-document; an empty chunk avoids reading an
        # unbound local
        chunk = []
        if self._pos < len(self._input):
            #Transfer input queue character into the subparser
            chunk = self._input[self._pos:]
            self._pos += len(chunk)

        self._token, remaining = self._herelexer.add(chunk, eof)

        #Reset here-document state
        self._herelexer = None
        heredoc, self._heredoc = self._heredoc, HereDoc(None)
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

        #Push pending tokens, the here-document content coming first
        heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
        for value, kind, delim in heredoc.pendings:
            self._token = value
            self._type = kind
            self._push_token(delim)

    def _push_token(self, delim):
        """Classify the current token and hand it to on_token().

        delim is the character which delimited the token, or an empty
        string. Return 1 if a token was emitted or put on hold for a
        here-document, 0 if there was nothing to emit.
        """
        if not self._token:
            return 0

        if self._heredoc.op is not None:
            if self._heredoc.name is None:
                #Here-document name
                if self._type != TK_TOKEN:
                    raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
                self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
                self._type = TK_HERENAME
            else:
                #Capture all tokens until the newline starting the here-document
                if self._type == TK_NEWLINE:
                    assert self._state == self.ST_NORMAL
                    self._state = self.ST_HEREDOC

                self._heredoc.pendings.append((self._token, self._type, delim))
                self._token = ''
                self._type = TK_TOKEN
                return 1

        # BEWARE: do not change parser state from here to the end of the function:
        # when parsing between an here-document operator to the end of the line
        # tokens are stored in self._heredoc.pendings. Therefore, they will not
        # reach the section below.

        #Check operators
        if self._type == TK_OP:
            #False positive because of partial op matching
            op = is_op(self._token)
            if not op:
                self._type = TK_TOKEN
            else:
                #Map to the specific operator
                self._type = op
                if self._token in ('<<', '<<-'):
                    #Done here rather than in _parse_op because there is no need
                    #to change the parser state since we are still waiting for
                    #the here-document name
                    if self._heredoc.op is not None:
                        raise ShellSyntaxError("syntax error near token '%s'" % self._token)
                    assert self._heredoc.op is None
                    self._heredoc.op = self._token

        if self._type == TK_TOKEN:
            if '=' in self._token and not delim:
                if self._token.startswith('='):
                    #Token is a WORD... a TOKEN that is.
                    pass
                else:
                    prev = self._token[:self._token.find('=')]
                    if is_name(prev):
                        self._type = TK_ASSIGNMENT
                    else:
                        #Just a token (unspecified)
                        pass
            else:
                reserved = get_reserved(self._token)
                if reserved is not None:
                    if reserved == 'In' and self._for_count != 2:
                        #Sorry, not a reserved word after all
                        pass
                    else:
                        self._type = reserved
                        if reserved in ('For', 'Case'):
                            self._for_count = 0
                elif are_digits(self._token) and delim in ('<', '>'):
                    #Detect IO_NUMBER
                    self._type = TK_IONUMBER
        elif self._type == TK_COMMENT:
            #Comments are not part of sh grammar, ignore them. This must be
            #tested before looking at the token text, otherwise the comments
            #"#;" and "#&" would be emitted as COMMA and AMP tokens.
            self._token = ''
            self._type = TK_TOKEN
            return 0
        elif self._token == ';':
            self._type = TK_COMMA
        elif self._token == '&':
            self._type = TK_AMPERSAND

        if self._for_count is not None:
            #Track token count in 'For' expression to detect 'In' reserved words.
            #Can only be in third position, no need to go beyond
            self._for_count += 1
            if self._for_count == 3:
                self._for_count = None

        self.on_token((self._token, self._type))
        self._token = ''
        self._type = TK_TOKEN
        return 1

    def on_token(self, token):
        """Receive a delimited (value, type) token. To be overridden."""
        raise NotImplementedError
| 818 | |||
| 819 | |||
# Every token type the lexer may emit, as expected by PLY.
# TK_COMMENT is left out on purpose to silence yacc unused token warnings.
tokens = [
    TK_TOKEN,
    TK_NEWLINE,
    TK_IONUMBER,
    TK_ASSIGNMENT,
    TK_HERENAME,
]

#Add specific operators
tokens.extend(_OPERATORS.values())
#Add reserved words
tokens.extend(_RESERVEDS.values())
| 834 | |||
class PLYLexer(Lexer):
    """Adapter exposing the tokens produced by Lexer through the PLY lexer
    interface.
    """
    def __init__(self):
        Lexer.__init__(self)
        # Tokens collected so far and index of the next one to hand out
        self._tokens = []
        self._current = 0
        self.lineno = 0

    def on_token(self, token):
        """Wrap the (value, type) token into a lex.LexToken and store it.

        Positions are not tracked: lineno and lexpos are always set to 0.
        """
        value, kind = token

        self.lineno = 0
        wrapped = lex.LexToken()
        wrapped.value = value
        wrapped.type = kind
        wrapped.lexer = self
        wrapped.lexpos = 0
        wrapped.lineno = 0

        self._tokens.append(wrapped)

    def is_empty(self):
        """Return True when no token has been collected."""
        return not self._tokens

    #PLY compliant interface
    def token(self):
        """Return the next collected token, or None when there is none left."""
        if self._current < len(self._tokens):
            following = self._tokens[self._current]
            self._current += 1
            return following
        return None
| 866 | |||
| 867 | |||
def get_tokens(s):
    """Lex the whole input string at once.

    Return a tuple (tokens, unprocessed) where tokens is the list of
    (value, type) pairs of the parsed tokens and unprocessed is the part of
    the input string left untouched by the lexer.
    """
    lexer = PLYLexer()
    untouched = lexer.add(s, True)

    pairs = []
    current = lexer.token()
    while current is not None:
        pairs.append((current.value, current.type))
        current = lexer.token()
    return pairs, untouched
