Diffstat (limited to 'bitbake/lib/pysh/pyshlex.py')
-rw-r--r--  bitbake/lib/pysh/pyshlex.py  888
1 file changed, 888 insertions, 0 deletions
diff --git a/bitbake/lib/pysh/pyshlex.py b/bitbake/lib/pysh/pyshlex.py
new file mode 100644
index 0000000000..b977b5e869
--- /dev/null
+++ b/bitbake/lib/pysh/pyshlex.py
@@ -0,0 +1,888 @@
# pyshlex.py - PLY compatible lexer for pysh.
#
# Copyright 2007 Patrick Mezard
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

# TODO:
# - review all "char in 'abc'" snippets: the empty string can be matched
# - test line continuations within quoted/expansion strings
# - eof is buggy wrt sublexers
# - the lexer cannot really work in pull mode as it would be required to run
# PLY in pull mode. It was designed to work incrementally and it would not be
# that hard to enable pull mode.
import re
try:
    s = set()
    del s
except NameError:
    from sets import Set as set

from ply import lex
from sherrors import *

class NeedMore(Exception):
    pass

def is_blank(c):
    return c in (' ', '\t')

_RE_DIGITS = re.compile(r'^\d+$')

def are_digits(s):
    return _RE_DIGITS.search(s) is not None

_OPERATORS = dict([
    ('&&', 'AND_IF'),
    ('||', 'OR_IF'),
    (';;', 'DSEMI'),
    ('<<', 'DLESS'),
    ('>>', 'DGREAT'),
    ('<&', 'LESSAND'),
    ('>&', 'GREATAND'),
    ('<>', 'LESSGREAT'),
    ('<<-', 'DLESSDASH'),
    ('>|', 'CLOBBER'),
    ('&', 'AMP'),
    (';', 'COMMA'),
    ('<', 'LESS'),
    ('>', 'GREATER'),
    ('(', 'LPARENS'),
    (')', 'RPARENS'),
])

#Make a function to silence pychecker "Local variable shadows global"
def make_partial_ops():
    partials = {}
    for k in _OPERATORS:
        for i in range(1, len(k)+1):
            partials[k[:i]] = None
    return partials

_PARTIAL_OPERATORS = make_partial_ops()

def is_partial_op(s):
    """Return True if s matches a non-empty subpart of an operator starting
    at its first character.
    """
    return s in _PARTIAL_OPERATORS

def is_op(s):
    """If s matches an operator, return the operator identifier. Return None
    otherwise.
    """
    return _OPERATORS.get(s)
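
# A sketch of the intent (hypothetical calls):
#   is_op('&&')          == 'AND_IF'
#   is_op('<<<')         is None
#   is_partial_op('<<')  == True    ('<<' starts both '<<' and '<<-')
#   is_partial_op('<y')  == False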

_RESERVEDS = dict([
    ('if', 'If'),
    ('then', 'Then'),
    ('else', 'Else'),
    ('elif', 'Elif'),
    ('fi', 'Fi'),
    ('do', 'Do'),
    ('done', 'Done'),
    ('case', 'Case'),
    ('esac', 'Esac'),
    ('while', 'While'),
    ('until', 'Until'),
    ('for', 'For'),
    ('{', 'Lbrace'),
    ('}', 'Rbrace'),
    ('!', 'Bang'),
    ('in', 'In'),
    ('|', 'PIPE'),
])

def get_reserved(s):
    return _RESERVEDS.get(s)

_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+$')

def is_name(s):
    return _RE_NAME.search(s) is not None

def find_chars(seq, chars):
    for i,v in enumerate(seq):
        if v in chars:
            return i,v
    return -1, None
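
# For example (a sketch): return the first matching character and its index,
# or (-1, None) when nothing matches.
#   find_chars('ab$c', '$`') == (2, '$')
#   find_chars('abc', '$`')  == (-1, None)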

class WordLexer:
    """WordLexer parses quoted or expansion expressions and returns an
    expression tree. The input string can be any well-formed sequence
    beginning with a quoting or expansion character. Embedded expressions are
    handled recursively. The resulting tree is made of lists and strings.
    Lists represent quoted or expansion expressions. Each list's first element
    is the opening separator, its last one the closing separator. In between
    can be any number of strings or lists for sub-expressions. Non
    quoted/expansion expressions can be written as strings or as lists with
    empty strings as starting and ending delimiters.
    """

    NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))

    SPECIAL_CHARSET = '@*#?-$!0'

    #Characters which can be escaped depend on the current delimiters
    ESCAPABLE = {
        '`': set(['$', '\\', '`']),
        '"': set(['$', '\\', '`', '"']),
        "'": set(),
    }

    def __init__(self, heredoc = False):
        # _buffer is the unprocessed input characters buffer
        self._buffer = []
        # _stack is empty or contains a quoted list being processed
        # (this is the DFS path to the quoted expression being evaluated).
        self._stack = []
        self._escapable = None
        # True when parsing unquoted here documents
        self._heredoc = heredoc

    def add(self, data, eof=False):
        """Feed the lexer with more data. If the quoted expression can be
        delimited, return a tuple (expr, remaining) containing the expression
        tree and the unconsumed data.
        Otherwise, raise NeedMore.
        """
        self._buffer += list(data)
        self._parse(eof)

        result = self._stack[0]
        remaining = ''.join(self._buffer)
        self._stack = []
        self._buffer = []
        return result, remaining

    def _is_escapable(self, c, delim=None):
        if delim is None:
            if self._heredoc:
                # Backslashes work as if they were double quoted in unquoted
                # here-documents
                delim = '"'
            else:
                if len(self._stack)<=1:
                    return True
                delim = self._stack[-2][0]

        escapables = self.ESCAPABLE.get(delim, None)
        return escapables is None or c in escapables

    def _parse_squote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        try:
            pos = buf.index("'")
        except ValueError:
            raise NeedMore()
        result[-1] += ''.join(buf[:pos])
        result += ["'"]
        return pos+1, True

    def _parse_bquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        if buf[0]=='\n':
            #Remove line continuations
            result[:] = ['', '', '']
        elif self._is_escapable(buf[0]):
            result[-1] += buf[0]
            result += ['']
        else:
            #Keep as such
            result[:] = ['', '\\'+buf[0], '']

        return 1, True

    def _parse_dquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        pos, sep = find_chars(buf, '$\\`"')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='"':
            result += ['"']
            return pos+1, True
        else:
            #Keep everything until the separator and defer processing
            return pos, False

    def _parse_command(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        chars = '$\\`"\''
        if result[0] == '$(':
            chars += ')'
        pos, sep = find_chars(buf, chars)
        if pos == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_parameter(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        pos, sep = find_chars(buf, '$\\`"\'}')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='}':
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_dollar(self, buf, result, eof):
        sep = result[0]
        if sep=='$':
            if not buf:
                #TODO: handle empty $
                raise NeedMore()
            if buf[0]=='(':
                if len(buf)==1:
                    raise NeedMore()

                if buf[1]=='(':
                    result[0] = '$(('
                    buf[:2] = []
                else:
                    result[0] = '$('
                    buf[:1] = []

            elif buf[0]=='{':
                result[0] = '${'
                buf[:1] = []
            else:
                if buf[0] in self.SPECIAL_CHARSET:
                    result[-1] = buf[0]
                    read = 1
                else:
                    for read,c in enumerate(buf):
                        if c not in self.NAME_CHARSET:
                            break
                    else:
                        if not eof:
                            raise NeedMore()
                        read += 1

                result[-1] += ''.join(buf[0:read])

                if not result[-1]:
                    result[:] = ['', result[0], '']
                else:
                    result += ['']
                return read, True

        sep = result[0]
        if sep=='$(':
            parsefunc = self._parse_command
        elif sep=='${':
            parsefunc = self._parse_parameter
        else:
            raise NotImplementedError()

        pos, closed = parsefunc(buf, result, eof)
        return pos, closed

    def _parse(self, eof):
        buf = self._buffer
        stack = self._stack
        recurse = False

        while 1:
            if not stack or recurse:
                if not buf:
                    raise NeedMore()
                if buf[0] not in ('"\\`$\''):
                    raise ShellSyntaxError('Invalid quoted string sequence')
                stack.append([buf[0], ''])
                buf[:1] = []
                recurse = False

            result = stack[-1]
            if result[0]=="'":
                parsefunc = self._parse_squote
            elif result[0]=='\\':
                parsefunc = self._parse_bquote
            elif result[0]=='"':
                parsefunc = self._parse_dquote
            elif result[0]=='`':
                parsefunc = self._parse_command
            elif result[0][0]=='$':
                parsefunc = self._parse_dollar
            else:
                raise NotImplementedError()

            read, closed = parsefunc(buf, result, eof)

            buf[:read] = []
            if closed:
                if len(stack)>1:
                    #Merge in parent expression
                    parsed = stack.pop()
                    stack[-1] += [parsed]
                    stack[-1] += ['']
                else:
                    break
            else:
                recurse = True

def normalize_wordtree(wtree):
    """Fold back every literal sequence (delimited with empty strings) into
    the parent sequence.
    """
    def normalize(wtree):
        result = []
        for part in wtree[1:-1]:
            if isinstance(part, list):
                part = normalize(part)
                if part[0]=='':
                    #Move the part content back at current level
                    result += part[1:-1]
                    continue
            elif not part:
                #Remove empty strings
                continue
            result.append(part)
        if not result:
            result = ['']
        return [wtree[0]] + result + [wtree[-1]]

    return normalize(wtree)
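
# A sketch of the folding (hypothetical input):
#   normalize_wordtree(['', 'a', ['', 'b', ''], 'c', ''])
#   == ['', 'a', 'b', 'c', '']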


def make_wordtree(token, here_document=False):
    """Parse a delimited token and return a tree similar to the ones returned
    by WordLexer. token may contain any combination of expansion/quoted fields
    and plain ones.
    """
    tree = ['']
    remaining = token
    delimiters = '\\$`'
    if not here_document:
        delimiters += '\'"'

    while 1:
        pos, sep = find_chars(remaining, delimiters)
        if pos==-1:
            tree += [remaining, '']
            return normalize_wordtree(tree)
        tree.append(remaining[:pos])
        remaining = remaining[pos:]

        try:
            result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
        except NeedMore:
            raise ShellSyntaxError('Invalid token "%s"' % token)
        tree.append(result)


def wordtree_as_string(wtree):
    """Rewrite an expression tree generated by make_wordtree as a string."""
    def visit(node, output):
        for child in node:
            if isinstance(child, list):
                visit(child, output)
            else:
                output.append(child)

    output = []
    visit(wtree, output)
    return ''.join(output)


def unquote_wordtree(wtree):
    """Fold the word tree while removing quotes everywhere. Other expansion
    sequences are joined as such.
    """
    def unquote(wtree):
        unquoted = []
        if wtree[0] in ('', "'", '"', '\\'):
            wtree = wtree[1:-1]

        for part in wtree:
            if isinstance(part, list):
                part = unquote(part)
            unquoted.append(part)
        return ''.join(unquoted)

    return unquote(wtree)
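
# A small end-to-end sketch (hypothetical input): quoting becomes a sub-list
# in the tree and is dropped again when unquoting.
#   make_wordtree('foo"bar"') == ['', 'foo', ['"', 'bar', '"'], '']
#   unquote_wordtree(make_wordtree('foo"bar"')) == 'foobar'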


class HereDocLexer:
    """HereDocLexer delimits the here-document content, from the starting
    newline (not included) to the closing delimiter line (included).
    """
    def __init__(self, op, delim):
        assert op in ('<<', '<<-')
        if not delim:
            raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))

        self._op = op
        self._delim = delim
        self._buffer = []
        self._token = []

    def add(self, data, eof):
        """If the here-document was delimited, return a tuple (content, remaining).
        Raise NeedMore() otherwise.
        """
        self._buffer += list(data)
        self._parse(eof)
        token = ''.join(self._token)
        remaining = ''.join(self._buffer)
        self._token, self._buffer = [], []
        return token, remaining

    def _parse(self, eof):
        while 1:
            #Look for first unescaped newline. Quotes may be ignored
            escaped = False
            for i,c in enumerate(self._buffer):
                if escaped:
                    escaped = False
                elif c=='\\':
                    escaped = True
                elif c=='\n':
                    break
            else:
                i = -1

            if i==-1 or self._buffer[i]!='\n':
                if not eof:
                    raise NeedMore()
                #No more data, maybe the last line is closing delimiter
                line = ''.join(self._buffer)
                eol = ''
                self._buffer[:] = []
            else:
                line = ''.join(self._buffer[:i])
                eol = self._buffer[i]
                self._buffer[:i+1] = []

            if self._op=='<<-':
                line = line.lstrip('\t')

            if line==self._delim:
                break

            self._token += [line, eol]
            if i==-1:
                break
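
# A sketch of the expected behaviour: the closing delimiter line is consumed
# but not returned, and '<<-' strips leading tabs.
#   HereDocLexer('<<', 'EOF').add('hello\nEOF\n', True)      == ('hello\n', '')
#   HereDocLexer('<<-', 'EOF').add('\thello\n\tEOF\n', True) == ('hello\n', '')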

class Token:
    #TODO: check this is still in use
    OPERATOR = 'OPERATOR'
    WORD = 'WORD'

    def __init__(self):
        self.value = ''
        self.type = None

    def __getitem__(self, key):
        #Behave like a two-element tuple
        if key==0:
            return self.type
        if key==1:
            return self.value
        raise IndexError(key)


class HereDoc:
    def __init__(self, op, name=None):
        self.op = op
        self.name = name
        self.pendings = []

TK_COMMA = 'COMMA'
TK_AMPERSAND = 'AMP'
TK_OP = 'OP'
TK_TOKEN = 'TOKEN'
TK_COMMENT = 'COMMENT'
TK_NEWLINE = 'NEWLINE'
TK_IONUMBER = 'IO_NUMBER'
TK_ASSIGNMENT = 'ASSIGNMENT_WORD'
TK_HERENAME = 'HERENAME'

class Lexer:
    """Main lexer.

    Call add() until the script AST is returned.
    """
    # Here-document handling makes the whole thing more complex because
    # here-documents basically force tokens to be reordered: here-content
    # must come right after the operator and the here-document name, while
    # other tokens might follow the here-document expression on the same line.
    #
    # So, here-doc states are basically:
    # *self._state==ST_NORMAL
    # - self._heredoc.op is None: no here-document
    # - self._heredoc.op is not None but name is: here-document operator matched,
    # waiting for the document name/delimiter
    # - self._heredoc.op and name are not None: here-document is ready, following
    # tokens are being stored and will be pushed again when the document is
    # completely parsed.
    # *self._state==ST_HEREDOC
    # - The here-document is being delimited by self._herelexer. Once it is done
    # the content is pushed in front of the pending token list then all these
    # tokens are pushed once again.
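    #
    # For instance (a sketch), in "cat <<EOF\nfoo\nEOF\n" everything delimited
    # between the '<<' operator and the end of its line is buffered in
    # self._heredoc.pendings and only pushed to on_token() once the body
    # "foo\n" has been delimited.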
    ST_NORMAL = 'ST_NORMAL'
    ST_OP = 'ST_OP'
    ST_BACKSLASH = 'ST_BACKSLASH'
    ST_QUOTED = 'ST_QUOTED'
    ST_COMMENT = 'ST_COMMENT'
    ST_HEREDOC = 'ST_HEREDOC'

    #Match end of backquote strings
    RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')

    def __init__(self, parent_state = None):
        self._input = []
        self._pos = 0

        self._token = ''
        self._type = TK_TOKEN

        self._state = self.ST_NORMAL
        self._parent_state = parent_state
        self._wordlexer = None

        self._heredoc = HereDoc(None)
        self._herelexer = None

        ### Following attributes are not used for delimiting tokens and can
        ### safely be changed after here-document detection (see _push_token)

        # Count the number of tokens following a 'For' reserved word. Needed to
        # return an 'In' reserved word if it comes in third place.
        self._for_count = None

    def add(self, data, eof=False):
        """Feed the lexer with data.

        When eof is set to True, return unconsumed data or raise if the lexer
        is in the middle of a delimiting operation.
        Raise NeedMore otherwise.
        """
        self._input += list(data)
        self._parse(eof)
        self._input[:self._pos] = []
        return ''.join(self._input)

    def _parse(self, eof):
        while self._state:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
                    #Delimit the current token and leave cleanly
                    self._push_token('')
                    break
                else:
                    #Let the sublexer handle the eof itself
                    pass

            if self._state==self.ST_NORMAL:
                self._parse_normal()
            elif self._state==self.ST_COMMENT:
                self._parse_comment()
            elif self._state==self.ST_OP:
                self._parse_op(eof)
            elif self._state==self.ST_QUOTED:
                self._parse_quoted(eof)
            elif self._state==self.ST_HEREDOC:
                self._parse_heredoc(eof)
            else:
                assert False, "Unknown state " + str(self._state)

        if self._heredoc.op is not None:
            raise ShellSyntaxError('missing here-document delimiter')

    def _parse_normal(self):
        c = self._input[self._pos]
        if c=='\n':
            self._push_token(c)
            self._token = c
            self._type = TK_NEWLINE
            self._push_token('')
            self._pos += 1
        elif c in ('\\', '\'', '"', '`', '$'):
            self._state = self.ST_QUOTED
        elif is_partial_op(c):
            self._push_token(c)

            self._type = TK_OP
            self._token += c
            self._pos += 1
            self._state = self.ST_OP
        elif is_blank(c):
            self._push_token(c)

            #Discard blanks
            self._pos += 1
        elif self._token:
            self._token += c
            self._pos += 1
        elif c=='#':
            self._state = self.ST_COMMENT
            self._type = TK_COMMENT
            self._pos += 1
        else:
            self._pos += 1
            self._token += c

    def _parse_op(self, eof):
        assert self._token

        while 1:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                c = ''
            else:
                c = self._input[self._pos]

            op = self._token + c
            if c and is_partial_op(op):
                #Still parsing an operator
                self._token = op
                self._pos += 1
            else:
                #End of operator
                self._push_token(c)
                self._state = self.ST_NORMAL
                break

    def _parse_comment(self):
        while 1:
            if self._pos>=len(self._input):
                raise NeedMore()

            c = self._input[self._pos]
            if c=='\n':
                #End of comment, do not consume the end of line
                self._state = self.ST_NORMAL
                break
            else:
                self._token += c
                self._pos += 1

    def _parse_quoted(self, eof):
        """Precondition: the starting backquote/dollar is still in the input queue."""
        if not self._wordlexer:
            self._wordlexer = WordLexer()

        if self._pos<len(self._input):
            #Transfer input queue characters into the subparser
            input = self._input[self._pos:]
            self._pos += len(input)

        wtree, remaining = self._wordlexer.add(input, eof)
        self._wordlexer = None
        self._token += wordtree_as_string(wtree)

        #Put unparsed characters back in the input queue
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

    def _parse_heredoc(self, eof):
        assert not self._token

        if self._herelexer is None:
            self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)

        if self._pos<len(self._input):
            #Transfer input queue characters into the subparser
            input = self._input[self._pos:]
            self._pos += len(input)

        self._token, remaining = self._herelexer.add(input, eof)

        #Reset here-document state
        self._herelexer = None
        heredoc, self._heredoc = self._heredoc, HereDoc(None)
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

        #Push pending tokens
        heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
        for token, type, delim in heredoc.pendings:
            self._token = token
            self._type = type
            self._push_token(delim)

    def _push_token(self, delim):
        if not self._token:
            return 0

        if self._heredoc.op is not None:
            if self._heredoc.name is None:
                #Here-document name
                if self._type!=TK_TOKEN:
                    raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
                self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
                self._type = TK_HERENAME
            else:
                #Capture all tokens until the newline starting the here-document
                if self._type==TK_NEWLINE:
                    assert self._state==self.ST_NORMAL
                    self._state = self.ST_HEREDOC

            self._heredoc.pendings.append((self._token, self._type, delim))
            self._token = ''
            self._type = TK_TOKEN
            return 1

        # BEWARE: do not change parser state from here to the end of the function:
        # when parsing between a here-document operator and the end of the line,
        # tokens are stored in self._heredoc.pendings. Therefore, they will not
        # reach the section below.

        #Check operators
        if self._type==TK_OP:
            #False positive because of partial op matching
            op = is_op(self._token)
            if not op:
                self._type = TK_TOKEN
            else:
                #Map to the specific operator
                self._type = op
                if self._token in ('<<', '<<-'):
                    #Done here rather than in _parse_op because there is no need
                    #to change the parser state since we are still waiting for
                    #the here-document name
                    if self._heredoc.op is not None:
                        raise ShellSyntaxError("syntax error near token '%s'" % self._token)
                    assert self._heredoc.op is None
                    self._heredoc.op = self._token

        if self._type==TK_TOKEN:
            if '=' in self._token and not delim:
                if self._token.startswith('='):
                    #Token is a WORD... a TOKEN that is.
                    pass
                else:
                    prev = self._token[:self._token.find('=')]
                    if is_name(prev):
                        self._type = TK_ASSIGNMENT
                    else:
                        #Just a token (unspecified)
                        pass
            else:
                reserved = get_reserved(self._token)
                if reserved is not None:
                    if reserved=='In' and self._for_count!=2:
                        #Sorry, not a reserved word after all
                        pass
                    else:
                        self._type = reserved
                        if reserved in ('For', 'Case'):
                            self._for_count = 0
                elif are_digits(self._token) and delim in ('<', '>'):
                    #Detect IO_NUMBER
                    self._type = TK_IONUMBER
                elif self._token==';':
                    self._type = TK_COMMA
                elif self._token=='&':
                    self._type = TK_AMPERSAND
        elif self._type==TK_COMMENT:
            #Comments are not part of sh grammar, ignore them
            self._token = ''
            self._type = TK_TOKEN
            return 0

        if self._for_count is not None:
            #Track token count in 'For' expression to detect 'In' reserved words.
            #Can only be in third position, no need to go beyond
            self._for_count += 1
            if self._for_count==3:
                self._for_count = None

        self.on_token((self._token, self._type))
        self._token = ''
        self._type = TK_TOKEN
        return 1

    def on_token(self, token):
        raise NotImplementedError


tokens = [
    TK_TOKEN,
    # To silence yacc unused token warnings
    # TK_COMMENT,
    TK_NEWLINE,
    TK_IONUMBER,
    TK_ASSIGNMENT,
    TK_HERENAME,
]

#Add specific operators
tokens += _OPERATORS.values()
#Add reserved words
tokens += _RESERVEDS.values()

class PLYLexer(Lexer):
    """Bridge Lexer and PLY lexer interface."""
    def __init__(self):
        Lexer.__init__(self)
        self._tokens = []
        self._current = 0
        self.lineno = 0

    def on_token(self, token):
        value, type = token

        self.lineno = 0
        t = lex.LexToken()
        t.value = value
        t.type = type
        t.lexer = self
        t.lexpos = 0
        t.lineno = 0

        self._tokens.append(t)

    def is_empty(self):
        return not bool(self._tokens)

    #PLY compliant interface
    def token(self):
        if self._current>=len(self._tokens):
            return None
        t = self._tokens[self._current]
        self._current += 1
        return t


def get_tokens(s):
    """Parse the input string and return a tuple (tokens, unprocessed) where
    tokens is a list of parsed tokens and unprocessed is the part of the input
    string left untouched by the lexer.
    """
    lexer = PLYLexer()
    untouched = lexer.add(s, True)
    tokens = []
    while 1:
        token = lexer.token()
        if token is None:
            break
        tokens.append(token)

    tokens = [(t.value, t.type) for t in tokens]
    return tokens, untouched
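
# Example usage (a sketch): reserved words and newlines get their own token
# types, plain words are TOKENs.
#   get_tokens('echo hello\n')
#   -> ([('echo', 'TOKEN'), ('hello', 'TOKEN'), ('\n', 'NEWLINE')], '')
#   get_tokens('if true') -> ([('if', 'If'), ('true', 'TOKEN')], '')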