import ast import codegen import logging import os.path import bb.utils, bb.data from itertools import chain from pysh import pyshyacc, pyshlex, sherrors logger = logging.getLogger('BitBake.CodeParser') PARSERCACHE_VERSION = 2 try: import cPickle as pickle except ImportError: import pickle logger.info('Importing cPickle failed. Falling back to a very slow implementation.') def check_indent(codestr): """If the code is indented, add a top level piece of code to 'remove' the indentation""" i = 0 while codestr[i] in ["\n", "\t", " "]: i = i + 1 if i == 0: return codestr if codestr[i-1] == "\t" or codestr[i-1] == " ": return "if 1:\n" + codestr return codestr pythonparsecache = {} shellparsecache = {} pythonparsecacheextras = {} shellparsecacheextras = {} def parser_cachefile(d): cachedir = (d.getVar("PERSISTENT_DIR", True) or d.getVar("CACHE", True)) if cachedir in [None, '']: return None bb.utils.mkdirhier(cachedir) cachefile = os.path.join(cachedir, "bb_codeparser.dat") logger.debug(1, "Using cache in '%s' for codeparser cache", cachefile) return cachefile def parser_cache_init(d): global pythonparsecache global shellparsecache cachefile = parser_cachefile(d) if not cachefile: return try: p = pickle.Unpickler(file(cachefile, "rb")) data, version = p.load() except: return if version != PARSERCACHE_VERSION: return pythonparsecache = data[0] shellparsecache = data[1] def parser_cache_save(d): cachefile = parser_cachefile(d) if not cachefile: return glf = bb.utils.lockfile(cachefile + ".lock", shared=True) i = os.getpid() lf = None while not lf: shellcache = {} pythoncache = {} lf = bb.utils.lockfile(cachefile + ".lock." + str(i), retry=False) if not lf or os.path.exists(cachefile + "-" + str(i)): if lf: bb.utils.unlockfile(lf) lf = None i = i + 1 continue shellcache = shellparsecacheextras pythoncache = pythonparsecacheextras p = pickle.Pickler(file(cachefile + "-" + str(i), "wb"), -1) p.dump([[pythoncache, shellcache], PARSERCACHE_VERSION]) bb.utils.unlockfile(lf) bb.utils.unlockfile(glf) def internSet(items): new = set() for i in items: new.add(intern(i)) return new def parser_cache_savemerge(d): cachefile = parser_cachefile(d) if not cachefile: return glf = bb.utils.lockfile(cachefile + ".lock") try: p = pickle.Unpickler(file(cachefile, "rb")) data, version = p.load() except (IOError, EOFError): data, version = None, None if version != PARSERCACHE_VERSION: data = [{}, {}] for f in [y for y in os.listdir(os.path.dirname(cachefile)) if y.startswith(os.path.basename(cachefile) + '-')]: f = os.path.join(os.path.dirname(cachefile), f) try: p = pickle.Unpickler(file(f, "rb")) extradata, version = p.load() except (IOError, EOFError): extradata, version = [{}, {}], None if version != PARSERCACHE_VERSION: continue for h in extradata[0]: if h not in data[0]: data[0][h] = extradata[0][h] for h in extradata[1]: if h not in data[1]: data[1][h] = extradata[1][h] os.unlink(f) # When the dicts are originally created, python calls intern() on the set keys # which significantly improves memory usage. Sadly the pickle/unpickle process # doesn't call intern() on the keys and results in the same strings being duplicated # in memory. This also means pickle will save the same string multiple times in # the cache file. By interning the data here, the cache file shrinks dramatically # meaning faster load times and the reloaded cache files also consume much less # memory. This is worth any performance hit from this loops and the use of the # intern() data storage. # Python 3.x may behave better in this area for h in data[0]: data[0][h]["refs"] = internSet(data[0][h]["refs"]) data[0][h]["execs"] = internSet(data[0][h]["execs"]) for h in data[1]: data[1][h]["execs"] = internSet(data[1][h]["execs"]) p = pickle.Pickler(file(cachefile, "wb"), -1) p.dump([data, PARSERCACHE_VERSION]) bb.utils.unlockfile(glf) Logger = logging.getLoggerClass() class BufferedLogger(Logger): def __init__(self, name, level=0, target=None): Logger.__init__(self, name) self.setLevel(level) self.buffer = [] self.target = target def handle(self, record): self.buffer.append(record) def flush(self): for record in self.buffer: self.target.handle(record) self.buffer = [] class PythonParser(): getvars = ("d.getVar", "bb.data.getVar", "data.getVar") execfuncs = ("bb.build.exec_func", "bb.build.exec_task") def warn(self, func, arg): """Warn about calls of bitbake APIs which pass a non-literal argument for the variable name, as we're not able to track such a reference. """ try: funcstr = codegen.to_source(func) argstr = codegen.to_source(arg) except TypeError: self.log.debug(2, 'Failed to convert function and argument to source form') else: self.log.debug(1, self.unhandled_message % (funcstr, argstr)) def visit_Call(self, node): name = self.called_node_name(node.func) if name in self.getvars: if isinstance(node.args[0], ast.Str): self.var_references.add(node.args[0].s) else: self.warn(node.func, node.args[0]) elif name in self.execfuncs: if isinstance(node.args[0], ast.Str): self.var_execs.add(node.args[0].s) else: self.warn(node.func, node.args[0]) elif name and isinstance(node.func, (ast.Name, ast.Attribute)): self.execs.add(name) def called_node_name(self, node): """Given a called node, return its original string form""" components = [] while node: if isinstance(node, ast.Attribute): components.append(node.attr) node = node.value elif isinstance(node, ast.Name): components.append(node.id) return '.'.join(reversed(components)) else: break def __init__(self, name, log): self.var_references = set() self.var_execs = set() self.execs = set() self.references = set() self.log = BufferedLogger('BitBake.Data.%s' % name, logging.DEBUG, log) self.unhandled_message = "in call of %s, argument '%s' is not a string literal" self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message) def parse_python(self, node): h = hash(str(node)) if h in pythonparsecache: self.references = pythonparsecache[h]["refs"] self.execs = pythonparsecache[h]["execs"] return if h in pythonparsecacheextras: self.references = pythonparsecacheextras[h]["refs"] self.execs = pythonparsecacheextras[h]["execs"] return code = compile(check_indent(str(node)), "", "exec", ast.PyCF_ONLY_AST) for n in ast.walk(code): if n.__class__.__name__ == "Call": self.visit_Call(n) self.references.update(self.var_references) self.references.update(self.var_execs) pythonparsecacheextras[h] = {} pythonparsecacheextras[h]["refs"] = self.references pythonparsecacheextras[h]["execs"] = self.execs class ShellParser(): def __init__(self, name, log): self.funcdefs = set() self.allexecs = set() self.execs = set() self.log = BufferedLogger('BitBake.Data.%s' % name, logging.DEBUG, log) self.unhandled_template = "unable to handle non-literal command '%s'" self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template) def parse_shell(self, value): """Parse the supplied shell code in a string, returning the external commands it executes. """ h = hash(str(value)) if h in shellparsecache: self.execs = shellparsecache[h]["execs"] return self.execs if h in shellparsecacheextras: self.execs = shellparsecacheextras[h]["execs"] return self.execs try: tokens, _ = pyshyacc.parse(value, eof=True, debug=False) except pyshlex.NeedMore: raise sherrors.ShellSyntaxError("Unexpected EOF") for token in tokens: self.process_tokens(token) self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs) shellparsecacheextras[h] = {} shellparsecacheextras[h]["execs"] = self.execs return self.execs def process_tokens(self, tokens): """Process a supplied portion of the syntax tree as returned by pyshyacc.parse. """ def function_definition(value): self.funcdefs.add(value.name) return [value.body], None def case_clause(value): # Element 0 of each item in the case is the list of patterns, and # Element 1 of each item in the case is the list of commands to be # executed when that pattern matches. words = chain(*[item[0] for item in value.items]) cmds = chain(*[item[1] for item in value.items]) return cmds, words def if_clause(value): main = chain(value.cond, value.if_cmds) rest = value.else_cmds if isinstance(rest, tuple) and rest[0] == "elif": return chain(main, if_clause(rest[1])) else: return chain(main, rest) def simple_command(value): return None, chain(value.words, (assign[1] for assign in value.assigns)) token_handlers = { "and_or": lambda x: ((x.left, x.right), None), "async": lambda x: ([x], None), "brace_group": lambda x: (x.cmds, None), "for_clause": lambda x: (x.cmds, x.items), "function_definition": function_definition, "if_clause": lambda x: (if_clause(x), None), "pipeline": lambda x: (x.commands, None), "redirect_list": lambda x: ([x.cmd], None), "subshell": lambda x: (x.cmds, None), "while_clause": lambda x: (chain(x.condition, x.cmds), None), "until_clause": lambda x: (chain(x.condition, x.cmds), None), "simple_command": simple_command, "case_clause": case_clause, } for token in tokens: name, value = token try: more_tokens, words = token_handlers[name](value) except KeyError: raise NotImplementedError("Unsupported token type " + name) if more_tokens: self.process_tokens(more_tokens) if words: self.process_words(words) def process_words(self, words): """Process a set of 'words' in pyshyacc parlance, which includes extraction of executed commands from $() blocks, as well as grabbing the command name argument. """ words = list(words) for word in list(words): wtree = pyshlex.make_wordtree(word[1]) for part in wtree: if not isinstance(part, list): continue if part[0] in ('`', '$('): command = pyshlex.wordtree_as_string(part[1:-1]) self.parse_shell(command) if word[0] in ("cmd_name", "cmd_word"): if word in words: words.remove(word) usetoken = False for word in words: if word[0] in ("cmd_name", "cmd_word") or \ (usetoken and word[0] == "TOKEN"): if "=" in word[1]: usetoken = True continue cmd = word[1] if cmd.startswith("$"): self.log.debug(1, self.unhandled_template % cmd) elif cmd == "eval": command = " ".join(word for _, word in words[1:]) self.parse_shell(command) else: self.allexecs.add(cmd) break