import ast
import codegen
import logging
import os.path
import bb.utils, bb.data
from itertools import chain
from pysh import pyshyacc, pyshlex, sherrors
from bb.cache import MultiProcessCache


logger = logging.getLogger('BitBake.CodeParser')

try:
    import cPickle as pickle
except ImportError:
    import pickle
    logger.info('Importing cPickle failed. Falling back to a very slow implementation.')


def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation.

    Leading newlines, tabs and spaces are skipped.  If the character
    immediately before the first other character is a tab or a space, the
    code is returned with an ``if 1:`` line prepended so that the indented
    body becomes valid top-level code.  In every other case, including an
    empty or whitespace-only string, ``codestr`` is returned unchanged.
    """
    length = len(codestr)

    # Bounds-checked scan: without the length test an empty or
    # whitespace-only string would run off the end and raise IndexError.
    i = 0
    while i < length and codestr[i] in "\n\t ":
        i += 1

    # No leading whitespace at all, or nothing but whitespace: nothing to wrap.
    if i == 0 or i == length:
        return codestr

    # Only wrap when the first real line itself is indented; leading blank
    # lines followed by unindented code need no wrapper.
    if codestr[i - 1] in "\t ":
        return "if 1:\n" + codestr

    return codestr


# Basically pickle, in python 2.7.3 at least, does badly with data duplication
# upon pickling and unpickling. Combine this with duplicate objects and things
# are a mess.
#
# When the sets are originally created, python calls intern() on the set keys
# which significantly improves memory usage. Sadly the pickle/unpickle process
# doesn't call intern() on the keys and results in the same strings being duplicated
# in memory. This also means pickle will save the same string multiple times in
# the cache file.
#
# By having shell and python cacheline objects with setstate/getstate, we force
# the object creation through our own routine where we can call intern (via internSet).
#
# We also use hashable frozensets and ensure we use references to these so that
# duplicates can be removed, both in memory and in the resulting pickled data.
#
# By playing these games, the size of the cache file shrinks dramatically
# meaning faster load times and the reloaded cache files also consume much less
# memory. Smaller cache files, faster load times and lower memory usage are good.
#
# A custom getstate/setstate using tuples is actually worth 15% cachesize by
# avoiding duplication of the attribute names!
class SetCache(object):
    """Registry of frozensets of interned strings, so equal sets share one object."""

    def __init__(self):
        # Maps hash(frozenset) -> the first frozenset stored under that hash.
        self.setcache = {}

    def internSet(self, items):
        """Return a shared frozenset holding the interned strings in ``items``.

        If an equal frozenset was returned before, that same object is
        returned again.  On a hash collision with a *different* set the new
        frozenset is returned as-is instead of the unrelated cached one.
        """
        s = frozenset(intern(i) for i in items)
        h = hash(s)
        cached = self.setcache.get(h)
        if cached is None:
            self.setcache[h] = s
            return s
        # Equal hashes do not imply equal sets; only reuse a true duplicate.
        if cached == s:
            return cached
        return s


codecache = SetCache()


class pythonCacheLine(object):
    """Cache entry for a python fragment: its refs, execs and contains sets."""

    def __init__(self, refs, execs, contains):
        self.refs = codecache.internSet(refs)
        self.execs = codecache.internSet(execs)
        self.contains = {}
        for c in contains:
            self.contains[c] = codecache.internSet(contains[c])

    def __getstate__(self):
        # A tuple rather than a dict, so attribute names are not pickled.
        return (self.refs, self.execs, self.contains)

    def __setstate__(self, state):
        (refs, execs, contains) = state
        # Rebuild through __init__ so the strings get interned again.
        self.__init__(refs, execs, contains)

    def __hash__(self):
        l = (hash(self.refs), hash(self.execs))
        # Sorted keys keep the hash independent of dict ordering.
        for c in sorted(self.contains.keys()):
            l = l + (c, hash(self.contains[c]))
        return hash(l)

    def __repr__(self):
        return " ".join([str(self.refs), str(self.execs), str(self.contains)])


class shellCacheLine(object):
    """Cache entry for a shell fragment: the set of commands it executes."""

    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        # NOTE: this is the bare frozenset, not a 1-tuple; the parentheses
        # are only grouping.  Kept as-is so the pickled form is unchanged.
        return (self.execs)

    def __setstate__(self, state):
        (execs) = state
        # Rebuild through __init__ so the strings get interned again.
        self.__init__(execs)

    def __hash__(self):
        return hash(self.execs)

    def __repr__(self):
        return str(self.execs)


class CodeParserCache(MultiProcessCache):
    """Code parser cache: slot 0 of the cache data holds python entries, slot 1 shell entries."""

    cache_file_name = "bb_codeparser.dat"
    CACHE_VERSION = 7

    def __init__(self):
        MultiProcessCache.__init__(self)
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]
        self.pythoncacheextras = self.cachedata_extras[0]
        self.shellcacheextras = self.cachedata_extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of hashes of objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    @staticmethod
    def _dedup_cacheline(table, cacheline):
        """Return the entry of ``table`` equal to ``cacheline``, registering it if new.

        ``table`` maps hash(cacheline) -> cacheline.  A stored entry is only
        reused when its state equals the new one's, so a hash collision
        cannot hand back an unrelated cache line.
        """
        h = hash(cacheline)
        cached = table.get(h)
        if cached is None:
            table[h] = cacheline
            return cacheline
        if cached.__getstate__() == cacheline.__getstate__():
            return cached
        return cacheline

    def newPythonCacheLine(self, refs, execs, contains):
        """Return a (possibly shared) pythonCacheLine for the given data."""
        cacheline = pythonCacheLine(refs, execs, contains)
        return self._dedup_cacheline(self.pythoncachelines, cacheline)

    def newShellCacheLine(self, execs):
        """Return a (possibly shared) shellCacheLine for the given execs."""
        cacheline = shellCacheLine(execs)
        return self._dedup_cacheline(self.shellcachelines, cacheline)

    def init_cache(self, d):
        MultiProcessCache.init_cache(self, d)

        # cachedata gets re-assigned in the parent
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        """Return fresh, empty cache data: [python entries, shell entries]."""
        return [{}, {}]


codeparsercache = CodeParserCache()


def parser_cache_init(d):
    """Call init_cache(d) on the module-level code parser cache."""
    codeparsercache.init_cache(d)


def parser_cache_save(d):
    """Call save_extras(d) on the module-level code parser cache."""
    codeparsercache.save_extras(d)


def parser_cache_savemerge(d):
    """Call save_merge(d) on the module-level code parser cache."""
    codeparsercache.save_merge(d)


Logger = logging.getLoggerClass()


class BufferedLogger(Logger):
    """Logger that stores records and hands them to ``target`` on flush()."""

    def __init__(self, name, level=0, target=None):
        Logger.__init__(self, name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        # Hold the record back instead of emitting it straight away.
        self.buffer.append(record)

    def flush(self):
        """Pass every buffered record to the target logger and empty the buffer.

        With no target (the constructor default) the records stay buffered
        rather than failing with AttributeError.
        """
        if self.target is None:
            return
        for record in self.buffer:
            self.target.handle(record)
        self.buffer = []


class PythonParser():
    """Collect variable references and called functions from python code."""

    getvars = (".getVar", ".appendVar", ".prependVar")
    containsfuncs = ("bb.utils.contains", "base_contains", "bb.utils.contains_any")
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug(2, 'Failed to convert function and argument to source form')
        else:
            self.log.debug(1, self.unhandled_message % (funcstr, argstr))

    def visit_Call(self, node):
        """Record what a single ast.Call node references or executes.

        * getvars / containsfuncs calls with a string-literal first argument
          add to ``self.references``, or to ``self.contains`` when a
          containsfuncs call also has a string-literal second argument.
        * execfuncs calls with a string-literal first argument add to
          ``self.var_execs``.
        * A non-literal first argument is reported through warn().
        * Any other call with a resolvable name is added to ``self.execs``.

        Calls to the tracked APIs with no positional arguments are ignored
        instead of raising IndexError.
        """
        name = self.called_node_name(node.func)
        args = node.args

        if name and (name.endswith(self.getvars) or name in self.containsfuncs):
            if not args:
                # Nothing positional to inspect, e.g. "d.getVar()".
                return
            first = args[0]
            if isinstance(first, ast.Str):
                varname = first.s
                if (name in self.containsfuncs and len(args) > 1
                        and isinstance(args[1], ast.Str)):
                    self.contains.setdefault(varname, set()).add(args[1].s)
                else:
                    self.references.add(varname)
            else:
                self.warn(node.func, first)
        elif name in self.execfuncs:
            if not args:
                return
            first = args[0]
            if isinstance(first, ast.Str):
                self.var_execs.add(first.s)
            else:
                self.warn(node.func, first)
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form"""
        components = []
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                # Not a plain dotted name (e.g. a call or subscript in the
                # chain): fall out of the loop and return None.
                break

    def __init__(self, name, log):
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, log)

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    def parse_python(self, node):
        """Parse python source ``node`` and fill in references, execs and contains.

        Empty or whitespace-only input is ignored.  The main cache and then
        the extras cache are consulted first; on a miss the code is compiled
        to an AST, every Call node is visited and the result is stored in
        the extras cache.
        """
        if not node or not node.strip():
            return

        # NOTE(review): entries are keyed only on hash() of the source text,
        # so two different snippets with the same hash would share an entry.
        h = hash(str(node))

        for cache in (codeparsercache.pythoncache, codeparsercache.pythoncacheextras):
            if h in cache:
                line = cache[h]
                # Copy into mutable containers; the cached ones are shared.
                self.references = set(line.refs)
                self.execs = set(line.execs)
                self.contains = {}
                for var in line.contains:
                    self.contains[var] = set(line.contains[var])
                return

        code = compile(check_indent(str(node)), "<string>", "exec",
                       ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if isinstance(n, ast.Call):
                self.visit_Call(n)

        self.execs.update(self.var_execs)

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(
            self.references, self.execs, self.contains)


class ShellParser():
    """Collect the external commands executed by a piece of shell code."""

    def __init__(self, name, log):
        self.funcdefs = set()
        self.allexecs = set()
        self.execs = set()
        self.log = BufferedLogger('BitBake.Data.%s' % name, logging.DEBUG, log)
        self.unhandled_template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        # NOTE(review): as in PythonParser.parse_python, the cache key is
        # only hash() of the source text.
        h = hash(str(value))

        for cache in (codeparsercache.shellcache, codeparsercache.shellcacheextras):
            if h in cache:
                self.execs = set(cache[h].execs)
                return self.execs

        self._parse_shell(value)
        # Functions defined in the code itself are not external commands.
        self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Run pyshyacc over ``value`` and process every resulting token list.

        Raises sherrors.ShellSyntaxError when the lexer reports that it
        needs more input (pyshlex.NeedMore).
        """
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except pyshlex.NeedMore:
            raise sherrors.ShellSyntaxError("Unexpected EOF")

        for token in tokens:
            self.process_tokens(token)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.

        Each handler returns a ``(more_tokens, words)`` pair: nested tokens
        to recurse into and words to pass to process_words().  An unknown
        token type raises NotImplementedError.
        """

        def function_definition(value):
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            words = chain(*[item[0] for item in value.items])
            cmds = chain(*[item[1] for item in value.items])
            return cmds, words

        def if_clause(value):
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            # An "elif" arrives as a ("elif", clause) tuple; flatten it.
            if isinstance(rest, tuple) and rest[0] == "elif":
                return chain(main, if_clause(rest[1]))
            else:
                return chain(main, rest)

        def simple_command(value):
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        token_handlers = {
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "for_clause": lambda x: (x.cmds, x.items),
            "function_definition": function_definition,
            "if_clause": lambda x: (if_clause(x), None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
            "subshell": lambda x: (x.cmds, None),
            "while_clause": lambda x: (chain(x.condition, x.cmds), None),
            "until_clause": lambda x: (chain(x.condition, x.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        for token in tokens:
            name, value = token
            # Only the lookup is guarded, so a KeyError raised inside a
            # handler is not misreported as an unsupported token type.
            try:
                handler = token_handlers[name]
            except KeyError:
                raise NotImplementedError("Unsupported token type " + name)
            more_tokens, words = handler(value)

            if more_tokens:
                self.process_tokens(more_tokens)

            if words:
                self.process_words(words)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        words = list(words)
        # Iterate over a copy, since entries may be removed from ``words``.
        for word in list(words):
            wtree = pyshlex.make_wordtree(word[1])
            for part in wtree:
                if not isinstance(part, list):
                    continue

                if part[0] in ('`', '$('):
                    # Command substitution: parse the embedded command too.
                    command = pyshlex.wordtree_as_string(part[1:-1])
                    self._parse_shell(command)

                    # A command name made of a substitution is dropped from
                    # the command-name scan below.
                    if word[0] in ("cmd_name", "cmd_word"):
                        if word in words:
                            words.remove(word)

        usetoken = False
        for word in words:
            if word[0] in ("cmd_name", "cmd_word") or \
               (usetoken and word[0] == "TOKEN"):
                if "=" in word[1]:
                    # Looks like an assignment prefix; the command name may
                    # follow as a plain TOKEN.
                    usetoken = True
                    continue

                cmd = word[1]
                if cmd.startswith("$"):
                    self.log.debug(1, self.unhandled_template % cmd)
                elif cmd == "eval":
                    command = " ".join(text for _, text in words[1:])
                    self._parse_shell(command)
                else:
                    self.allexecs.add(cmd)
                break