summary | refs | log | tree | commit | diff | stats
path: root/bitbake
diff options
context:
space:
mode:
author: Richard Purdie <richard.purdie@linuxfoundation.org> 2014-07-25 14:54:23 +0100
committer: Richard Purdie <richard.purdie@linuxfoundation.org> 2014-07-26 08:50:14 +0100
commit: 89d178841208557b030103cf0ae813a42550487c (patch)
tree: 8fee8247d56c36cf28a5b36671725e01325eb58f /bitbake
parent: a05435fc59d32f2fcf4ea4185cb0655eeb343211 (diff)
download: poky-89d178841208557b030103cf0ae813a42550487c.tar.gz
bitbake: codeparser cache improvements
It turns out the codeparser cache is the bottleneck I've been observing when running bitbake commands, particularly as it grows. There are some things we can do about this:

* We were processing the cache with "intern()" at save time. It's actually much more memory efficient to do this at creation time.
* Use hashable objects such as frozenset rather than set so that we can compare objects.
* De-duplicate the cache objects, linking duplicates to the same object, which saves memory and disk usage and improves speed.
* Use custom setstate/getstate to avoid the overhead of object attribute names in the cache file.

To make this work, a global cache was needed for the list of set objects, as this was the only way I could find to get the data in at setstate object creation time :(.

Parsing shows a modest improvement with these changes, cache load time is significantly better, cache save time is reduced since there is now no need to reprocess the data, and the cache is much smaller.

We can drop the compress_keys() code and internSet code from the shared cache core since it's no longer used and has been replaced by codeparser specific pieces.

(Bitbake rev: 4aaf56bfbad4aa626be8a2f7a5f70834c3311dd3)

Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'bitbake')
-rw-r--r-- bitbake/lib/bb/cache.py 12
-rw-r--r-- bitbake/lib/bb/codeparser.py 143
2 files changed, 109 insertions, 46 deletions
diff --git a/bitbake/lib/bb/cache.py b/bitbake/lib/bb/cache.py
index c7f3b7ab71..f892d7dc32 100644
--- a/bitbake/lib/bb/cache.py
+++ b/bitbake/lib/bb/cache.py
@@ -764,16 +764,6 @@ class MultiProcessCache(object):
764 764
765 self.cachedata = data 765 self.cachedata = data
766 766
767 def internSet(self, items):
768 new = set()
769 for i in items:
770 new.add(intern(i))
771 return new
772
773 def compress_keys(self, data):
774 # Override in subclasses if desired
775 return
776
777 def create_cachedata(self): 767 def create_cachedata(self):
778 data = [{}] 768 data = [{}]
779 return data 769 return data
@@ -833,8 +823,6 @@ class MultiProcessCache(object):
833 self.merge_data(extradata, data) 823 self.merge_data(extradata, data)
834 os.unlink(f) 824 os.unlink(f)
835 825
836 self.compress_keys(data)
837
838 with open(self.cachefile, "wb") as f: 826 with open(self.cachefile, "wb") as f:
839 p = pickle.Pickler(f, -1) 827 p = pickle.Pickler(f, -1)
840 p.dump([data, self.__class__.CACHE_VERSION]) 828 p.dump([data, self.__class__.CACHE_VERSION])
diff --git a/bitbake/lib/bb/codeparser.py b/bitbake/lib/bb/codeparser.py
index 2e8de12f33..8b8f91a762 100644
--- a/bitbake/lib/bb/codeparser.py
+++ b/bitbake/lib/bb/codeparser.py
@@ -33,9 +33,82 @@ def check_indent(codestr):
33 return codestr 33 return codestr
34 34
35 35
# Basically pickle, in python 2.7.3 at least, does badly with data duplication
# upon pickling and unpickling. Combine this with duplicate objects and things
# are a mess.
#
# When the sets are originally created, python calls intern() on the set keys,
# which significantly improves memory usage. Sadly the pickle/unpickle process
# doesn't call intern() on the keys, which results in the same strings being
# duplicated in memory. This also means pickle will save the same string
# multiple times in the cache file.
#
# By having shell and python cacheline objects with setstate/getstate, we force
# the object creation through our own routine where we can call intern (via
# internSet).
#
# We also use hashable frozensets and ensure we use references to these so that
# duplicates can be removed, both in memory and in the resulting pickled data.
#
# By playing these games, the size of the cache file shrinks dramatically,
# meaning faster load times, and the reloaded cache files also consume much
# less memory. Smaller cache files, faster load times and lower memory usage
# are good.
#
# A custom getstate/setstate using tuples is actually worth 15% cachesize by
# avoiding duplication of the attribute names!
58
class SetCache(object):
    """Pool of frozensets so that equal sets share a single object.

    Every string placed in a pooled set is passed through intern(), and
    every set handed out is the one canonical frozenset for its contents.
    """

    def __init__(self):
        # Maps each pooled frozenset to itself, so that an equal set built
        # later can be exchanged for the instance already held.
        self.setcache = {}

    def internSet(self, items):
        """Return the pooled frozenset containing the interned ``items``.

        ``items`` is any iterable of strings (intern() rejects other
        types). Calling this twice with equal contents returns the very
        same frozenset object.
        """
        s = frozenset(intern(i) for i in items)
        # Key on the set itself rather than on hash(s): two different sets
        # can share a hash value, and keying on the bare hash would hand
        # back the wrong set for the second one. A dict lookup on the
        # frozenset checks equality as well as the hash.
        return self.setcache.setdefault(s, s)
73
74codecache = SetCache()
75
class pythonCacheLine(object):
    """One cached result for a parsed python fragment.

    Holds three pieces of data, all stored as pooled frozensets obtained
    from the module-level ``codecache``:

    refs     -- frozenset built from the ``refs`` iterable
    execs    -- frozenset built from the ``execs`` iterable
    contains -- dict mapping each key of ``contains`` to a frozenset of
                that key's values
    """

    def __init__(self, refs, execs, contains):
        self.refs = codecache.internSet(refs)
        self.execs = codecache.internSet(execs)
        self.contains = {}
        for key in contains:
            self.contains[key] = codecache.internSet(contains[key])

    def __getstate__(self):
        # Pickle a bare tuple rather than the instance __dict__ so the
        # attribute names are not repeated in the cache file.
        return (self.refs, self.execs, self.contains)

    def __setstate__(self, state):
        # Rebuild through __init__ so unpickled data is interned and pooled
        # exactly like freshly created data.
        (refs, execs, contains) = state
        self.__init__(refs, execs, contains)

    def __hash__(self):
        # Hash over refs, execs and the contains entries in sorted key
        # order, so that equal contents always give the same hash.
        l = (hash(self.refs), hash(self.execs))
        for c in sorted(self.contains):
            l = l + (c, hash(self.contains[c]))
        return hash(l)

    def __eq__(self, other):
        # Equality is defined on the same data __hash__ covers, so that
        # objects which compare equal also hash equal.
        if not isinstance(other, pythonCacheLine):
            return NotImplemented
        return (self.refs == other.refs and
                self.execs == other.execs and
                self.contains == other.contains)

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
95
class shellCacheLine(object):
    """One cached result for a parsed shell fragment.

    Holds a single attribute, ``execs``, stored as a pooled frozenset
    obtained from the module-level ``codecache``.
    """

    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        # The pickled state is the frozenset itself (not a 1-tuple), which
        # keeps attribute names out of the cache file.
        return self.execs

    def __setstate__(self, state):
        # Rebuild through __init__ so unpickled data is interned and pooled
        # exactly like freshly created data.
        self.__init__(state)

    def __hash__(self):
        return hash(self.execs)

    def __eq__(self, other):
        # Equality is defined on the same data __hash__ covers, so that
        # objects which compare equal also hash equal.
        if not isinstance(other, shellCacheLine):
            return NotImplemented
        return self.execs == other.execs

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
108
36class CodeParserCache(MultiProcessCache): 109class CodeParserCache(MultiProcessCache):
37 cache_file_name = "bb_codeparser.dat" 110 cache_file_name = "bb_codeparser.dat"
38 CACHE_VERSION = 6 111 CACHE_VERSION = 7
39 112
40 def __init__(self): 113 def __init__(self):
41 MultiProcessCache.__init__(self) 114 MultiProcessCache.__init__(self)
@@ -44,6 +117,27 @@ class CodeParserCache(MultiProcessCache):
44 self.pythoncacheextras = self.cachedata_extras[0] 117 self.pythoncacheextras = self.cachedata_extras[0]
45 self.shellcacheextras = self.cachedata_extras[1] 118 self.shellcacheextras = self.cachedata_extras[1]
46 119
120 # To avoid duplication in the codeparser cache, keep
121 # a lookup of hashes of objects we already have
122 self.pythoncachelines = {}
123 self.shellcachelines = {}
124
def newPythonCacheLine(self, refs, execs, contains):
    """Return a pythonCacheLine for the given data, reusing a stored one.

    A new pythonCacheLine is built from ``refs``, ``execs`` and
    ``contains``. If a line with the same hash and the same contents is
    already held in ``self.pythoncachelines``, that existing object is
    returned; otherwise the new line is returned.
    """
    cacheline = pythonCacheLine(refs, execs, contains)
    h = hash(cacheline)
    existing = self.pythoncachelines.get(h)
    if existing is None:
        self.pythoncachelines[h] = cacheline
        return cacheline
    # A matching hash does not prove matching contents. Only reuse the
    # stored line when its data really is the same, otherwise a hash
    # collision would silently return another fragment's data.
    if existing.__getstate__() == cacheline.__getstate__():
        return existing
    return cacheline
132
def newShellCacheLine(self, execs):
    """Return a shellCacheLine for ``execs``, reusing a stored one.

    A new shellCacheLine is built from ``execs``. If a line with the same
    hash and the same ``execs`` is already held in
    ``self.shellcachelines``, that existing object is returned; otherwise
    the new line is returned.
    """
    cacheline = shellCacheLine(execs)
    h = hash(cacheline)
    existing = self.shellcachelines.get(h)
    if existing is None:
        self.shellcachelines[h] = cacheline
        return cacheline
    # A matching hash does not prove matching contents. Only reuse the
    # stored line when its data really is the same, otherwise a hash
    # collision would silently return another fragment's data.
    if existing.execs == cacheline.execs:
        return existing
    return cacheline
140
47 def init_cache(self, d): 141 def init_cache(self, d):
48 MultiProcessCache.init_cache(self, d) 142 MultiProcessCache.init_cache(self, d)
49 143
@@ -51,25 +145,6 @@ class CodeParserCache(MultiProcessCache):
51 self.pythoncache = self.cachedata[0] 145 self.pythoncache = self.cachedata[0]
52 self.shellcache = self.cachedata[1] 146 self.shellcache = self.cachedata[1]
53 147
54 def compress_keys(self, data):
55 # When the dicts are originally created, python calls intern() on the set keys
56 # which significantly improves memory usage. Sadly the pickle/unpickle process
57 # doesn't call intern() on the keys and results in the same strings being duplicated
58 # in memory. This also means pickle will save the same string multiple times in
59 # the cache file. By interning the data here, the cache file shrinks dramatically
60 # meaning faster load times and the reloaded cache files also consume much less
61 # memory. This is worth any performance hit from this loops and the use of the
62 # intern() data storage.
63 # Python 3.x may behave better in this area
64 for h in data[0]:
65 data[0][h]["refs"] = self.internSet(data[0][h]["refs"])
66 data[0][h]["execs"] = self.internSet(data[0][h]["execs"])
67 for k in data[0][h]["contains"]:
68 data[0][h]["contains"][k] = self.internSet(data[0][h]["contains"][k])
69 for h in data[1]:
70 data[1][h]["execs"] = self.internSet(data[1][h]["execs"])
71 return
72
73 def create_cachedata(self): 148 def create_cachedata(self):
74 data = [{}, {}] 149 data = [{}, {}]
75 return data 150 return data
@@ -168,15 +243,19 @@ class PythonParser():
168 h = hash(str(node)) 243 h = hash(str(node))
169 244
170 if h in codeparsercache.pythoncache: 245 if h in codeparsercache.pythoncache:
171 self.references = codeparsercache.pythoncache[h]["refs"] 246 self.references = set(codeparsercache.pythoncache[h].refs)
172 self.execs = codeparsercache.pythoncache[h]["execs"] 247 self.execs = set(codeparsercache.pythoncache[h].execs)
173 self.contains = codeparsercache.pythoncache[h]["contains"] 248 self.contains = {}
249 for i in codeparsercache.pythoncache[h].contains:
250 self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
174 return 251 return
175 252
176 if h in codeparsercache.pythoncacheextras: 253 if h in codeparsercache.pythoncacheextras:
177 self.references = codeparsercache.pythoncacheextras[h]["refs"] 254 self.references = set(codeparsercache.pythoncacheextras[h].refs)
178 self.execs = codeparsercache.pythoncacheextras[h]["execs"] 255 self.execs = set(codeparsercache.pythoncacheextras[h].execs)
179 self.contains = codeparsercache.pythoncacheextras[h]["contains"] 256 self.contains = {}
257 for i in codeparsercache.pythoncacheextras[h].contains:
258 self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
180 return 259 return
181 260
182 code = compile(check_indent(str(node)), "<string>", "exec", 261 code = compile(check_indent(str(node)), "<string>", "exec",
@@ -188,10 +267,7 @@ class PythonParser():
188 267
189 self.execs.update(self.var_execs) 268 self.execs.update(self.var_execs)
190 269
191 codeparsercache.pythoncacheextras[h] = {} 270 codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)
192 codeparsercache.pythoncacheextras[h]["refs"] = self.references
193 codeparsercache.pythoncacheextras[h]["execs"] = self.execs
194 codeparsercache.pythoncacheextras[h]["contains"] = self.contains
195 271
196class ShellParser(): 272class ShellParser():
197 def __init__(self, name, log): 273 def __init__(self, name, log):
@@ -210,18 +286,17 @@ class ShellParser():
210 h = hash(str(value)) 286 h = hash(str(value))
211 287
212 if h in codeparsercache.shellcache: 288 if h in codeparsercache.shellcache:
213 self.execs = codeparsercache.shellcache[h]["execs"] 289 self.execs = set(codeparsercache.shellcache[h].execs)
214 return self.execs 290 return self.execs
215 291
216 if h in codeparsercache.shellcacheextras: 292 if h in codeparsercache.shellcacheextras:
217 self.execs = codeparsercache.shellcacheextras[h]["execs"] 293 self.execs = set(codeparsercache.shellcacheextras[h].execs)
218 return self.execs 294 return self.execs
219 295
220 self._parse_shell(value) 296 self._parse_shell(value)
221 self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs) 297 self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)
222 298
223 codeparsercache.shellcacheextras[h] = {} 299 codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)
224 codeparsercache.shellcacheextras[h]["execs"] = self.execs
225 300
226 return self.execs 301 return self.execs
227 302