Mercurial > hg
comparison mercurial/revlog.py @ 34880:9e18ab7f7240
sparse-read: move from a recursive-based approach to a heap-based one
The previous recursive approach was trying to optimise each read slice to have
a good density. It had the tendency to over-optimize smaller slices while
leaving larger holes in others.
The new approach focuses on improving the combined density of all the reads,
instead of the individual slices. It slices at the largest gaps first, as they
reduce the total amount of read data the most efficiently.
Another benefit of this approach is that we iterate over the delta chain only
once, reducing the overhead of slicing long delta chains.
On the repository we use for tests, the new approach shows performance
similar to, or faster than, the current default linear full read.
The repository contains about 450,000 revisions with many concurrent
topological branches. Tests have been run on two versions of the repository:
one built with the current delta constraint, and the other with an unlimited
delta span (using 'experimental.maxdeltachainspan=0').
Below are timings for building 1% of all the revisions in the manifest log using
'hg perfrevlogrevisions -m'. Times are given in seconds. They include the new
couple of follow-up changesets in this series.
delta-span standard unlimited
linear-read 922s 632s
sparse-read 814s 566s
author | Boris Feld <boris.feld@octobus.net> |
---|---|
date | Wed, 18 Oct 2017 12:53:00 +0200 |
parents | 4d5d5009bd75 |
children | 8c9b08a0c48c |
comparison
equal
deleted
inserted
replaced
34879:7d51a7792f52 | 34880:9e18ab7f7240 |
---|---|
15 | 15 |
16 import binascii | 16 import binascii |
17 import collections | 17 import collections |
18 import errno | 18 import errno |
19 import hashlib | 19 import hashlib |
20 import heapq | |
20 import os | 21 import os |
21 import struct | 22 import struct |
22 import zlib | 23 import zlib |
23 | 24 |
24 # import stuff from node for others to import from revlog | 25 # import stuff from node for others to import from revlog |
168 Assume that revs are sorted. | 169 Assume that revs are sorted. |
169 """ | 170 """ |
170 start = revlog.start | 171 start = revlog.start |
171 length = revlog.length | 172 length = revlog.length |
172 | 173 |
173 chunkqueue = collections.deque() | 174 if len(revs) <= 1: |
174 chunkqueue.append((revs, 0)) | 175 yield revs |
175 | 176 return |
176 while chunkqueue: | 177 |
177 revs, depth = chunkqueue.popleft() | 178 startbyte = start(revs[0]) |
178 | 179 endbyte = start(revs[-1]) + length(revs[-1]) |
179 startbyte = start(revs[0]) | 180 readdata = deltachainspan = endbyte - startbyte |
180 endbyte = start(revs[-1]) + length(revs[-1]) | 181 |
181 deltachainspan = endbyte - startbyte | 182 chainpayload = sum(length(r) for r in revs) |
182 | 183 |
183 if deltachainspan <= revlog._srminblocksize or len(revs) <= 1: | 184 if deltachainspan: |
184 yield revs | 185 density = chainpayload / float(deltachainspan) |
185 continue | 186 else: |
186 | 187 density = 1.0 |
187 # Find where is the largest hole (this is where we would split) and | 188 |
188 # sum up the lengths of useful data to compute the density of the span | 189 # Store the gaps in a heap to have them sorted by decreasing size |
189 textlen = 0 | 190 gapsheap = [] |
190 prevend = None | 191 heapq.heapify(gapsheap) |
191 largesthole = 0 | 192 prevend = None |
192 idxlargesthole = -1 | 193 for i, rev in enumerate(revs): |
193 for i, rev in enumerate(revs): | 194 revstart = start(rev) |
194 revstart = start(rev) | 195 revlen = length(rev) |
195 revlen = length(rev) | 196 |
196 | 197 if prevend is not None: |
197 if prevend is not None: | 198 gapsize = revstart - prevend |
198 hole = revstart - prevend | 199 if gapsize: |
199 if hole > largesthole: | 200 heapq.heappush(gapsheap, (-gapsize, i)) |
200 largesthole = hole | 201 |
201 idxlargesthole = i | 202 prevend = revstart + revlen |
202 | 203 |
203 textlen += revlen | 204 # Collect the indices of the largest holes until the density is acceptable |
204 prevend = revstart + revlen | 205 indicesheap = [] |
205 | 206 heapq.heapify(indicesheap) |
206 density = textlen / float(deltachainspan) if deltachainspan > 0 else 1.0 | 207 while gapsheap and density < revlog._srdensitythreshold: |
207 | 208 oppgapsize, gapidx = heapq.heappop(gapsheap) |
208 if density > revlog._srdensitythreshold: | 209 |
209 yield revs | 210 heapq.heappush(indicesheap, gapidx) |
210 continue | 211 |
211 | 212 # the gap sizes are stored as negatives to be sorted decreasingly |
212 # Add the left and right parts so that they will be sliced | 213 # by the heap |
213 # recursively too | 214 readdata -= (-oppgapsize) |
214 chunkqueue.append((revs[:idxlargesthole], depth + 1)) | 215 if readdata > 0: |
215 chunkqueue.append((revs[idxlargesthole:], depth + 1)) | 216 density = chainpayload / float(readdata) |
217 else: | |
218 density = 1.0 | |
219 | |
220 # Cut the revs at collected indices | |
221 previdx = 0 | |
222 while indicesheap: | |
223 idx = heapq.heappop(indicesheap) | |
224 yield revs[previdx:idx] | |
225 previdx = idx | |
226 yield revs[previdx:] | |
216 | 227 |
217 # index v0: | 228 # index v0: |
218 # 4 bytes: offset | 229 # 4 bytes: offset |
219 # 4 bytes: compressed length | 230 # 4 bytes: compressed length |
220 # 4 bytes: base rev | 231 # 4 bytes: base rev |