comparison mercurial/revlog.py @ 34880:9e18ab7f7240

sparse-read: move from a recursive-based approach to a heap-based one The previous recursive approach was trying to optimise each read slice to have a good density. It had the tendency to over-optimize smaller slices while leaving larger hole in others. The new approach focuses on improving the combined density of all the reads, instead of the individual slices. It slices at the largest gaps first, as they reduce the total amount of read data the most efficiently. Another benefit of this approach is that we iterate over the delta chain only once, reducing the overhead of slicing long delta chains. On the repository we use for tests, the new approach shows similar or faster performance than the current default linear full read. The repository contains about 450,000 revisions with many concurrent topological branches. Tests have been run on two versions of the repository: one built with the current delta constraint, and the other with an unlimited delta span (using 'experimental.maxdeltachainspan=0') Below are timings for building 1% of all the revision in the manifest log using 'hg perfrevlogrevisions -m'. Times are given in seconds. They include the new couple of follow-up changeset in this series. delta-span standard unlimited linear-read 922s 632s sparse-read 814s 566s
author Boris Feld <boris.feld@octobus.net>
date Wed, 18 Oct 2017 12:53:00 +0200
parents 4d5d5009bd75
children 8c9b08a0c48c
comparison
equal deleted inserted replaced
34879:7d51a7792f52 34880:9e18ab7f7240
15 15
16 import binascii 16 import binascii
17 import collections 17 import collections
18 import errno 18 import errno
19 import hashlib 19 import hashlib
20 import heapq
20 import os 21 import os
21 import struct 22 import struct
22 import zlib 23 import zlib
23 24
24 # import stuff from node for others to import from revlog 25 # import stuff from node for others to import from revlog
168 Assume that revs are sorted. 169 Assume that revs are sorted.
169 """ 170 """
170 start = revlog.start 171 start = revlog.start
171 length = revlog.length 172 length = revlog.length
172 173
173 chunkqueue = collections.deque() 174 if len(revs) <= 1:
174 chunkqueue.append((revs, 0)) 175 yield revs
175 176 return
176 while chunkqueue: 177
177 revs, depth = chunkqueue.popleft() 178 startbyte = start(revs[0])
178 179 endbyte = start(revs[-1]) + length(revs[-1])
179 startbyte = start(revs[0]) 180 readdata = deltachainspan = endbyte - startbyte
180 endbyte = start(revs[-1]) + length(revs[-1]) 181
181 deltachainspan = endbyte - startbyte 182 chainpayload = sum(length(r) for r in revs)
182 183
183 if deltachainspan <= revlog._srminblocksize or len(revs) <= 1: 184 if deltachainspan:
184 yield revs 185 density = chainpayload / float(deltachainspan)
185 continue 186 else:
186 187 density = 1.0
187 # Find where is the largest hole (this is where we would split) and 188
188 # sum up the lengths of useful data to compute the density of the span 189 # Store the gaps in a heap to have them sorted by decreasing size
189 textlen = 0 190 gapsheap = []
190 prevend = None 191 heapq.heapify(gapsheap)
191 largesthole = 0 192 prevend = None
192 idxlargesthole = -1 193 for i, rev in enumerate(revs):
193 for i, rev in enumerate(revs): 194 revstart = start(rev)
194 revstart = start(rev) 195 revlen = length(rev)
195 revlen = length(rev) 196
196 197 if prevend is not None:
197 if prevend is not None: 198 gapsize = revstart - prevend
198 hole = revstart - prevend 199 if gapsize:
199 if hole > largesthole: 200 heapq.heappush(gapsheap, (-gapsize, i))
200 largesthole = hole 201
201 idxlargesthole = i 202 prevend = revstart + revlen
202 203
203 textlen += revlen 204 # Collect the indices of the largest holes until the density is acceptable
204 prevend = revstart + revlen 205 indicesheap = []
205 206 heapq.heapify(indicesheap)
206 density = textlen / float(deltachainspan) if deltachainspan > 0 else 1.0 207 while gapsheap and density < revlog._srdensitythreshold:
207 208 oppgapsize, gapidx = heapq.heappop(gapsheap)
208 if density > revlog._srdensitythreshold: 209
209 yield revs 210 heapq.heappush(indicesheap, gapidx)
210 continue 211
211 212 # the gap sizes are stored as negatives to be sorted decreasingly
212 # Add the left and right parts so that they will be sliced 213 # by the heap
213 # recursively too 214 readdata -= (-oppgapsize)
214 chunkqueue.append((revs[:idxlargesthole], depth + 1)) 215 if readdata > 0:
215 chunkqueue.append((revs[idxlargesthole:], depth + 1)) 216 density = chainpayload / float(readdata)
217 else:
218 density = 1.0
219
220 # Cut the revs at collected indices
221 previdx = 0
222 while indicesheap:
223 idx = heapq.heappop(indicesheap)
224 yield revs[previdx:idx]
225 previdx = idx
226 yield revs[previdx:]
216 227
217 # index v0: 228 # index v0:
218 # 4 bytes: offset 229 # 4 bytes: offset
219 # 4 bytes: compressed length 230 # 4 bytes: compressed length
220 # 4 bytes: base rev 231 # 4 bytes: base rev