Mercurial > hg
comparison mercurial/linelog.py @ 38795:422d661056be
linelog: add a Python implementation of the linelog datastructure
This datastructure was originally developed by Jun Wu at Facebook,
inspired by SCCS weaves. It's useful as a cache for blame information,
but also is the magic that makes `hg absorb` easy to implement. In
service of importing the code to Mercurial, I wanted to actually
/understand/ it, and once I did I decided to take a run at
implementing it.
The help/internals/linelog.txt document is the README from Jun Wu's
implementaiton. It all applies to our linelog implementation.
Differential Revision: https://phab.mercurial-scm.org/D3990
author | Augie Fackler <augie@google.com> |
---|---|
date | Mon, 30 Jul 2018 10:42:37 -0400 |
parents | |
children | f956dc7217fc |
comparison
equal
deleted
inserted
replaced
38794:1d01cf0416a5 | 38795:422d661056be |
---|---|
1 # linelog - efficient cache for annotate data | |
2 # | |
3 # Copyright 2018 Google LLC. | |
4 # | |
5 # This software may be used and distributed according to the terms of the | |
6 # GNU General Public License version 2 or any later version. | |
7 """linelog is an efficient cache for annotate data inspired by SCCS Weaves. | |
8 | |
9 SCCS Weaves are an implementation of | |
10 https://en.wikipedia.org/wiki/Interleaved_deltas. See | |
11 mercurial/help/internals/linelog.txt for an exploration of SCCS weaves | |
12 and how linelog works in detail. | |
13 | |
14 Here's a hacker's summary: a linelog is a program which is executed in | |
15 the context of a revision. Executing the program emits information | |
16 about lines, including the revision that introduced them and the line | |
17 number in the file at the introducing revision. When an insertion or | |
18 deletion is performed on the file, a jump instruction is used to patch | |
19 in a new body of annotate information. | |
20 """ | |
21 from __future__ import absolute_import, print_function | |
22 | |
23 import abc | |
24 import struct | |
25 | |
26 from mercurial import ( | |
27 pycompat, | |
28 ) | |
29 from .thirdparty import ( | |
30 attr, | |
31 ) | |
32 | |
33 _llentry = struct.Struct('>II') | |
34 | |
35 class LineLogError(Exception): | |
36 """Error raised when something bad happens internally in linelog.""" | |
37 | |
38 @attr.s | |
39 class lineinfo(object): | |
40 # Introducing revision of this line. | |
41 rev = attr.ib() | |
42 # Line number for this line in its introducing revision. | |
43 linenum = attr.ib() | |
44 # Private. Offset in the linelog program of this line. Used internally. | |
45 _offset = attr.ib() | |
46 | |
47 @attr.s | |
48 class annotateresult(object): | |
49 rev = attr.ib() | |
50 lines = attr.ib() | |
51 _eof = attr.ib() | |
52 | |
53 def __iter__(self): | |
54 return iter(self.lines) | |
55 | |
56 class _llinstruction(object): | |
57 | |
58 __metaclass__ = abc.ABCMeta | |
59 | |
60 @abc.abstractmethod | |
61 def __init__(self, op1, op2): | |
62 pass | |
63 | |
64 @abc.abstractmethod | |
65 def __str__(self): | |
66 pass | |
67 | |
68 def __repr__(self): | |
69 return str(self) | |
70 | |
71 @abc.abstractmethod | |
72 def __eq__(self, other): | |
73 pass | |
74 | |
75 @abc.abstractmethod | |
76 def encode(self): | |
77 """Encode this instruction to the binary linelog format.""" | |
78 | |
79 @abc.abstractmethod | |
80 def execute(self, rev, pc, emit): | |
81 """Execute this instruction. | |
82 | |
83 Args: | |
84 rev: The revision we're annotating. | |
85 pc: The current offset in the linelog program. | |
86 emit: A function that accepts a single lineinfo object. | |
87 | |
88 Returns: | |
89 The new value of pc. Returns None if exeuction should stop | |
90 (that is, we've found the end of the file.) | |
91 """ | |
92 | |
93 class _jge(_llinstruction): | |
94 """If the current rev is greater than or equal to op1, jump to op2.""" | |
95 | |
96 def __init__(self, op1, op2): | |
97 self._cmprev = op1 | |
98 self._target = op2 | |
99 | |
100 def __str__(self): | |
101 return 'JGE %d %d' % (self._cmprev, self._target) | |
102 | |
103 def __eq__(self, other): | |
104 return (type(self) == type(other) | |
105 and self._cmprev == other._cmprev | |
106 and self._target == other._target) | |
107 | |
108 def encode(self): | |
109 return _llentry.pack(self._cmprev << 2, self._target) | |
110 | |
111 def execute(self, rev, pc, emit): | |
112 if rev >= self._cmprev: | |
113 return self._target | |
114 return pc + 1 | |
115 | |
116 class _jump(_llinstruction): | |
117 """Unconditional jumps are expressed as a JGE with op1 set to 0.""" | |
118 | |
119 def __init__(self, op1, op2): | |
120 if op1 != 0: | |
121 raise LineLogError("malformed JUMP, op1 must be 0, got %d" % op1) | |
122 self._target = op2 | |
123 | |
124 def __str__(self): | |
125 return 'JUMP %d' % (self._target) | |
126 | |
127 def __eq__(self, other): | |
128 return (type(self) == type(other) | |
129 and self._target == other._target) | |
130 | |
131 def encode(self): | |
132 return _llentry.pack(0, self._target) | |
133 | |
134 def execute(self, rev, pc, emit): | |
135 return self._target | |
136 | |
137 class _eof(_llinstruction): | |
138 """EOF is expressed as a JGE that always jumps to 0.""" | |
139 | |
140 def __init__(self, op1, op2): | |
141 if op1 != 0: | |
142 raise LineLogError("malformed EOF, op1 must be 0, got %d" % op1) | |
143 if op2 != 0: | |
144 raise LineLogError("malformed EOF, op2 must be 0, got %d" % op2) | |
145 | |
146 def __str__(self): | |
147 return 'EOF' | |
148 | |
149 def __eq__(self, other): | |
150 return type(self) == type(other) | |
151 | |
152 def encode(self): | |
153 return _llentry.pack(0, 0) | |
154 | |
155 def execute(self, rev, pc, emit): | |
156 return None | |
157 | |
158 class _jl(_llinstruction): | |
159 """If the current rev is less than op1, jump to op2.""" | |
160 | |
161 def __init__(self, op1, op2): | |
162 self._cmprev = op1 | |
163 self._target = op2 | |
164 | |
165 def __str__(self): | |
166 return 'JL %d %d' % (self._cmprev, self._target) | |
167 | |
168 def __eq__(self, other): | |
169 return (type(self) == type(other) | |
170 and self._cmprev == other._cmprev | |
171 and self._target == other._target) | |
172 | |
173 def encode(self): | |
174 return _llentry.pack(1 | (self._cmprev << 2), self._target) | |
175 | |
176 def execute(self, rev, pc, emit): | |
177 if rev < self._cmprev: | |
178 return self._target | |
179 return pc + 1 | |
180 | |
181 class _line(_llinstruction): | |
182 """Emit a line.""" | |
183 | |
184 def __init__(self, op1, op2): | |
185 # This line was introduced by this revision number. | |
186 self._rev = op1 | |
187 # This line had the specified line number in the introducing revision. | |
188 self._origlineno = op2 | |
189 | |
190 def __str__(self): | |
191 return 'LINE %d %d' % (self._rev, self._origlineno) | |
192 | |
193 def __eq__(self, other): | |
194 return (type(self) == type(other) | |
195 and self._rev == other._rev | |
196 and self._origlineno == other._origlineno) | |
197 | |
198 def encode(self): | |
199 return _llentry.pack(2 | (self._rev << 2), self._origlineno) | |
200 | |
201 def execute(self, rev, pc, emit): | |
202 emit(lineinfo(self._rev, self._origlineno, pc)) | |
203 return pc + 1 | |
204 | |
205 def _decodeone(data, offset): | |
206 """Decode a single linelog instruction from an offset in a buffer.""" | |
207 try: | |
208 op1, op2 = _llentry.unpack_from(data, offset) | |
209 except struct.error as e: | |
210 raise LineLogError('reading an instruction failed: %r' % e) | |
211 opcode = op1 & 0b11 | |
212 op1 = op1 >> 2 | |
213 if opcode == 0: | |
214 if op1 == 0: | |
215 if op2 == 0: | |
216 return _eof(op1, op2) | |
217 return _jump(op1, op2) | |
218 return _jge(op1, op2) | |
219 elif opcode == 1: | |
220 return _jl(op1, op2) | |
221 elif opcode == 2: | |
222 return _line(op1, op2) | |
223 raise NotImplementedError('Unimplemented opcode %r' % opcode) | |
224 | |
225 class linelog(object): | |
226 """Efficient cache for per-line history information.""" | |
227 | |
228 def __init__(self, program=None, maxrev=0): | |
229 if program is None: | |
230 # We pad the program with an extra leading EOF so that our | |
231 # offsets will match the C code exactly. This means we can | |
232 # interoperate with the C code. | |
233 program = [_eof(0, 0), _eof(0, 0)] | |
234 self._program = program | |
235 self._lastannotate = None | |
236 self._maxrev = maxrev | |
237 | |
238 def __eq__(self, other): | |
239 return (type(self) == type(other) | |
240 and self._program == other._program | |
241 and self._maxrev == other._maxrev) | |
242 | |
243 def __repr__(self): | |
244 return '<linelog at %s: maxrev=%d size=%d>' % ( | |
245 hex(id(self)), self._maxrev, len(self._program)) | |
246 | |
247 def debugstr(self): | |
248 fmt = '%%%dd %%s' % len(str(len(self._program))) | |
249 return '\n'.join( | |
250 fmt % (idx, i) for idx, i in enumerate(self._program[1:], 1)) | |
251 | |
252 @classmethod | |
253 def fromdata(cls, buf): | |
254 if len(buf) % _llentry.size != 0: | |
255 raise LineLogError( | |
256 "invalid linelog buffer size %d (must be a multiple of %d)" % ( | |
257 len(buf), _llentry.size)) | |
258 expected = len(buf) / _llentry.size | |
259 fakejge = _decodeone(buf, 0) | |
260 if isinstance(fakejge, _jump): | |
261 maxrev = 0 | |
262 else: | |
263 maxrev = fakejge._cmprev | |
264 numentries = fakejge._target | |
265 if expected != numentries: | |
266 raise LineLogError("corrupt linelog data: claimed" | |
267 " %d entries but given data for %d entries" % ( | |
268 expected, numentries)) | |
269 instructions = [_eof(0, 0)] | |
270 for offset in pycompat.xrange(1, numentries): | |
271 instructions.append(_decodeone(buf, offset * _llentry.size)) | |
272 return cls(instructions, maxrev=maxrev) | |
273 | |
274 def encode(self): | |
275 hdr = _jge(self._maxrev, len(self._program)).encode() | |
276 return hdr + ''.join(i.encode() for i in self._program[1:]) | |
277 | |
278 def clear(self): | |
279 self._program = [] | |
280 self._maxrev = 0 | |
281 self._lastannotate = None | |
282 | |
283 def replacelines(self, rev, a1, a2, b1, b2): | |
284 """Replace lines [a1, a2) with lines [b1, b2).""" | |
285 if self._lastannotate: | |
286 # TODO(augie): make replacelines() accept a revision at | |
287 # which we're editing as well as a revision to mark | |
288 # responsible for the edits. In hg-experimental it's | |
289 # stateful like this, so we're doing the same thing to | |
290 # retain compatibility with absorb until that's imported. | |
291 ar = self._lastannotate | |
292 else: | |
293 ar = self.annotate(rev) | |
294 # ar = self.annotate(self._maxrev) | |
295 if a1 > len(ar.lines): | |
296 raise LineLogError( | |
297 '%d contains %d lines, tried to access line %d' % ( | |
298 rev, len(ar.lines), a1)) | |
299 elif a1 == len(ar.lines): | |
300 # Simulated EOF instruction since we're at EOF, which | |
301 # doesn't have a "real" line. | |
302 a1inst = _eof(0, 0) | |
303 a1info = lineinfo(0, 0, ar._eof) | |
304 else: | |
305 a1info = ar.lines[a1] | |
306 a1inst = self._program[a1info._offset] | |
307 oldproglen = len(self._program) | |
308 appendinst = self._program.append | |
309 | |
310 # insert | |
311 if b1 < b2: | |
312 # Determine the jump target for the JGE at the start of | |
313 # the new block. | |
314 tgt = oldproglen + (b2 - b1 + 1) | |
315 # Jump to skip the insert if we're at an older revision. | |
316 appendinst(_jl(rev, tgt)) | |
317 for linenum in pycompat.xrange(b1, b2): | |
318 appendinst(_line(rev, linenum)) | |
319 # delete | |
320 if a1 < a2: | |
321 if a2 > len(ar.lines): | |
322 raise LineLogError( | |
323 '%d contains %d lines, tried to access line %d' % ( | |
324 rev, len(ar.lines), a2)) | |
325 elif a2 == len(ar.lines): | |
326 endaddr = ar._eof | |
327 else: | |
328 endaddr = ar.lines[a2]._offset | |
329 if a2 > 0 and rev < self._maxrev: | |
330 # If we're here, we're deleting a chunk of an old | |
331 # commit, so we need to be careful and not touch | |
332 # invisible lines between a2-1 and a2 (IOW, lines that | |
333 # are added later). | |
334 endaddr = ar.lines[a2 - 1]._offset + 1 | |
335 appendinst(_jge(rev, endaddr)) | |
336 # copy instruction from a1 | |
337 appendinst(a1inst) | |
338 # if a1inst isn't a jump or EOF, then we need to add an unconditional | |
339 # jump back into the program here. | |
340 if not isinstance(a1inst, (_jump, _eof)): | |
341 appendinst(_jump(0, a1info._offset + 1)) | |
342 # Patch instruction at a1, which makes our patch live. | |
343 self._program[a1info._offset] = _jump(0, oldproglen) | |
344 # For compat with the C version, re-annotate rev so that | |
345 # self.annotateresult is cromulent.. We could fix up the | |
346 # annotateresult in place (which is how the C version works), | |
347 # but for now we'll pass on that and see if it matters in | |
348 # practice. | |
349 self.annotate(max(self._lastannotate.rev, rev)) | |
350 if rev > self._maxrev: | |
351 self._maxrev = rev | |
352 | |
353 def annotate(self, rev): | |
354 pc = 1 | |
355 lines = [] | |
356 # Sanity check: if len(lines) is longer than len(program), we | |
357 # hit an infinite loop in the linelog program somehow and we | |
358 # should stop. | |
359 while pc is not None and len(lines) < len(self._program): | |
360 inst = self._program[pc] | |
361 lastpc = pc | |
362 pc = inst.execute(rev, pc, lines.append) | |
363 if pc is not None: | |
364 raise LineLogError( | |
365 'Probably hit an infinite loop in linelog. Program:\n' + | |
366 self.debugstr()) | |
367 ar = annotateresult(rev, lines, lastpc) | |
368 self._lastannotate = ar | |
369 return ar | |
370 | |
371 @property | |
372 def maxrev(self): | |
373 return self._maxrev | |
374 | |
375 # Stateful methods which depend on the value of the last | |
376 # annotation run. This API is for compatiblity with the original | |
377 # linelog, and we should probably consider refactoring it. | |
378 @property | |
379 def annotateresult(self): | |
380 """Return the last annotation result. C linelog code exposed this.""" | |
381 return [(l.rev, l.linenum) for l in self._lastannotate.lines] | |
382 | |
383 def getoffset(self, line): | |
384 return self._lastannotate.lines[line]._offset | |
385 | |
386 def getalllines(self, start=0, end=0): | |
387 """Get all lines that ever occurred in [start, end). | |
388 | |
389 Passing start == end == 0 means "all lines ever". | |
390 | |
391 This works in terms of *internal* program offsets, not line numbers. | |
392 """ | |
393 pc = start or 1 | |
394 lines = [] | |
395 # only take as many steps as there are instructions in the | |
396 # program - if we don't find an EOF or our stop-line before | |
397 # then, something is badly broken. | |
398 for step in pycompat.xrange(len(self._program)): | |
399 inst = self._program[pc] | |
400 nextpc = pc + 1 | |
401 if isinstance(inst, _jump): | |
402 nextpc = inst._target | |
403 elif isinstance(inst, _eof): | |
404 return lines | |
405 elif isinstance(inst, (_jl, _jge)): | |
406 pass | |
407 elif isinstance(inst, _line): | |
408 lines.append((inst._rev, inst._origlineno)) | |
409 else: | |
410 raise LineLogError("Illegal instruction %r" % inst) | |
411 if nextpc == end: | |
412 return lines | |
413 pc = nextpc | |
414 raise LineLogError("Failed to perform getalllines") |