comparison mercurial/linelog.py @ 38795:422d661056be

linelog: add a Python implementation of the linelog datastructure This datastructure was originally developed by Jun Wu at Facebook, inspired by SCCS weaves. It's useful as a cache for blame information, but also is the magic that makes `hg absorb` easy to implement. In service of importing the code to Mercurial, I wanted to actually /understand/ it, and once I did I decided to take a run at implementing it. The help/internals/linelog.txt document is the README from Jun Wu's implementaiton. It all applies to our linelog implementation. Differential Revision: https://phab.mercurial-scm.org/D3990
author Augie Fackler <augie@google.com>
date Mon, 30 Jul 2018 10:42:37 -0400
parents
children f956dc7217fc
comparison
equal deleted inserted replaced
38794:1d01cf0416a5 38795:422d661056be
1 # linelog - efficient cache for annotate data
2 #
3 # Copyright 2018 Google LLC.
4 #
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
7 """linelog is an efficient cache for annotate data inspired by SCCS Weaves.
8
9 SCCS Weaves are an implementation of
10 https://en.wikipedia.org/wiki/Interleaved_deltas. See
11 mercurial/help/internals/linelog.txt for an exploration of SCCS weaves
12 and how linelog works in detail.
13
14 Here's a hacker's summary: a linelog is a program which is executed in
15 the context of a revision. Executing the program emits information
16 about lines, including the revision that introduced them and the line
17 number in the file at the introducing revision. When an insertion or
18 deletion is performed on the file, a jump instruction is used to patch
19 in a new body of annotate information.
20 """
21 from __future__ import absolute_import, print_function
22
23 import abc
24 import struct
25
26 from mercurial import (
27 pycompat,
28 )
29 from .thirdparty import (
30 attr,
31 )
32
33 _llentry = struct.Struct('>II')
34
35 class LineLogError(Exception):
36 """Error raised when something bad happens internally in linelog."""
37
38 @attr.s
39 class lineinfo(object):
40 # Introducing revision of this line.
41 rev = attr.ib()
42 # Line number for this line in its introducing revision.
43 linenum = attr.ib()
44 # Private. Offset in the linelog program of this line. Used internally.
45 _offset = attr.ib()
46
47 @attr.s
48 class annotateresult(object):
49 rev = attr.ib()
50 lines = attr.ib()
51 _eof = attr.ib()
52
53 def __iter__(self):
54 return iter(self.lines)
55
56 class _llinstruction(object):
57
58 __metaclass__ = abc.ABCMeta
59
60 @abc.abstractmethod
61 def __init__(self, op1, op2):
62 pass
63
64 @abc.abstractmethod
65 def __str__(self):
66 pass
67
68 def __repr__(self):
69 return str(self)
70
71 @abc.abstractmethod
72 def __eq__(self, other):
73 pass
74
75 @abc.abstractmethod
76 def encode(self):
77 """Encode this instruction to the binary linelog format."""
78
79 @abc.abstractmethod
80 def execute(self, rev, pc, emit):
81 """Execute this instruction.
82
83 Args:
84 rev: The revision we're annotating.
85 pc: The current offset in the linelog program.
86 emit: A function that accepts a single lineinfo object.
87
88 Returns:
89 The new value of pc. Returns None if exeuction should stop
90 (that is, we've found the end of the file.)
91 """
92
93 class _jge(_llinstruction):
94 """If the current rev is greater than or equal to op1, jump to op2."""
95
96 def __init__(self, op1, op2):
97 self._cmprev = op1
98 self._target = op2
99
100 def __str__(self):
101 return 'JGE %d %d' % (self._cmprev, self._target)
102
103 def __eq__(self, other):
104 return (type(self) == type(other)
105 and self._cmprev == other._cmprev
106 and self._target == other._target)
107
108 def encode(self):
109 return _llentry.pack(self._cmprev << 2, self._target)
110
111 def execute(self, rev, pc, emit):
112 if rev >= self._cmprev:
113 return self._target
114 return pc + 1
115
116 class _jump(_llinstruction):
117 """Unconditional jumps are expressed as a JGE with op1 set to 0."""
118
119 def __init__(self, op1, op2):
120 if op1 != 0:
121 raise LineLogError("malformed JUMP, op1 must be 0, got %d" % op1)
122 self._target = op2
123
124 def __str__(self):
125 return 'JUMP %d' % (self._target)
126
127 def __eq__(self, other):
128 return (type(self) == type(other)
129 and self._target == other._target)
130
131 def encode(self):
132 return _llentry.pack(0, self._target)
133
134 def execute(self, rev, pc, emit):
135 return self._target
136
137 class _eof(_llinstruction):
138 """EOF is expressed as a JGE that always jumps to 0."""
139
140 def __init__(self, op1, op2):
141 if op1 != 0:
142 raise LineLogError("malformed EOF, op1 must be 0, got %d" % op1)
143 if op2 != 0:
144 raise LineLogError("malformed EOF, op2 must be 0, got %d" % op2)
145
146 def __str__(self):
147 return 'EOF'
148
149 def __eq__(self, other):
150 return type(self) == type(other)
151
152 def encode(self):
153 return _llentry.pack(0, 0)
154
155 def execute(self, rev, pc, emit):
156 return None
157
158 class _jl(_llinstruction):
159 """If the current rev is less than op1, jump to op2."""
160
161 def __init__(self, op1, op2):
162 self._cmprev = op1
163 self._target = op2
164
165 def __str__(self):
166 return 'JL %d %d' % (self._cmprev, self._target)
167
168 def __eq__(self, other):
169 return (type(self) == type(other)
170 and self._cmprev == other._cmprev
171 and self._target == other._target)
172
173 def encode(self):
174 return _llentry.pack(1 | (self._cmprev << 2), self._target)
175
176 def execute(self, rev, pc, emit):
177 if rev < self._cmprev:
178 return self._target
179 return pc + 1
180
181 class _line(_llinstruction):
182 """Emit a line."""
183
184 def __init__(self, op1, op2):
185 # This line was introduced by this revision number.
186 self._rev = op1
187 # This line had the specified line number in the introducing revision.
188 self._origlineno = op2
189
190 def __str__(self):
191 return 'LINE %d %d' % (self._rev, self._origlineno)
192
193 def __eq__(self, other):
194 return (type(self) == type(other)
195 and self._rev == other._rev
196 and self._origlineno == other._origlineno)
197
198 def encode(self):
199 return _llentry.pack(2 | (self._rev << 2), self._origlineno)
200
201 def execute(self, rev, pc, emit):
202 emit(lineinfo(self._rev, self._origlineno, pc))
203 return pc + 1
204
205 def _decodeone(data, offset):
206 """Decode a single linelog instruction from an offset in a buffer."""
207 try:
208 op1, op2 = _llentry.unpack_from(data, offset)
209 except struct.error as e:
210 raise LineLogError('reading an instruction failed: %r' % e)
211 opcode = op1 & 0b11
212 op1 = op1 >> 2
213 if opcode == 0:
214 if op1 == 0:
215 if op2 == 0:
216 return _eof(op1, op2)
217 return _jump(op1, op2)
218 return _jge(op1, op2)
219 elif opcode == 1:
220 return _jl(op1, op2)
221 elif opcode == 2:
222 return _line(op1, op2)
223 raise NotImplementedError('Unimplemented opcode %r' % opcode)
224
225 class linelog(object):
226 """Efficient cache for per-line history information."""
227
228 def __init__(self, program=None, maxrev=0):
229 if program is None:
230 # We pad the program with an extra leading EOF so that our
231 # offsets will match the C code exactly. This means we can
232 # interoperate with the C code.
233 program = [_eof(0, 0), _eof(0, 0)]
234 self._program = program
235 self._lastannotate = None
236 self._maxrev = maxrev
237
238 def __eq__(self, other):
239 return (type(self) == type(other)
240 and self._program == other._program
241 and self._maxrev == other._maxrev)
242
243 def __repr__(self):
244 return '<linelog at %s: maxrev=%d size=%d>' % (
245 hex(id(self)), self._maxrev, len(self._program))
246
247 def debugstr(self):
248 fmt = '%%%dd %%s' % len(str(len(self._program)))
249 return '\n'.join(
250 fmt % (idx, i) for idx, i in enumerate(self._program[1:], 1))
251
252 @classmethod
253 def fromdata(cls, buf):
254 if len(buf) % _llentry.size != 0:
255 raise LineLogError(
256 "invalid linelog buffer size %d (must be a multiple of %d)" % (
257 len(buf), _llentry.size))
258 expected = len(buf) / _llentry.size
259 fakejge = _decodeone(buf, 0)
260 if isinstance(fakejge, _jump):
261 maxrev = 0
262 else:
263 maxrev = fakejge._cmprev
264 numentries = fakejge._target
265 if expected != numentries:
266 raise LineLogError("corrupt linelog data: claimed"
267 " %d entries but given data for %d entries" % (
268 expected, numentries))
269 instructions = [_eof(0, 0)]
270 for offset in pycompat.xrange(1, numentries):
271 instructions.append(_decodeone(buf, offset * _llentry.size))
272 return cls(instructions, maxrev=maxrev)
273
274 def encode(self):
275 hdr = _jge(self._maxrev, len(self._program)).encode()
276 return hdr + ''.join(i.encode() for i in self._program[1:])
277
278 def clear(self):
279 self._program = []
280 self._maxrev = 0
281 self._lastannotate = None
282
283 def replacelines(self, rev, a1, a2, b1, b2):
284 """Replace lines [a1, a2) with lines [b1, b2)."""
285 if self._lastannotate:
286 # TODO(augie): make replacelines() accept a revision at
287 # which we're editing as well as a revision to mark
288 # responsible for the edits. In hg-experimental it's
289 # stateful like this, so we're doing the same thing to
290 # retain compatibility with absorb until that's imported.
291 ar = self._lastannotate
292 else:
293 ar = self.annotate(rev)
294 # ar = self.annotate(self._maxrev)
295 if a1 > len(ar.lines):
296 raise LineLogError(
297 '%d contains %d lines, tried to access line %d' % (
298 rev, len(ar.lines), a1))
299 elif a1 == len(ar.lines):
300 # Simulated EOF instruction since we're at EOF, which
301 # doesn't have a "real" line.
302 a1inst = _eof(0, 0)
303 a1info = lineinfo(0, 0, ar._eof)
304 else:
305 a1info = ar.lines[a1]
306 a1inst = self._program[a1info._offset]
307 oldproglen = len(self._program)
308 appendinst = self._program.append
309
310 # insert
311 if b1 < b2:
312 # Determine the jump target for the JGE at the start of
313 # the new block.
314 tgt = oldproglen + (b2 - b1 + 1)
315 # Jump to skip the insert if we're at an older revision.
316 appendinst(_jl(rev, tgt))
317 for linenum in pycompat.xrange(b1, b2):
318 appendinst(_line(rev, linenum))
319 # delete
320 if a1 < a2:
321 if a2 > len(ar.lines):
322 raise LineLogError(
323 '%d contains %d lines, tried to access line %d' % (
324 rev, len(ar.lines), a2))
325 elif a2 == len(ar.lines):
326 endaddr = ar._eof
327 else:
328 endaddr = ar.lines[a2]._offset
329 if a2 > 0 and rev < self._maxrev:
330 # If we're here, we're deleting a chunk of an old
331 # commit, so we need to be careful and not touch
332 # invisible lines between a2-1 and a2 (IOW, lines that
333 # are added later).
334 endaddr = ar.lines[a2 - 1]._offset + 1
335 appendinst(_jge(rev, endaddr))
336 # copy instruction from a1
337 appendinst(a1inst)
338 # if a1inst isn't a jump or EOF, then we need to add an unconditional
339 # jump back into the program here.
340 if not isinstance(a1inst, (_jump, _eof)):
341 appendinst(_jump(0, a1info._offset + 1))
342 # Patch instruction at a1, which makes our patch live.
343 self._program[a1info._offset] = _jump(0, oldproglen)
344 # For compat with the C version, re-annotate rev so that
345 # self.annotateresult is cromulent.. We could fix up the
346 # annotateresult in place (which is how the C version works),
347 # but for now we'll pass on that and see if it matters in
348 # practice.
349 self.annotate(max(self._lastannotate.rev, rev))
350 if rev > self._maxrev:
351 self._maxrev = rev
352
353 def annotate(self, rev):
354 pc = 1
355 lines = []
356 # Sanity check: if len(lines) is longer than len(program), we
357 # hit an infinite loop in the linelog program somehow and we
358 # should stop.
359 while pc is not None and len(lines) < len(self._program):
360 inst = self._program[pc]
361 lastpc = pc
362 pc = inst.execute(rev, pc, lines.append)
363 if pc is not None:
364 raise LineLogError(
365 'Probably hit an infinite loop in linelog. Program:\n' +
366 self.debugstr())
367 ar = annotateresult(rev, lines, lastpc)
368 self._lastannotate = ar
369 return ar
370
371 @property
372 def maxrev(self):
373 return self._maxrev
374
375 # Stateful methods which depend on the value of the last
376 # annotation run. This API is for compatiblity with the original
377 # linelog, and we should probably consider refactoring it.
378 @property
379 def annotateresult(self):
380 """Return the last annotation result. C linelog code exposed this."""
381 return [(l.rev, l.linenum) for l in self._lastannotate.lines]
382
383 def getoffset(self, line):
384 return self._lastannotate.lines[line]._offset
385
386 def getalllines(self, start=0, end=0):
387 """Get all lines that ever occurred in [start, end).
388
389 Passing start == end == 0 means "all lines ever".
390
391 This works in terms of *internal* program offsets, not line numbers.
392 """
393 pc = start or 1
394 lines = []
395 # only take as many steps as there are instructions in the
396 # program - if we don't find an EOF or our stop-line before
397 # then, something is badly broken.
398 for step in pycompat.xrange(len(self._program)):
399 inst = self._program[pc]
400 nextpc = pc + 1
401 if isinstance(inst, _jump):
402 nextpc = inst._target
403 elif isinstance(inst, _eof):
404 return lines
405 elif isinstance(inst, (_jl, _jge)):
406 pass
407 elif isinstance(inst, _line):
408 lines.append((inst._rev, inst._origlineno))
409 else:
410 raise LineLogError("Illegal instruction %r" % inst)
411 if nextpc == end:
412 return lines
413 pc = nextpc
414 raise LineLogError("Failed to perform getalllines")