# hgext/remotefilelog/historypack.py

from __future__ import absolute_import

import hashlib
import struct

from mercurial.node import hex, nullid
from mercurial import (
    pycompat,
    util,
)
from . import (
    basepack,
    constants,
    shallowutil,
)

# (filename hash, offset, size)
INDEXFORMAT0 = '!20sQQ'
INDEXENTRYLENGTH0 = struct.calcsize(INDEXFORMAT0)
INDEXFORMAT1 = '!20sQQII'
INDEXENTRYLENGTH1 = struct.calcsize(INDEXFORMAT1)
NODELENGTH = 20

NODEINDEXFORMAT = '!20sQ'
NODEINDEXENTRYLENGTH = struct.calcsize(NODEINDEXFORMAT)

# (node, p1, p2, linknode, copyfromlen)
PACKFORMAT = "!20s20s20s20sH"
PACKENTRYLENGTH = 82

ENTRYCOUNTSIZE = 4

INDEXSUFFIX = '.histidx'
PACKSUFFIX = '.histpack'

ANC_NODE = 0
ANC_P1NODE = 1
ANC_P2NODE = 2
ANC_LINKNODE = 3
ANC_COPYFROM = 4

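# Illustrative sketch (not part of the original module): how a single on-disk
# history entry is laid out with PACKFORMAT. The trailing unsigned short is
# the length of the optional copyfrom path, which is stored directly after the
# fixed 82-byte entry. The names 'node', 'p1node', 'p2node', 'linknode' and
# 'copyfrom' below are hypothetical values.
#
#   raw = struct.pack(PACKFORMAT, node, p1node, p2node, linknode,
#                     len(copyfrom)) + copyfrom
#   entry = struct.unpack(PACKFORMAT, raw[:PACKENTRYLENGTH])
#   copyfrom = raw[PACKENTRYLENGTH:PACKENTRYLENGTH + entry[ANC_COPYFROM]]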
       
class historypackstore(basepack.basepackstore):
    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    def getpack(self, path):
        return historypack(path)

    def getancestors(self, name, node, known=None):
        for pack in self.packs:
            try:
                return pack.getancestors(name, node, known=known)
            except KeyError:
                pass

        for pack in self.refresh():
            try:
                return pack.getancestors(name, node, known=known)
            except KeyError:
                pass

        raise KeyError((name, node))

    def getnodeinfo(self, name, node):
        for pack in self.packs:
            try:
                return pack.getnodeinfo(name, node)
            except KeyError:
                pass

        for pack in self.refresh():
            try:
                return pack.getnodeinfo(name, node)
            except KeyError:
                pass

        raise KeyError((name, node))

    def add(self, filename, node, p1, p2, linknode, copyfrom):
        raise RuntimeError("cannot add to historypackstore (%s:%s)"
                           % (filename, hex(node)))

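# Illustrative sketch (not part of the original module): querying a set of
# on-disk history packs through historypackstore, assuming the (ui, path)
# constructor inherited from basepack.basepackstore. 'ui', 'packdir',
# 'somefile' and 'somenode' are hypothetical. The store tries every loaded
# pack, refreshes once, and raises KeyError if no pack knows the node.
#
#   store = historypackstore(ui, packdir)
#   p1, p2, linknode, copyfrom = store.getnodeinfo('somefile', somenode)
#   ancestors = store.getancestors('somefile', somenode)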
       
class historypack(basepack.basepack):
    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    SUPPORTED_VERSIONS = [0, 1]

    def __init__(self, path):
        super(historypack, self).__init__(path)

        if self.VERSION == 0:
            self.INDEXFORMAT = INDEXFORMAT0
            self.INDEXENTRYLENGTH = INDEXENTRYLENGTH0
        else:
            self.INDEXFORMAT = INDEXFORMAT1
            self.INDEXENTRYLENGTH = INDEXENTRYLENGTH1

    def getmissing(self, keys):
        missing = []
        for name, node in keys:
            try:
                self._findnode(name, node)
            except KeyError:
                missing.append((name, node))

        return missing

    def getancestors(self, name, node, known=None):
        """Returns as many ancestors as we're aware of.

        return value: {
           node: (p1, p2, linknode, copyfrom),
           ...
        }
        """
        if known and node in known:
            return []

        ancestors = self._getancestors(name, node, known=known)
        results = {}
        for ancnode, p1, p2, linknode, copyfrom in ancestors:
            results[ancnode] = (p1, p2, linknode, copyfrom)

        if not results:
            raise KeyError((name, node))
        return results

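    # Illustrative sketch (not part of the original module): typical use of
    # getancestors(). 'pack', 'somefile' and 'somenode' are hypothetical; the
    # result maps each known ancestor node to its history tuple.
    #
    #   ancestors = pack.getancestors('somefile', somenode)
    #   for ancnode, (p1, p2, linknode, copyfrom) in ancestors.iteritems():
    #       pass  # walk as far back as this pack knows about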
       
    def getnodeinfo(self, name, node):
        # Drop the node from the tuple before returning, since the result should
        # just be (p1, p2, linknode, copyfrom)
        return self._findnode(name, node)[1:]

    def _getancestors(self, name, node, known=None):
        if known is None:
            known = set()
        section = self._findsection(name)
        filename, offset, size, nodeindexoffset, nodeindexsize = section
        pending = set((node,))
        o = 0
        while o < size:
            if not pending:
                break
            entry, copyfrom = self._readentry(offset + o)
            o += PACKENTRYLENGTH
            if copyfrom:
                o += len(copyfrom)

            ancnode = entry[ANC_NODE]
            if ancnode in pending:
                pending.remove(ancnode)
                p1node = entry[ANC_P1NODE]
                p2node = entry[ANC_P2NODE]
                if p1node != nullid and p1node not in known:
                    pending.add(p1node)
                if p2node != nullid and p2node not in known:
                    pending.add(p2node)

                yield (ancnode, p1node, p2node, entry[ANC_LINKNODE], copyfrom)

    def _readentry(self, offset):
        data = self._data
        entry = struct.unpack(PACKFORMAT, data[offset:offset + PACKENTRYLENGTH])
        copyfrom = None
        copyfromlen = entry[ANC_COPYFROM]
        if copyfromlen != 0:
            offset += PACKENTRYLENGTH
            copyfrom = data[offset:offset + copyfromlen]
        return entry, copyfrom

    def add(self, filename, node, p1, p2, linknode, copyfrom):
        raise RuntimeError("cannot add to historypack (%s:%s)" %
                           (filename, hex(node)))

    def _findnode(self, name, node):
        if self.VERSION == 0:
            ancestors = self._getancestors(name, node)
            for ancnode, p1node, p2node, linknode, copyfrom in ancestors:
                if ancnode == node:
                    return (ancnode, p1node, p2node, linknode, copyfrom)
        else:
            section = self._findsection(name)
            nodeindexoffset, nodeindexsize = section[3:]
            entry = self._bisect(node, nodeindexoffset,
                                 nodeindexoffset + nodeindexsize,
                                 NODEINDEXENTRYLENGTH)
            if entry is not None:
                node, offset = struct.unpack(NODEINDEXFORMAT, entry)
                entry, copyfrom = self._readentry(offset)
                # Drop the copyfromlen from the end of entry, and replace it
                # with the copyfrom string.
                return entry[:4] + (copyfrom,)

        raise KeyError("unable to find history for %s:%s" % (name, hex(node)))

    def _findsection(self, name):
        params = self.params
        namehash = hashlib.sha1(name).digest()
        fanoutkey = struct.unpack(params.fanoutstruct,
                                  namehash[:params.fanoutprefix])[0]
        fanout = self._fanouttable

        start = fanout[fanoutkey] + params.indexstart
        indexend = self._indexend

        for i in pycompat.xrange(fanoutkey + 1, params.fanoutcount):
            end = fanout[i] + params.indexstart
            if end != start:
                break
        else:
            end = indexend

        entry = self._bisect(namehash, start, end, self.INDEXENTRYLENGTH)
        if not entry:
            raise KeyError(name)

        rawentry = struct.unpack(self.INDEXFORMAT, entry)
        if self.VERSION == 0:
            x, offset, size = rawentry
            nodeindexoffset = None
            nodeindexsize = None
        else:
            x, offset, size, nodeindexoffset, nodeindexsize = rawentry
            rawnamelen = self._index[nodeindexoffset:nodeindexoffset +
                                                     constants.FILENAMESIZE]
            actualnamelen = struct.unpack('!H', rawnamelen)[0]
            nodeindexoffset += constants.FILENAMESIZE
            actualname = self._index[nodeindexoffset:nodeindexoffset +
                                                     actualnamelen]
            if actualname != name:
                raise KeyError("found file name %s when looking for %s" %
                               (actualname, name))
            nodeindexoffset += actualnamelen

        filenamelength = struct.unpack('!H', self._data[offset:offset +
                                                    constants.FILENAMESIZE])[0]
        offset += constants.FILENAMESIZE

        actualname = self._data[offset:offset + filenamelength]
        offset += filenamelength

        if name != actualname:
            raise KeyError("found file name %s when looking for %s" %
                           (actualname, name))

        # Skip entry list size
        offset += ENTRYCOUNTSIZE

        nodelistoffset = offset
        nodelistsize = (size - constants.FILENAMESIZE - filenamelength -
                        ENTRYCOUNTSIZE)
        return (name, nodelistoffset, nodelistsize,
                nodeindexoffset, nodeindexsize)

    def _bisect(self, node, start, end, entrylen):
        # Bisect between start and end to find node
        origstart = start
        startnode = self._index[start:start + NODELENGTH]
        endnode = self._index[end:end + NODELENGTH]

        if startnode == node:
            return self._index[start:start + entrylen]
        elif endnode == node:
            return self._index[end:end + entrylen]
        else:
            while start < end - entrylen:
                mid = start + (end - start) // 2
                # Snap the midpoint back onto an entry boundary.
                mid = mid - ((mid - origstart) % entrylen)
                midnode = self._index[mid:mid + NODELENGTH]
                if midnode == node:
                    return self._index[mid:mid + entrylen]
                if node > midnode:
                    start = mid
                    startnode = midnode
                elif node < midnode:
                    end = mid
                    endnode = midnode
        return None

    def markledger(self, ledger, options=None):
        for filename, node in self:
            ledger.markhistoryentry(self, filename, node)

    def cleanup(self, ledger):
        entries = ledger.sources.get(self, [])
        allkeys = set(self)
        repackedkeys = set((e.filename, e.node) for e in entries if
                           e.historyrepacked)

        if len(allkeys - repackedkeys) == 0:
            if self.path not in ledger.created:
                util.unlinkpath(self.indexpath, ignoremissing=True)
                util.unlinkpath(self.packpath, ignoremissing=True)

    def __iter__(self):
        for f, n, x, x, x, x in self.iterentries():
            yield f, n

    def iterentries(self):
        # Start at 1 to skip the header
        offset = 1
        while offset < self.datasize:
            data = self._data
            # <2 byte len> + <filename>
            filenamelen = struct.unpack('!H', data[offset:offset +
                                                   constants.FILENAMESIZE])[0]
            offset += constants.FILENAMESIZE
            filename = data[offset:offset + filenamelen]
            offset += filenamelen

            revcount = struct.unpack('!I', data[offset:offset +
                                                ENTRYCOUNTSIZE])[0]
            offset += ENTRYCOUNTSIZE

            for i in pycompat.xrange(revcount):
                entry = struct.unpack(PACKFORMAT, data[offset:offset +
                                                       PACKENTRYLENGTH])
                offset += PACKENTRYLENGTH

                copyfrom = data[offset:offset + entry[ANC_COPYFROM]]
                offset += entry[ANC_COPYFROM]

                yield (filename, entry[ANC_NODE], entry[ANC_P1NODE],
                       entry[ANC_P2NODE], entry[ANC_LINKNODE], copyfrom)

                self._pagedin += PACKENTRYLENGTH

            # If we've read a lot of data from the mmap, free some memory.
            self.freememory()

class mutablehistorypack(basepack.mutablebasepack):
    """A class for constructing and serializing a histpack file and index.

    A history pack is a pair of files that contain the revision history for
    various file revisions in Mercurial. It contains only revision history
    (like parent pointers and linknodes), not any revision content information.

    It consists of two files, with the following format:

    .histpack
        The pack itself is a series of file revisions with some basic header
        information on each.

        histpack = <version: 1 byte>
                   [<filesection>,...]
        filesection = <filename len: 2 byte unsigned int>
                      <filename>
                      <revision count: 4 byte unsigned int>
                      [<revision>,...]
        revision = <node: 20 byte>
                   <p1node: 20 byte>
                   <p2node: 20 byte>
                   <linknode: 20 byte>
                   <copyfromlen: 2 byte>
                   <copyfrom>

        The revisions within each filesection are stored in topological order
        (newest first). If a given entry has a parent from another file (a
        copy), then p1node is the node from the other file, and copyfrom is
        the filepath of the other file.

    .histidx
        The index file provides a mapping from filename to the file section in
        the histpack. In V1 it also contains sub-indexes for specific nodes
        within each file. It consists of three parts: the fanout, the file
        index and the node indexes.

        The file index is a list of index entries, sorted by filename hash (one
        per file section in the pack). Each entry has:

        - node (The 20 byte hash of the filename)
        - pack entry offset (The location of this file section in the histpack)
        - pack content size (The on-disk length of this file section's pack
                             data)
        - node index offset (The location of the file's node index in the index
                             file) [1]
        - node index size (the on-disk length of this file's node index) [1]

        The fanout is a quick lookup table to reduce the number of steps for
        bisecting the index. It is a series of 4 byte pointers to positions
        within the index. It has 2^16 entries, which correspond to the 2 byte
        hash prefixes [0000, 0001, 0002,..., FFFE, FFFF]. Example: the pointer
        in slot 4F00 points to the index position of the first revision whose
        node starts with 4F00. This saves log(2^16) bisect steps.

        dataidx = <fanouttable>
                  <file count: 8 byte unsigned> [1]
                  <fileindex>
                  <node count: 8 byte unsigned> [1]
                  [<nodeindex>,...] [1]
        fanouttable = [<index offset: 4 byte unsigned int>,...] (2^16 entries)

        fileindex = [<file index entry>,...]
        fileindexentry = <node: 20 byte>
                         <pack file section offset: 8 byte unsigned int>
                         <pack file section size: 8 byte unsigned int>
                         <node index offset: 4 byte unsigned int> [1]
                         <node index size: 4 byte unsigned int>   [1]
        nodeindex = <filename>[<node index entry>,...] [1]
        filename = <filename len: 2 byte unsigned int><filename value> [1]
        nodeindexentry = <node: 20 byte> [1]
                         <pack file node offset: 8 byte unsigned int> [1]

    [1]: new in version 1.
    """
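    # Illustrative sketch (not part of the original module): decoding the
    # header of one file section of a histpack, following the grammar above.
    # 'data' and 'offset' are hypothetical; 'revcount' fixed-size revision
    # entries (each followed by its copyfrom string) come next, newest first.
    #
    #   namelen = struct.unpack('!H', data[offset:offset + 2])[0]
    #   filename = data[offset + 2:offset + 2 + namelen]
    #   revcount = struct.unpack(
    #       '!I', data[offset + 2 + namelen:offset + 2 + namelen + 4])[0]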
       
    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    SUPPORTED_VERSIONS = [0, 1]

    def __init__(self, ui, packpath, version=0):
        # internal config: remotefilelog.historypackv1
        if version == 0 and ui.configbool('remotefilelog', 'historypackv1'):
            version = 1

        super(mutablehistorypack, self).__init__(ui, packpath, version=version)
        self.files = {}
        self.entrylocations = {}
        self.fileentries = {}

        if version == 0:
            self.INDEXFORMAT = INDEXFORMAT0
            self.INDEXENTRYLENGTH = INDEXENTRYLENGTH0
        else:
            self.INDEXFORMAT = INDEXFORMAT1
            self.INDEXENTRYLENGTH = INDEXENTRYLENGTH1

        self.NODEINDEXFORMAT = NODEINDEXFORMAT
        self.NODEINDEXENTRYLENGTH = NODEINDEXENTRYLENGTH

    def add(self, filename, node, p1, p2, linknode, copyfrom):
        copyfrom = copyfrom or ''
        copyfromlen = struct.pack('!H', len(copyfrom))
        self.fileentries.setdefault(filename, []).append((node, p1, p2,
                                                          linknode,
                                                          copyfromlen,
                                                          copyfrom))

    def _write(self):
        for filename in sorted(self.fileentries):
            entries = self.fileentries[filename]
            sectionstart = self.packfp.tell()

            # Write the file section content
            entrymap = dict((e[0], e) for e in entries)
            def parentfunc(node):
                x, p1, p2, x, x, x = entrymap[node]
                parents = []
                if p1 != nullid:
                    parents.append(p1)
                if p2 != nullid:
                    parents.append(p2)
                return parents

            sortednodes = list(reversed(shallowutil.sortnodes(
                (e[0] for e in entries),
                parentfunc)))

            # Write the file section header
            self.writeraw("%s%s%s" % (
                struct.pack('!H', len(filename)),
                filename,
                struct.pack('!I', len(sortednodes)),
            ))

            sectionlen = constants.FILENAMESIZE + len(filename) + 4

            rawstrings = []

            # Record the node locations for the index
            locations = self.entrylocations.setdefault(filename, {})
            offset = sectionstart + sectionlen
            for node in sortednodes:
                locations[node] = offset
                raw = '%s%s%s%s%s%s' % entrymap[node]
                rawstrings.append(raw)
                offset += len(raw)

            rawdata = ''.join(rawstrings)
            sectionlen += len(rawdata)

            self.writeraw(rawdata)

            # Record metadata for the index
            self.files[filename] = (sectionstart, sectionlen)
            node = hashlib.sha1(filename).digest()
            self.entries[node] = node

    def close(self, ledger=None):
        if self._closed:
            return

        self._write()

        return super(mutablehistorypack, self).close(ledger=ledger)

    def createindex(self, nodelocations, indexoffset):
        fileindexformat = self.INDEXFORMAT
        fileindexlength = self.INDEXENTRYLENGTH
        nodeindexformat = self.NODEINDEXFORMAT
        nodeindexlength = self.NODEINDEXENTRYLENGTH
        version = self.VERSION

        files = ((hashlib.sha1(filename).digest(), filename, offset, size)
                for filename, (offset, size) in self.files.iteritems())
        files = sorted(files)

        # node index is after file index size, file index, and node index size
        indexlensize = struct.calcsize('!Q')
        nodeindexoffset = (indexoffset + indexlensize +
                           (len(files) * fileindexlength) + indexlensize)

        fileindexentries = []
        nodeindexentries = []
        nodecount = 0
        for namehash, filename, offset, size in files:
            # File section index
            if version == 0:
                rawentry = struct.pack(fileindexformat, namehash, offset, size)
            else:
                nodelocations = self.entrylocations[filename]

                nodeindexsize = len(nodelocations) * nodeindexlength

                rawentry = struct.pack(fileindexformat, namehash, offset, size,
                                       nodeindexoffset, nodeindexsize)
                # Node index
                nodeindexentries.append(struct.pack(constants.FILENAMESTRUCT,
                                                    len(filename)) + filename)
                nodeindexoffset += constants.FILENAMESIZE + len(filename)

                for node, location in sorted(nodelocations.iteritems()):
                    nodeindexentries.append(struct.pack(nodeindexformat, node,
                                                        location))
                    nodecount += 1

                nodeindexoffset += len(nodelocations) * nodeindexlength

            fileindexentries.append(rawentry)

        nodecountraw = ''
        if version == 1:
            nodecountraw = struct.pack('!Q', nodecount)
        return (''.join(fileindexentries) + nodecountraw +
                ''.join(nodeindexentries))
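
# Illustrative sketch (not part of the original module): building a histpack
# with mutablehistorypack. 'ui', 'packdir' and the node values are
# hypothetical; close() serializes the .histpack/.histidx pair via the base
# class once all entries have been added.
#
#   packer = mutablehistorypack(ui, packdir, version=1)
#   packer.add('foo.txt', fnode, p1node, p2node, linknode, None)
#   packer.close()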