Mercurial > hg
changeset 47389:e6292eb33384
revlog: store sidedata in their own file
This makes sidedata manipulation simpler and results in more compact data when
traversing either data or sidedata.
Differential Revision: https://phab.mercurial-scm.org/D10787
author | Pierre-Yves David <pierre-yves.david@octobus.net> |
---|---|
date | Fri, 28 May 2021 23:41:17 +0200 |
parents | bcf92bdc2bca |
children | 65b86f516ba2 |
files | mercurial/configitems.py mercurial/revlog.py mercurial/revlogutils/docket.py mercurial/store.py tests/test-revlog-v2.t |
diffstat | 5 files changed, 140 insertions(+), 31 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/configitems.py Fri May 28 23:41:12 2021 +0200 +++ b/mercurial/configitems.py Fri May 28 23:41:17 2021 +0200 @@ -1162,13 +1162,13 @@ # rewriting sidedata. # * introduce a proper solution to reduce the number of filelog related files. # * use caching for reading sidedata (similar to what we do for data). +# * no longer set offset=0 if sidedata_size=0 (simplify cutoff computation). # * Improvement to consider # - avoid compression header in chunk using the default compression? # - forbid "inline" compression mode entirely? # - split the data offset and flag field (the 2 bytes save are mostly trouble) # - keep track of uncompressed -chunk- size (to preallocate memory better) # - keep track of chain base or size (probably not that useful anymore) -# - store data and sidedata in different files coreconfigitem( b'experimental', b'revlogv2',
--- a/mercurial/revlog.py Fri May 28 23:41:12 2021 +0200 +++ b/mercurial/revlog.py Fri May 28 23:41:17 2021 +0200 @@ -1,4 +1,5 @@ # revlog.py - storage back-end for mercurial +# coding: utf8 # # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com> # @@ -260,6 +261,11 @@ b'partial read of revlog %s; expected %d bytes from offset %d, got %d' ) +FILE_TOO_SHORT_MSG = _( + b'cannot read from revlog %s;' + b' expected %d bytes from offset %d, data size is %d' +) + class revlog(object): """ @@ -401,6 +407,7 @@ self._docket_file = None self._indexfile = None self._datafile = None + self._sidedatafile = None self._nodemap_file = None self.postfix = postfix self._trypending = trypending @@ -445,7 +452,7 @@ # custom flags. self._flagprocessors = dict(flagutil.flagprocessors) - # 2-tuple of file handles being used for active writing. + # 3-tuple of file handles being used for active writing. self._writinghandles = None # prevent nesting of addgroup self._adding_group = None @@ -634,6 +641,7 @@ if self._docket is not None: self._datafile = self._docket.data_filepath() + self._sidedatafile = self._docket.sidedata_filepath() elif self.postfix is None: self._datafile = b'%s.d' % self.radix else: @@ -803,9 +811,14 @@ with func() as fp: yield fp + @contextlib.contextmanager def _sidedatareadfp(self): """file object suitable to read sidedata""" - return self._datareadfp() + if self._writinghandles: + yield self._writinghandles[2] + else: + with self.opener(self._sidedatafile) as fp: + yield fp def tiprev(self): return len(self.index) - 1 @@ -909,6 +922,23 @@ def start(self, rev): return int(self.index[rev][0] >> 16) + def sidedata_cut_off(self, rev): + sd_cut_off = self.index[rev][8] + if sd_cut_off != 0: + return sd_cut_off + # This is some annoying dance, because entries without sidedata + # currently use 0 as their ofsset. (instead of previous-offset + + # previous-size) + # + # We should reconsider this sidedata → 0 sidata_offset policy. + # In the meantime, we need this. + while 0 <= rev: + e = self.index[rev] + if e[9] != 0: + return e[8] + e[9] + rev -= 1 + return 0 + def flags(self, rev): return self.index[rev][0] & 0xFFFF @@ -2074,11 +2104,19 @@ # XXX this need caching, as we do for data with self._sidedatareadfp() as sdf: - sdf.seek(sidedata_offset) + if self._docket.sidedata_end < sidedata_offset + sidedata_size: + filename = self._sidedatafile + end = self._docket.sidedata_end + offset = sidedata_offset + length = sidedata_size + m = FILE_TOO_SHORT_MSG % (filename, length, offset, end) + raise error.RevlogError(m) + + sdf.seek(sidedata_offset, os.SEEK_SET) comp_segment = sdf.read(sidedata_size) if len(comp_segment) < sidedata_size: - filename = self._datafile + filename = self._sidedatafile length = sidedata_size offset = sidedata_offset got = len(comp_segment) @@ -2215,7 +2253,7 @@ if existing_handles: # switched from inline to conventional reopen the index ifh = self.__index_write_fp() - self._writinghandles = (ifh, new_dfh) + self._writinghandles = (ifh, new_dfh, None) new_dfh = None finally: if new_dfh is not None: @@ -2233,7 +2271,7 @@ if self._writinghandles is not None: yield else: - ifh = dfh = None + ifh = dfh = sdfh = None try: r = len(self) # opening the data file. @@ -2253,6 +2291,17 @@ raise dfh = self._datafp(b"w+") transaction.add(self._datafile, dsize) + if self._sidedatafile is not None: + try: + sdfh = self.opener(self._sidedatafile, mode=b"r+") + dfh.seek(self._docket.sidedata_end, os.SEEK_SET) + except IOError as inst: + if inst.errno != errno.ENOENT: + raise + sdfh = self.opener(self._sidedatafile, mode=b"w+") + transaction.add( + self._sidedatafile, self._docket.sidedata_end + ) # opening the index file. isize = r * self.index.entry_size @@ -2262,7 +2311,7 @@ else: transaction.add(self._indexfile, isize) # exposing all file handle for writing. - self._writinghandles = (ifh, dfh) + self._writinghandles = (ifh, dfh, sdfh) yield if self._docket is not None: self._write_docket(transaction) @@ -2270,6 +2319,8 @@ self._writinghandles = None if dfh is not None: dfh.close() + if sdfh is not None: + dfh.close() # closing the index file last to avoid exposing referent to # potential unflushed data content. if ifh is not None: @@ -2513,7 +2564,8 @@ offset = self._get_data_offset(prev) if self._concurrencychecker: - ifh, dfh = self._writinghandles + ifh, dfh, sdfh = self._writinghandles + # XXX no checking for the sidedata file if self._inline: # offset is "as if" it were in the .d file, so we need to add on # the size of the entry metadata. @@ -2570,7 +2622,7 @@ if sidedata and self.hassidedata: sidedata_compression_mode = COMP_MODE_PLAIN serialized_sidedata = sidedatautil.serialize_sidedata(sidedata) - sidedata_offset = offset + deltainfo.deltalen + sidedata_offset = self._docket.sidedata_end h, comp_sidedata = self.compress(serialized_sidedata) if ( h != b'u' @@ -2622,6 +2674,7 @@ link, offset, serialized_sidedata, + sidedata_offset, ) rawtext = btext[0] @@ -2648,7 +2701,9 @@ else: return self._docket.data_end - def _writeentry(self, transaction, entry, data, link, offset, sidedata): + def _writeentry( + self, transaction, entry, data, link, offset, sidedata, sidedata_offset + ): # Files opened in a+ mode have inconsistent behavior on various # platforms. Windows requires that a file positioning call be made # when the file handle transitions between reads and writes. See @@ -2664,7 +2719,7 @@ if self._writinghandles is None: msg = b'adding revision outside `revlog._writing` context' raise error.ProgrammingError(msg) - ifh, dfh = self._writinghandles + ifh, dfh, sdfh = self._writinghandles if self._docket is None: ifh.seek(0, os.SEEK_END) else: @@ -2674,16 +2729,20 @@ dfh.seek(0, os.SEEK_END) else: dfh.seek(self._docket.data_end, os.SEEK_SET) + if sdfh: + sdfh.seek(self._docket.sidedata_end, os.SEEK_SET) curr = len(self) - 1 if not self._inline: transaction.add(self._datafile, offset) + if self._sidedatafile: + transaction.add(self._sidedatafile, sidedata_offset) transaction.add(self._indexfile, curr * len(entry)) if data[0]: dfh.write(data[0]) dfh.write(data[1]) if sidedata: - dfh.write(sidedata) + sdfh.write(sidedata) ifh.write(entry) else: offset += curr * self.index.entry_size @@ -2691,12 +2750,12 @@ ifh.write(entry) ifh.write(data[0]) ifh.write(data[1]) - if sidedata: - ifh.write(sidedata) + assert not sidedata self._enforceinlinesize(transaction) if self._docket is not None: self._docket.index_end = self._writinghandles[0].tell() self._docket.data_end = self._writinghandles[1].tell() + self._docket.sidedata_end = self._writinghandles[2].tell() nodemaputil.setup_persistent_nodemap(transaction, self) @@ -2866,12 +2925,17 @@ else: end = data_end + (rev * self.index.entry_size) + if self._sidedatafile: + sidedata_end = self.sidedata_cut_off(rev) + transaction.add(self._sidedatafile, sidedata_end) + transaction.add(self._indexfile, end) if self._docket is not None: # XXX we could, leverage the docket while stripping. However it is # not powerfull enough at the time of this comment self._docket.index_end = end self._docket.data_end = data_end + self._docket.sidedata_end = sidedata_end self._docket.write(transaction, stripping=True) # then reset internal state in memory to forget those revisions @@ -3398,13 +3462,10 @@ new_entries = [] # append the new sidedata with self._writing(transaction): - ifh, dfh = self._writinghandles - if self._docket is not None: - dfh.seek(self._docket.data_end, os.SEEK_SET) - else: - dfh.seek(0, os.SEEK_END) - - current_offset = dfh.tell() + ifh, dfh, sdfh = self._writinghandles + dfh.seek(self._docket.sidedata_end, os.SEEK_SET) + + current_offset = sdfh.tell() for rev in range(startrev, endrev + 1): entry = self.index[rev] new_sidedata, flags = sidedatautil.run_sidedata_helpers( @@ -3455,12 +3516,11 @@ ) # the sidedata computation might have move the file cursors around - dfh.seek(current_offset, os.SEEK_SET) - dfh.write(serialized_sidedata) + sdfh.seek(current_offset, os.SEEK_SET) + sdfh.write(serialized_sidedata) new_entries.append(entry_update) current_offset += len(serialized_sidedata) - if self._docket is not None: - self._docket.data_end = dfh.tell() + self._docket.sidedata_end = sdfh.tell() # rewrite the new index entries ifh.seek(startrev * self.index.entry_size)
--- a/mercurial/revlogutils/docket.py Fri May 28 23:41:12 2021 +0200 +++ b/mercurial/revlogutils/docket.py Fri May 28 23:41:17 2021 +0200 @@ -90,12 +90,15 @@ # | revlog index header. # * 1 bytes: size of index uuid # * 1 bytes: size of data uuid +# * 1 bytes: size of sizedata uuid # * 8 bytes: size of index-data # * 8 bytes: pending size of index-data # * 8 bytes: size of data +# * 8 bytes: size of sidedata # * 8 bytes: pending size of data +# * 8 bytes: pending size of sidedata # * 1 bytes: default compression header -S_HEADER = struct.Struct(constants.INDEX_HEADER_FMT + b'BBLLLLc') +S_HEADER = struct.Struct(constants.INDEX_HEADER_FMT + b'BBBLLLLLLc') class RevlogDocket(object): @@ -108,10 +111,13 @@ version_header=None, index_uuid=None, data_uuid=None, + sidedata_uuid=None, index_end=0, pending_index_end=0, data_end=0, pending_data_end=0, + sidedata_end=0, + pending_sidedata_end=0, default_compression_header=None, ): self._version_header = version_header @@ -122,19 +128,25 @@ self._opener = revlog.opener self._index_uuid = index_uuid self._data_uuid = data_uuid + self._sidedata_uuid = sidedata_uuid # thes asserts should be True as long as we have a single index filename assert index_end <= pending_index_end assert data_end <= pending_data_end + assert sidedata_end <= pending_sidedata_end self._initial_index_end = index_end self._pending_index_end = pending_index_end self._initial_data_end = data_end self._pending_data_end = pending_data_end + self._initial_sidedata_end = sidedata_end + self._pending_sidedata_end = pending_sidedata_end if use_pending: self._index_end = self._pending_index_end self._data_end = self._pending_data_end + self._sidedata_end = self._pending_sidedata_end else: self._index_end = self._initial_index_end self._data_end = self._initial_data_end + self._sidedata_end = self._initial_sidedata_end self.default_compression_header = default_compression_header def index_filepath(self): @@ -151,6 +163,13 @@ self._data_uuid = make_uid() return b"%s-%s.dat" % (self._radix, self._data_uuid) + def sidedata_filepath(self): + """file path to the current sidedata file associated to this docket""" + # very simplistic version at first + if self._sidedata_uuid is None: + self._sidedata_uuid = make_uid() + return b"%s-%s.sda" % (self._radix, self._sidedata_uuid) + @property def index_end(self): return self._index_end @@ -171,6 +190,16 @@ self._data_end = new_size self._dirty = True + @property + def sidedata_end(self): + return self._sidedata_end + + @sidedata_end.setter + def sidedata_end(self, new_size): + if new_size != self._sidedata_end: + self._sidedata_end = new_size + self._dirty = True + def write(self, transaction, pending=False, stripping=False): """write the modification of disk if any @@ -196,26 +225,33 @@ if pending: official_index_end = self._initial_index_end official_data_end = self._initial_data_end + official_sidedata_end = self._initial_sidedata_end else: official_index_end = self._index_end official_data_end = self._data_end + official_sidedata_end = self._sidedata_end # this assert should be True as long as we have a single index filename assert official_data_end <= self._data_end + assert official_sidedata_end <= self._sidedata_end data = ( self._version_header, len(self._index_uuid), len(self._data_uuid), + len(self._sidedata_uuid), official_index_end, self._index_end, official_data_end, self._data_end, + official_sidedata_end, + self._sidedata_end, self.default_compression_header, ) s = [] s.append(S_HEADER.pack(*data)) s.append(self._index_uuid) s.append(self._data_uuid) + s.append(self._sidedata_uuid) return b''.join(s) @@ -262,6 +298,9 @@ data_uuid_size = next(iheader) data_uuid = get_data(data_uuid_size) + sidedata_uuid_size = next(iheader) + sidedata_uuid = get_data(sidedata_uuid_size) + index_size = next(iheader) pending_index_size = next(iheader) @@ -270,6 +309,10 @@ pending_data_size = next(iheader) + sidedata_size = next(iheader) + + pending_sidedata_size = next(iheader) + default_compression_header = next(iheader) docket = RevlogDocket( @@ -278,10 +321,13 @@ version_header=version_header, index_uuid=index_uuid, data_uuid=data_uuid, + sidedata_uuid=sidedata_uuid, index_end=index_size, pending_index_end=pending_index_size, data_end=data_size, pending_data_end=pending_data_size, + sidedata_end=sidedata_size, + pending_sidedata_end=pending_sidedata_size, default_compression_header=default_compression_header, ) return docket
--- a/mercurial/store.py Fri May 28 23:41:12 2021 +0200 +++ b/mercurial/store.py Fri May 28 23:41:17 2021 +0200 @@ -395,6 +395,7 @@ b'.dat', b'.n', b'.nd', + b'.sda', b'd.tmpcensored', ) # files that are "volatile" and might change between listing and streaming
--- a/tests/test-revlog-v2.t Fri May 28 23:41:12 2021 +0200 +++ b/tests/test-revlog-v2.t Fri May 28 23:41:17 2021 +0200 @@ -86,9 +86,11 @@ - a data file $ ls .hg/store/00changelog* .hg/store/00manifest* - .hg/store/00changelog-6b8ab34b.dat - .hg/store/00changelog-88698448.idx + .hg/store/00changelog-1335303a.sda + .hg/store/00changelog-6b8ab34b.idx + .hg/store/00changelog-b875dfc5.dat .hg/store/00changelog.i - .hg/store/00manifest-1335303a.dat - .hg/store/00manifest-b875dfc5.idx + .hg/store/00manifest-05a21d65.idx + .hg/store/00manifest-43c37dde.dat + .hg/store/00manifest-e2c9362a.sda .hg/store/00manifest.i