store: don't read the whole fncache in memory
authorPulkit Goyal <pulkit@yandex-team.ru>
Thu, 22 Nov 2018 15:14:24 +0300
changeset 41978 a56487081109
parent 41976 51685c6dcca3
child 41979 0d467e4de4ae
store: don't read the whole fncache in memory

In large repositories with lots of files, the fncache can grow beyond 100 MB, and reading the whole thing into memory slows things down. Let's not read the whole thing into memory.

This patch changes the fncache loading code to read 1 MB at a time. Loading 1 MB at a time saves ~1 second on perffncacheload for our internal repository. I tried various values such as 0.5 MB, 5 MB and 10 MB, but the best results were produced with 1 MB as the chunk size. On a narrow clone with an fncache of around 40 MB, this patch saves ~0.04 seconds on average on perffncacheload.

To test the code, I added an extension in test-fncache.t which sets the chunk size to 1 byte, and the test passes with that.

Differential Revision: https://phab.mercurial-scm.org/D5296
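
For illustration only (not part of the changeset): a minimal, standalone sketch of the chunked line-reading pattern the patch introduces, assuming a binary file-like object. The names CHUNKSIZE and read_entries are invented for this sketch, and the decodedir() call is omitted for brevity; like the fncache format itself, the sketch assumes every entry is terminated by '\n', so a final unterminated entry would be dropped.

import functools
import io

CHUNKSIZE = 10 ** 6  # 1 MB, the value the patch settles on

def read_entries(fp):
    """Collect newline-terminated entries without reading fp all at once."""
    entries = set()
    chunk = b''
    # iter(callable, sentinel): keep calling fp.read(CHUNKSIZE) until it
    # returns b'' at EOF
    for c in iter(functools.partial(fp.read, CHUNKSIZE), b''):
        chunk += c
        try:
            # split off every complete line buffered so far ...
            p = chunk.rindex(b'\n')
            entries.update(chunk[:p + 1].splitlines())
            # ... and carry the trailing partial line over to the next chunk
            chunk = chunk[p + 1:]
        except ValueError:
            # no newline in the buffer yet: the entry spans chunk
            # boundaries, so keep reading
            pass
    return entries

# An entry longer than CHUNKSIZE still comes out in one piece:
fp = io.BytesIO(b'short\n' + b'x' * (2 * CHUNKSIZE) + b'\nlast\n')
assert read_entries(fp) == {b'short', b'x' * (2 * CHUNKSIZE), b'last'}

Setting CHUNKSIZE to 1 exercises the carry-over path on every byte, which is exactly what the test extension below does by overriding store.fncache_chunksize.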
mercurial/store.py
tests/test-fncache.t
--- a/mercurial/store.py	Fri Mar 08 10:20:33 2019 -0800
+++ b/mercurial/store.py	Thu Nov 22 15:14:24 2018 +0300
@@ -8,6 +8,7 @@
 from __future__ import absolute_import
 
 import errno
+import functools
 import hashlib
 import os
 import stat
@@ -23,6 +24,9 @@
 )
 
 parsers = policy.importmod(r'parsers')
+# how many bytes should be read from fncache in one read
+# This is done to avoid loading large fncache files into memory at once
+fncache_chunksize = 10 ** 6
 
 def _matchtrackedpath(path, matcher):
     """parses a fncache entry and returns whether the entry is tracking a path
@@ -463,7 +467,20 @@
             # skip nonexistent file
             self.entries = set()
             return
-        self.entries = set(decodedir(fp.read()).splitlines())
+
+        self.entries = set()
+        chunk = b''
+        for c in iter(functools.partial(fp.read, fncache_chunksize), b''):
+            chunk += c
+            try:
+                p = chunk.rindex(b'\n')
+                self.entries.update(decodedir(chunk[:p + 1]).splitlines())
+                chunk = chunk[p + 1:]
+            except ValueError:
+                # no '\n' found yet: the current entry may be bigger than the
+                # chunksize, so keep reading until we find a newline
+                pass
+
         self._checkentries(fp)
         fp.close()
 
--- a/tests/test-fncache.t	Fri Mar 08 10:20:33 2019 -0800
+++ b/tests/test-fncache.t	Thu Nov 22 15:14:24 2018 +0300
@@ -1,5 +1,19 @@
 #require repofncache
 
+An extension which sets the fncache chunksize to 1 byte to make sure the
+chunking logic does not break
+
+  $ cat > chunksize.py <<EOF
+  > from __future__ import absolute_import
+  > from mercurial import store
+  > store.fncache_chunksize = 1
+  > EOF
+
+  $ cat >> $HGRCPATH <<EOF
+  > [extensions]
+  > chunksize = $TESTTMP/chunksize.py
+  > EOF
+
 Init repo1:
 
   $ hg init repo1