changeset 7118:619ebf82cef2

Take advantage of clustering fstat calls per directory if the OS supports it. The util module implements two versions of the statfiles function: _statfiles calls lstat once per file; _statfiles_clustered takes advantage of optimizations in osutil.c, stats all files in a directory at once when a new directory is hit, and caches the results. util.statfiles dispatches to the appropriate version during module loading. The speedup on a directory tree with 2k directories and 63k files is about a factor of 1.8 (1.3s -> 0.8s for hg diff, of which about 0.2s is hg startup overhead). At this point only Win32 benefits from this patch. The rest of the OSes use the non-clustered implementation.
author Petr Kodl <petrkodl@gmail.com>
date Thu, 09 Oct 2008 10:29:47 -0400
parents ceb8aef03aa7
children 50f4e866d693
files mercurial/dirstate.py mercurial/util.py
diffstat 2 files changed, 51 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/dirstate.py	Thu Oct 16 17:08:46 2008 +0200
+++ b/mercurial/dirstate.py	Thu Oct 09 10:29:47 2008 -0400
@@ -522,17 +522,11 @@
                         results[nf] = None
 
         # step 3: report unseen items in the dmap hash
-        visit = [f for f in dmap if f not in results and match(f)]
-        for nf in util.sort(visit):
-            results[nf] = None
-            try:
-                st = lstat(join(nf))
-                kind = getkind(st.st_mode)
-                if kind == regkind or kind == lnkkind:
-                    results[nf] = st
-            except OSError, inst:
-                if inst.errno not in (errno.ENOENT, errno.ENOTDIR):
-                    raise
+        visit = util.sort([f for f in dmap if f not in results and match(f)])
+        for nf, st in zip(visit, util.statfiles([join(i) for i in visit])):
+            if not st is None and not getkind(st.st_mode) in (regkind, lnkkind):
+                st = None
+            results[nf] = st
 
         del results['.hg']
         return results
--- a/mercurial/util.py	Thu Oct 16 17:08:46 2008 +0200
+++ b/mercurial/util.py	Thu Oct 09 10:29:47 2008 -0400
@@ -826,6 +826,52 @@
     '''return true if it is safe to hold open file handles to hardlinks'''
     return True
 
+def _statfiles(files):
+    'Stat each file in files and yield stat or None if file does not exist.'
+    lstat = os.lstat
+    for nf in files:
+        try:
+            st = lstat(nf)
+        except OSError, err:
+            if err.errno not in (errno.ENOENT, errno.ENOTDIR):
+                raise
+            st = None
+        yield st
+
+def _statfiles_clustered(files):
+    '''Stat each file in files and yield stat or None if file does not exist.
+    Cluster and cache stat per directory to minimize number of OS stat calls.'''
+    lstat = os.lstat
+    ncase = os.path.normcase
+    sep   = os.sep
+    dircache = {} # dirname -> filename -> status | None if file does not exist
+    for nf in files:
+        nf  = ncase(nf)
+        pos = nf.rfind(sep)
+        if pos == -1:
+            dir, base = '.', nf
+        else:
+            dir, base = nf[:pos], nf[pos+1:]
+        cache = dircache.get(dir, None)
+        if cache is None:
+            try:
+                dmap = dict([(ncase(n), s)
+                    for n, k, s in osutil.listdir(dir, True)])
+            except OSError, err:
+                # handle directory not found in Python version prior to 2.5
+                # Python <= 2.4 returns native Windows code 3 in errno
+                # Python >= 2.5 returns ENOENT and adds winerror field
+                if err.errno not in (3, errno.ENOENT, errno.ENOTDIR):
+                    raise
+                dmap = {}
+            cache = dircache.setdefault(dir, dmap)
+        yield cache.get(base, None)
+
+if sys.platform == 'win32':
+    statfiles = _statfiles_clustered
+else:
+    statfiles = _statfiles
+
 getuser_fallback = None
 
 def getuser():