# HG changeset patch # User Matt Mackall # Date 1461350282 18000 # Node ID 87d4a6c5567e81386b8c2209d95060d5bf72e064 # Parent f1ca249696ed61fbc1f5e4076cfadd2a039293f8 bdiff: further restrain potential quadratic performance This causes the longest_match search to limit itself to a window of 30000 lines during search (roughly 1MB), thus avoiding a full O(N*M) search that might occur in repetitive structured inputs. For a particular class of many MB pathological test cases, this generated the following timings: size before after 10x 1.25s 1.24s 100x 57s 33s 1000x >8400s 400s The times on the right quickly become much faster and appear more linear. While windowing means deltas are no longer "optimal", the resulting deltas were within a couple percent of expected size. While we've yet to have a report of a file with the level of repetition necessary to hit this case, some JSON/XML database dump scenario is fairly likely to hit it. This may also slightly improve the average-case performance for deltas of large binaries. diff -r f1ca249696ed -r 87d4a6c5567e mercurial/bdiff.c --- a/mercurial/bdiff.c Thu Apr 21 22:04:11 2016 -0500 +++ b/mercurial/bdiff.c Fri Apr 22 13:38:02 2016 -0500 @@ -148,7 +148,15 @@ static int longest_match(struct line *a, struct line *b, struct pos *pos, int a1, int a2, int b1, int b2, int *omi, int *omj) { - int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k, half = (a1 + a2) / 2; + int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k, half; + + /* window our search on large regions to better bound + worst-case performance. by choosing a window at the end, we + reduce skipping overhead on the b chains. */ + if (a2 - a1 > 30000) + a1 = a2 - 30000; + + half = (a1 + a2) / 2; for (i = a1; i < a2; i++) { /* skip all lines in b after the current block */