Mercurial > hg-stable
view mercurial/bdiff.c @ 37054:40206e227412
wireproto: define and implement protocol for issuing requests
The existing HTTP and SSH wire protocols suffer from a host of flaws
and shortcomings. I've been wanting to rewrite the protocol for a while
now. Supporting partial clone - which will require new wire protocol
commands and capabilities - and other advanced server functionality
will be much easier if we start from a clean slate and don't have
to be constrained by limitations of the existing wire protocol.
This commit starts to introduce a new data exchange format for
use over the wire protocol.
The new protocol is built on top of "frames," which are atomic
units of metadata + data. Frames will make it easier to implement
proxies and other mechanisms that want to inspect data without
having to maintain state. The existing frame metadata is very
minimal and it will evolve heavily. (We will eventually support
things like concurrent requests, out-of-order responses,
compression, side-channels for status updates, etc. Some of
these will require additions to the frame header.)
Another benefit of frames is that all reads are of a fixed size.
A reader works by consuming a frame header, extracting the payload
length, then reading that many bytes. No lookahead, buffering, or
memory reallocations are needed.
The new protocol attempts to be transport agnostic. I want all that's
required to use the new protocol to be a pair of unidirectional,
half-duplex pipes. (Yes, we will eventually make use of full-duplex
pipes, but that's for another commit.) Notably, when the SSH
transport switches to this new protocol, stderr will be unused.
This is by design: the lack of stderr on HTTP harms protocol
behavior there. By shoehorning everything into a pair of pipes,
we can have more consistent behavior across transports.
We currently only define the client side parts of the new protocol,
specifically the bits for requesting that a command run. This keeps
the new code and feature small and somewhat easy to review.
We add support to `hg debugwireproto` for writing frames into
HTTP request bodies. Our tests that issue commands to the new
HTTP endpoint have been updated to transmit frames. The server
bits haven't been touched to consume the frames yet. This will
occur in the next commit...
Astute readers may notice that the command name is transmitted in
both the HTTP request URL and the command request frame. This is
partially a kludge from me initially implementing the frame-based
protocol for SSH first. But it is also a feature: I intend to
eventually support issuing multiple commands per HTTP request. This
will allow us to replace the abomination that is the "batch" wire
protocol command with a protocol-level mechanism for performing
multi-dispatch. Because I want the frame-based protocol to be
as similar as possible across transports, I'd rather we (redundantly)
include the command name in the frame than differ behavior between
transports that have out-of-band routing information (like HTTP)
readily available.
Differential Revision: https://phab.mercurial-scm.org/D2851
author | Gregory Szorc <gregory.szorc@gmail.com> |
---|---|
date | Mon, 19 Mar 2018 16:49:53 -0700 |
parents | 50868145a8de |
children | 068e774ae29e |
line wrap: on
line source
/* bdiff.c - efficient binary diff extension for Mercurial Copyright 2005, 2006 Matt Mackall <mpm@selenic.com> This software may be used and distributed according to the terms of the GNU General Public License, incorporated herein by reference. Based roughly on Python difflib */ #include <limits.h> #include <stdlib.h> #include <string.h> #include "bdiff.h" #include "bitmanipulation.h" #include "compat.h" /* Hash implementation from diffutils */ #define ROL(v, n) ((v) << (n) | (v) >> (sizeof(v) * CHAR_BIT - (n))) #define HASH(h, c) ((c) + ROL(h, 7)) struct pos { int pos, len; }; int bdiff_splitlines(const char *a, ssize_t len, struct bdiff_line **lr) { unsigned hash; int i; const char *p, *b = a; const char *const plast = a + len - 1; struct bdiff_line *l; /* count the lines */ i = 1; /* extra line for sentinel */ for (p = a; p < plast; p++) if (*p == '\n') i++; if (p == plast) i++; *lr = l = (struct bdiff_line *)calloc(i, sizeof(struct bdiff_line)); if (!l) return -1; /* build the line array and calculate hashes */ hash = 0; for (p = a; p < plast; p++) { hash = HASH(hash, *p); if (*p == '\n') { l->hash = hash; hash = 0; l->len = p - b + 1; l->l = b; l->n = INT_MAX; l++; b = p + 1; } } if (p == plast) { hash = HASH(hash, *p); l->hash = hash; l->len = p - b + 1; l->l = b; l->n = INT_MAX; l++; } /* set up a sentinel */ l->hash = 0; l->len = 0; l->l = a + len; return i - 1; } static inline int cmp(struct bdiff_line *a, struct bdiff_line *b) { return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len); } static int equatelines(struct bdiff_line *a, int an, struct bdiff_line *b, int bn) { int i, j, buckets = 1, t, scale; struct pos *h = NULL; /* build a hash table of the next highest power of 2 */ while (buckets < bn + 1) buckets *= 2; /* try to allocate a large hash table to avoid collisions */ for (scale = 4; scale; scale /= 2) { h = (struct pos *)calloc(buckets, scale * sizeof(struct pos)); if (h) break; } if (!h) return 0; buckets = buckets * scale - 1; /* clear the hash table */ for (i = 0; i <= buckets; i++) { h[i].pos = -1; h[i].len = 0; } /* add lines to the hash table chains */ for (i = 0; i < bn; i++) { /* find the equivalence class */ for (j = b[i].hash & buckets; h[j].pos != -1; j = (j + 1) & buckets) if (!cmp(b + i, b + h[j].pos)) break; /* add to the head of the equivalence class */ b[i].n = h[j].pos; b[i].e = j; h[j].pos = i; h[j].len++; /* keep track of popularity */ } /* compute popularity threshold */ t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1); /* match items in a to their equivalence class in b */ for (i = 0; i < an; i++) { /* find the equivalence class */ for (j = a[i].hash & buckets; h[j].pos != -1; j = (j + 1) & buckets) if (!cmp(a + i, b + h[j].pos)) break; a[i].e = j; /* use equivalence class for quick compare */ if (h[j].len <= t) a[i].n = h[j].pos; /* point to head of match list */ else a[i].n = -1; /* too popular */ } /* discard hash tables */ free(h); return 1; } static int longest_match(struct bdiff_line *a, struct bdiff_line *b, struct pos *pos, int a1, int a2, int b1, int b2, int *omi, int *omj) { int mi = a1, mj = b1, mk = 0, i, j, k, half, bhalf; /* window our search on large regions to better bound worst-case performance. by choosing a window at the end, we reduce skipping overhead on the b chains. */ if (a2 - a1 > 30000) a1 = a2 - 30000; half = (a1 + a2 - 1) / 2; bhalf = (b1 + b2 - 1) / 2; for (i = a1; i < a2; i++) { /* skip all lines in b after the current block */ for (j = a[i].n; j >= b2; j = b[j].n) ; /* loop through all lines match a[i] in b */ for (; j >= b1; j = b[j].n) { /* does this extend an earlier match? */ for (k = 1; j - k >= b1 && i - k >= a1; k++) { /* reached an earlier match? */ if (pos[j - k].pos == i - k) { k += pos[j - k].len; break; } /* previous line mismatch? */ if (a[i - k].e != b[j - k].e) break; } pos[j].pos = i; pos[j].len = k; /* best match so far? we prefer matches closer to the middle to balance recursion */ if (k > mk) { /* a longer match */ mi = i; mj = j; mk = k; } else if (k == mk) { if (i > mi && i <= half && j > b1) { /* same match but closer to half */ mi = i; mj = j; } else if (i == mi && (mj > bhalf || i == a1)) { /* same i but best earlier j */ mj = j; } } } } if (mk) { mi = mi - mk + 1; mj = mj - mk + 1; } /* expand match to include subsequent popular lines */ while (mi + mk < a2 && mj + mk < b2 && a[mi + mk].e == b[mj + mk].e) mk++; *omi = mi; *omj = mj; return mk; } static struct bdiff_hunk *recurse(struct bdiff_line *a, struct bdiff_line *b, struct pos *pos, int a1, int a2, int b1, int b2, struct bdiff_hunk *l) { int i, j, k; while (1) { /* find the longest match in this chunk */ k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j); if (!k) return l; /* and recurse on the remaining chunks on either side */ l = recurse(a, b, pos, a1, i, b1, j, l); if (!l) return NULL; l->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk)); if (!l->next) return NULL; l = l->next; l->a1 = i; l->a2 = i + k; l->b1 = j; l->b2 = j + k; l->next = NULL; /* tail-recursion didn't happen, so do equivalent iteration */ a1 = i + k; b1 = j + k; } } int bdiff_diff(struct bdiff_line *a, int an, struct bdiff_line *b, int bn, struct bdiff_hunk *base) { struct bdiff_hunk *curr; struct pos *pos; int t, count = 0; /* allocate and fill arrays */ t = equatelines(a, an, b, bn); pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos)); if (pos && t) { /* generate the matching block list */ curr = recurse(a, b, pos, 0, an, 0, bn, base); if (!curr) return -1; /* sentinel end hunk */ curr->next = (struct bdiff_hunk *)malloc(sizeof(struct bdiff_hunk)); if (!curr->next) return -1; curr = curr->next; curr->a1 = curr->a2 = an; curr->b1 = curr->b2 = bn; curr->next = NULL; } free(pos); /* normalize the hunk list, try to push each hunk towards the end */ for (curr = base->next; curr; curr = curr->next) { struct bdiff_hunk *next = curr->next; if (!next) break; if (curr->a2 == next->a1 || curr->b2 == next->b1) while (curr->a2 < an && curr->b2 < bn && next->a1 < next->a2 && next->b1 < next->b2 && !cmp(a + curr->a2, b + curr->b2)) { curr->a2++; next->a1++; curr->b2++; next->b1++; } } for (curr = base->next; curr; curr = curr->next) count++; return count; } void bdiff_freehunks(struct bdiff_hunk *l) { struct bdiff_hunk *n; for (; l; l = n) { n = l->next; free(l); } }