mercurial/cext/pathencode.c
author Matt Harbison <matt_harbison@yahoo.com>
Fri, 27 Nov 2020 19:35:37 -0500
changeset 45946 464539c305aa
parent 41336 763b45bc4483
child 46374 e92ca942ddca
permissions -rw-r--r--
formatting: drop a few extra double quotes in docstrings These were made obvious by the reformatting in D9430. Differential Revision: https://phab.mercurial-scm.org/D9432

/*
 pathencode.c - efficient path name encoding

 Copyright 2012 Facebook

 This software may be used and distributed according to the terms of
 the GNU General Public License, incorporated herein by reference.
*/

/*
 * An implementation of the name encoding scheme used by the fncache
 * store.  The common case is of a path < 120 bytes long, which is
 * handled either in a single pass with no allocations or two passes
 * with a single allocation.  For longer paths, multiple passes are
 * required.
 */

#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#include "util.h"

/* state machine for the fast path */
enum path_state {
	START, /* first byte of a path component */
	A,     /* "AUX" */
	AU,
	THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
	C,     /* "CON" or "COMn" */
	CO,
	COMLPT, /* "COM" or "LPT" */
	COMLPTn,
	L,
	LP,
	N,
	NU,
	P, /* "PRN" */
	PR,
	LDOT, /* leading '.' */
	DOT,  /* '.' in a non-leading position */
	H,    /* ".h" */
	HGDI, /* ".hg", ".d", or ".i" */
	SPACE,
	DEFAULT, /* byte of a path component after the first */
};

/* state machine for dir-encoding */
enum dir_state {
	DDOT,
	DH,
	DHGDI,
	DDEFAULT,
};

static inline int inset(const uint32_t bitset[], char c)
{
	return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31));
}

static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
                            char c)
{
	if (dest) {
		assert(*destlen < destsize);
		dest[*destlen] = c;
	}
	(*destlen)++;
}

static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
                           const void *src, Py_ssize_t len)
{
	if (dest) {
		assert(*destlen + len < destsize);
		memcpy((void *)&dest[*destlen], src, len);
	}
	*destlen += len;
}

static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
                             uint8_t c)
{
	static const char hexdigit[] = "0123456789abcdef";

	charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
	charcopy(dest, destlen, destsize, hexdigit[c & 15]);
}

/* 3-byte escape: tilde followed by two hex digits */
static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
                           char c)
{
	charcopy(dest, destlen, destsize, '~');
	hexencode(dest, destlen, destsize, c);
}

static Py_ssize_t _encodedir(char *dest, size_t destsize, const char *src,
                             Py_ssize_t len)
{
	enum dir_state state = DDEFAULT;
	Py_ssize_t i = 0, destlen = 0;

	while (i < len) {
		switch (state) {
		case DDOT:
			switch (src[i]) {
			case 'd':
			case 'i':
				state = DHGDI;
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			case 'h':
				state = DH;
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			default:
				state = DDEFAULT;
				break;
			}
			break;
		case DH:
			if (src[i] == 'g') {
				state = DHGDI;
				charcopy(dest, &destlen, destsize, src[i++]);
			} else {
				state = DDEFAULT;
			}
			break;
		case DHGDI:
			if (src[i] == '/') {
				memcopy(dest, &destlen, destsize, ".hg", 3);
				charcopy(dest, &destlen, destsize, src[i++]);
			}
			state = DDEFAULT;
			break;
		case DDEFAULT:
			if (src[i] == '.') {
				state = DDOT;
			}
			charcopy(dest, &destlen, destsize, src[i++]);
			break;
		}
	}

	return destlen;
}

PyObject *encodedir(PyObject *self, PyObject *args)
{
	Py_ssize_t len, newlen;
	PyObject *pathobj, *newobj;
	char *path;

	if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj)) {
		return NULL;
	}

	if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) {
		PyErr_SetString(PyExc_TypeError, "expected a string");
		return NULL;
	}

	newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1;

	if (newlen == len + 1) {
		Py_INCREF(pathobj);
		return pathobj;
	}

	newobj = PyBytes_FromStringAndSize(NULL, newlen);

	if (newobj) {
		assert(PyBytes_Check(newobj));
		Py_SIZE(newobj)--;
		_encodedir(PyBytes_AS_STRING(newobj), newlen, path, len + 1);
	}

	return newobj;
}

static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8],
                          char *dest, Py_ssize_t destlen, size_t destsize,
                          const char *src, Py_ssize_t len, int encodedir)
{
	enum path_state state = START;
	Py_ssize_t i = 0;

	/*
	 * Python strings end with a zero byte, which we use as a
	 * terminal token as they are not valid inside path names.
	 */

	while (i < len) {
		switch (state) {
		case START:
			switch (src[i]) {
			case '/':
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			case '.':
				state = LDOT;
				escape3(dest, &destlen, destsize, src[i++]);
				break;
			case ' ':
				state = DEFAULT;
				escape3(dest, &destlen, destsize, src[i++]);
				break;
			case 'a':
				state = A;
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			case 'c':
				state = C;
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			case 'l':
				state = L;
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			case 'n':
				state = N;
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			case 'p':
				state = P;
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			default:
				state = DEFAULT;
				break;
			}
			break;
		case A:
			if (src[i] == 'u') {
				state = AU;
				charcopy(dest, &destlen, destsize, src[i++]);
			} else {
				state = DEFAULT;
			}
			break;
		case AU:
			if (src[i] == 'x') {
				state = THIRD;
				i++;
			} else {
				state = DEFAULT;
			}
			break;
		case THIRD:
			state = DEFAULT;
			switch (src[i]) {
			case '.':
			case '/':
			case '\0':
				escape3(dest, &destlen, destsize, src[i - 1]);
				break;
			default:
				i--;
				break;
			}
			break;
		case C:
			if (src[i] == 'o') {
				state = CO;
				charcopy(dest, &destlen, destsize, src[i++]);
			} else {
				state = DEFAULT;
			}
			break;
		case CO:
			if (src[i] == 'm') {
				state = COMLPT;
				i++;
			} else if (src[i] == 'n') {
				state = THIRD;
				i++;
			} else {
				state = DEFAULT;
			}
			break;
		case COMLPT:
			switch (src[i]) {
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				state = COMLPTn;
				i++;
				break;
			default:
				state = DEFAULT;
				charcopy(dest, &destlen, destsize, src[i - 1]);
				break;
			}
			break;
		case COMLPTn:
			state = DEFAULT;
			switch (src[i]) {
			case '.':
			case '/':
			case '\0':
				escape3(dest, &destlen, destsize, src[i - 2]);
				charcopy(dest, &destlen, destsize, src[i - 1]);
				break;
			default:
				memcopy(dest, &destlen, destsize, &src[i - 2],
				        2);
				break;
			}
			break;
		case L:
			if (src[i] == 'p') {
				state = LP;
				charcopy(dest, &destlen, destsize, src[i++]);
			} else {
				state = DEFAULT;
			}
			break;
		case LP:
			if (src[i] == 't') {
				state = COMLPT;
				i++;
			} else {
				state = DEFAULT;
			}
			break;
		case N:
			if (src[i] == 'u') {
				state = NU;
				charcopy(dest, &destlen, destsize, src[i++]);
			} else {
				state = DEFAULT;
			}
			break;
		case NU:
			if (src[i] == 'l') {
				state = THIRD;
				i++;
			} else {
				state = DEFAULT;
			}
			break;
		case P:
			if (src[i] == 'r') {
				state = PR;
				charcopy(dest, &destlen, destsize, src[i++]);
			} else {
				state = DEFAULT;
			}
			break;
		case PR:
			if (src[i] == 'n') {
				state = THIRD;
				i++;
			} else {
				state = DEFAULT;
			}
			break;
		case LDOT:
			switch (src[i]) {
			case 'd':
			case 'i':
				state = HGDI;
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			case 'h':
				state = H;
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			default:
				state = DEFAULT;
				break;
			}
			break;
		case DOT:
			switch (src[i]) {
			case '/':
			case '\0':
				state = START;
				memcopy(dest, &destlen, destsize, "~2e", 3);
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			case 'd':
			case 'i':
				state = HGDI;
				charcopy(dest, &destlen, destsize, '.');
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			case 'h':
				state = H;
				memcopy(dest, &destlen, destsize, ".h", 2);
				i++;
				break;
			default:
				state = DEFAULT;
				charcopy(dest, &destlen, destsize, '.');
				break;
			}
			break;
		case H:
			if (src[i] == 'g') {
				state = HGDI;
				charcopy(dest, &destlen, destsize, src[i++]);
			} else {
				state = DEFAULT;
			}
			break;
		case HGDI:
			if (src[i] == '/') {
				state = START;
				if (encodedir) {
					memcopy(dest, &destlen, destsize, ".hg",
					        3);
				}
				charcopy(dest, &destlen, destsize, src[i++]);
			} else {
				state = DEFAULT;
			}
			break;
		case SPACE:
			switch (src[i]) {
			case '/':
			case '\0':
				state = START;
				memcopy(dest, &destlen, destsize, "~20", 3);
				charcopy(dest, &destlen, destsize, src[i++]);
				break;
			default:
				state = DEFAULT;
				charcopy(dest, &destlen, destsize, ' ');
				break;
			}
			break;
		case DEFAULT:
			while (inset(onebyte, src[i])) {
				charcopy(dest, &destlen, destsize, src[i++]);
				if (i == len) {
					goto done;
				}
			}
			switch (src[i]) {
			case '.':
				state = DOT;
				i++;
				break;
			case ' ':
				state = SPACE;
				i++;
				break;
			case '/':
				state = START;
				charcopy(dest, &destlen, destsize, '/');
				i++;
				break;
			default:
				if (inset(onebyte, src[i])) {
					do {
						charcopy(dest, &destlen,
						         destsize, src[i++]);
					} while (i < len &&
					         inset(onebyte, src[i]));
				} else if (inset(twobytes, src[i])) {
					char c = src[i++];
					charcopy(dest, &destlen, destsize, '_');
					charcopy(dest, &destlen, destsize,
					         c == '_' ? '_' : c + 32);
				} else {
					escape3(dest, &destlen, destsize,
					        src[i++]);
				}
				break;
			}
			break;
		}
	}
done:
	return destlen;
}

static Py_ssize_t basicencode(char *dest, size_t destsize, const char *src,
                              Py_ssize_t len)
{
	static const uint32_t twobytes[8] = {0, 0, 0x87fffffe};

	static const uint32_t onebyte[8] = {
	    1,
	    0x2bff3bfa,
	    0x68000001,
	    0x2fffffff,
	};

	Py_ssize_t destlen = 0;

	return _encode(twobytes, onebyte, dest, destlen, destsize, src, len, 1);
}

static const Py_ssize_t maxstorepathlen = 120;

static Py_ssize_t _lowerencode(char *dest, size_t destsize, const char *src,
                               Py_ssize_t len)
{
	static const uint32_t onebyte[8] = {1, 0x2bfffbfb, 0xe8000001,
	                                    0x2fffffff};

	static const uint32_t lower[8] = {0, 0, 0x7fffffe};

	Py_ssize_t i, destlen = 0;

	for (i = 0; i < len; i++) {
		if (inset(onebyte, src[i])) {
			charcopy(dest, &destlen, destsize, src[i]);
		} else if (inset(lower, src[i])) {
			charcopy(dest, &destlen, destsize, src[i] + 32);
		} else {
			escape3(dest, &destlen, destsize, src[i]);
		}
	}

	return destlen;
}

PyObject *lowerencode(PyObject *self, PyObject *args)
{
	char *path;
	Py_ssize_t len, newlen;
	PyObject *ret;

	if (!PyArg_ParseTuple(args, PY23("s#:lowerencode", "y#:lowerencode"),
	                      &path, &len)) {
		return NULL;
	}

	newlen = _lowerencode(NULL, 0, path, len);
	ret = PyBytes_FromStringAndSize(NULL, newlen);
	if (ret) {
		_lowerencode(PyBytes_AS_STRING(ret), newlen, path, len);
	}

	return ret;
}

/* See store.py:_auxencode for a description. */
static Py_ssize_t auxencode(char *dest, size_t destsize, const char *src,
                            Py_ssize_t len)
{
	static const uint32_t twobytes[8];

	static const uint32_t onebyte[8] = {
	    ~0U, 0xffff3ffe, ~0U, ~0U, ~0U, ~0U, ~0U, ~0U,
	};

	return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0);
}

static PyObject *hashmangle(const char *src, Py_ssize_t len, const char sha[20])
{
	static const Py_ssize_t dirprefixlen = 8;
	static const Py_ssize_t maxshortdirslen = 68;
	char *dest;
	PyObject *ret;

	Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1;
	Py_ssize_t destsize, destlen = 0, slop, used;

	while (lastslash >= 0 && src[lastslash] != '/') {
		if (src[lastslash] == '.' && lastdot == -1) {
			lastdot = lastslash;
		}
		lastslash--;
	}

#if 0
	/* All paths should end in a suffix of ".i" or ".d".
           Unfortunately, the file names in test-hybridencode.py
           violate this rule.  */
	if (lastdot != len - 3) {
		PyErr_SetString(PyExc_ValueError,
				"suffix missing or wrong length");
		return NULL;
	}
#endif

	/* If src contains a suffix, we will append it to the end of
	   the new string, so make room. */
	destsize = 120;
	if (lastdot >= 0) {
		destsize += len - lastdot - 1;
	}

	ret = PyBytes_FromStringAndSize(NULL, destsize);
	if (ret == NULL) {
		return NULL;
	}

	dest = PyBytes_AS_STRING(ret);
	memcopy(dest, &destlen, destsize, "dh/", 3);

	/* Copy up to dirprefixlen bytes of each path component, up to
	   a limit of maxshortdirslen bytes. */
	for (i = d = p = 0; i < lastslash; i++, p++) {
		if (src[i] == '/') {
			char d = dest[destlen - 1];
			/* After truncation, a directory name may end
			   in a space or dot, which are unportable. */
			if (d == '.' || d == ' ') {
				dest[destlen - 1] = '_';
				/* The + 3 is to account for "dh/" in the
				 * beginning */
			}
			if (destlen > maxshortdirslen + 3) {
				break;
			}
			charcopy(dest, &destlen, destsize, src[i]);
			p = -1;
		} else if (p < dirprefixlen) {
			charcopy(dest, &destlen, destsize, src[i]);
		}
	}

	/* Rewind to just before the last slash copied. */
	if (destlen > maxshortdirslen + 3) {
		do {
			destlen--;
		} while (destlen > 0 && dest[destlen] != '/');
	}

	if (destlen > 3) {
		if (lastslash > 0) {
			char d = dest[destlen - 1];
			/* The last directory component may be
			   truncated, so make it safe. */
			if (d == '.' || d == ' ') {
				dest[destlen - 1] = '_';
			}
		}

		charcopy(dest, &destlen, destsize, '/');
	}

	/* Add a prefix of the original file's name. Its length
	   depends on the number of bytes left after accounting for
	   hash and suffix. */
	used = destlen + 40;
	if (lastdot >= 0) {
		used += len - lastdot - 1;
	}
	slop = maxstorepathlen - used;
	if (slop > 0) {
		Py_ssize_t basenamelen =
		    lastslash >= 0 ? len - lastslash - 2 : len - 1;

		if (basenamelen > slop) {
			basenamelen = slop;
		}
		if (basenamelen > 0) {
			memcopy(dest, &destlen, destsize, &src[lastslash + 1],
			        basenamelen);
		}
	}

	/* Add hash and suffix. */
	for (i = 0; i < 20; i++) {
		hexencode(dest, &destlen, destsize, sha[i]);
	}

	if (lastdot >= 0) {
		memcopy(dest, &destlen, destsize, &src[lastdot],
		        len - lastdot - 1);
	}

	assert(PyBytes_Check(ret));
	Py_SIZE(ret) = destlen;

	return ret;
}

/*
 * Avoiding a trip through Python would improve performance by 50%,
 * but we don't encounter enough long names to be worth the code.
 */
static int sha1hash(char hash[20], const char *str, Py_ssize_t len)
{
	static PyObject *shafunc;
	PyObject *shaobj, *hashobj;

	if (shafunc == NULL) {
		PyObject *hashlib = PyImport_ImportModule("hashlib");
		if (hashlib == NULL) {
			PyErr_SetString(PyExc_ImportError,
			                "pathencode failed to find hashlib");
			return -1;
		}
		shafunc = PyObject_GetAttrString(hashlib, "sha1");
		Py_DECREF(hashlib);

		if (shafunc == NULL) {
			PyErr_SetString(PyExc_AttributeError,
			                "module 'hashlib' has no "
			                "attribute 'sha1' in pathencode");
			return -1;
		}
	}

	shaobj = PyObject_CallFunction(shafunc, PY23("s#", "y#"), str, len);

	if (shaobj == NULL) {
		return -1;
	}

	hashobj = PyObject_CallMethod(shaobj, "digest", "");
	Py_DECREF(shaobj);
	if (hashobj == NULL) {
		return -1;
	}

	if (!PyBytes_Check(hashobj) || PyBytes_GET_SIZE(hashobj) != 20) {
		PyErr_SetString(PyExc_TypeError,
		                "result of digest is not a 20-byte hash");
		Py_DECREF(hashobj);
		return -1;
	}

	memcpy(hash, PyBytes_AS_STRING(hashobj), 20);
	Py_DECREF(hashobj);
	return 0;
}

#define MAXENCODE 4096 * 4

static PyObject *hashencode(const char *src, Py_ssize_t len)
{
	char dired[MAXENCODE];
	char lowered[MAXENCODE];
	char auxed[MAXENCODE];
	Py_ssize_t dirlen, lowerlen, auxlen, baselen;
	char sha[20];

	baselen = (len - 5) * 3;
	if (baselen >= MAXENCODE) {
		PyErr_SetString(PyExc_ValueError, "string too long");
		return NULL;
	}

	dirlen = _encodedir(dired, baselen, src, len);
	if (sha1hash(sha, dired, dirlen - 1) == -1) {
		return NULL;
	}
	lowerlen = _lowerencode(lowered, baselen, dired + 5, dirlen - 5);
	auxlen = auxencode(auxed, baselen, lowered, lowerlen);
	return hashmangle(auxed, auxlen, sha);
}

PyObject *pathencode(PyObject *self, PyObject *args)
{
	Py_ssize_t len, newlen;
	PyObject *pathobj, *newobj;
	char *path;

	if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj)) {
		return NULL;
	}

	if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) {
		PyErr_SetString(PyExc_TypeError, "expected a string");
		return NULL;
	}

	if (len > maxstorepathlen) {
		newlen = maxstorepathlen + 2;
	} else {
		newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
	}

	if (newlen <= maxstorepathlen + 1) {
		if (newlen == len + 1) {
			Py_INCREF(pathobj);
			return pathobj;
		}

		newobj = PyBytes_FromStringAndSize(NULL, newlen);

		if (newobj) {
			assert(PyBytes_Check(newobj));
			Py_SIZE(newobj)--;
			basicencode(PyBytes_AS_STRING(newobj), newlen, path,
			            len + 1);
		}
	} else {
		newobj = hashencode(path, len + 1);
	}

	return newobj;
}