Mercurial > hg
changeset 33926:f4433f2713d0
encoding: add function to test if a str consists of ASCII characters
Most strings are ASCII. Let's optimize for it.
Using uint64_t is slightly faster than uint32_t on 64-bit systems, but the
difference isn't huge.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 23 Apr 2017 12:59:42 +0900 |
parents | 2c37f9dabc32 |
children | 853574db5b12 |
files | contrib/python3-whitelist mercurial/cext/charencode.c mercurial/cext/charencode.h mercurial/cext/parsers.c mercurial/compat.h mercurial/encoding.py mercurial/policy.py mercurial/pure/charencode.py tests/test-encoding-func.py |
diffstat | 9 files changed, 72 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/contrib/python3-whitelist Sun Apr 23 14:47:52 2017 +0900 +++ b/contrib/python3-whitelist Sun Apr 23 12:59:42 2017 +0900 @@ -18,6 +18,7 @@ test-duplicateoptions.py test-empty-dir.t test-empty.t +test-encoding-func.py test-excessive-merge.t test-hghave.t test-imports-checker.t
--- a/mercurial/cext/charencode.c Sun Apr 23 14:47:52 2017 +0900 +++ b/mercurial/cext/charencode.c Sun Apr 23 12:59:42 2017 +0900 @@ -12,6 +12,7 @@ #include <assert.h> #include "charencode.h" +#include "compat.h" #include "util.h" #ifdef IS_PY3K @@ -125,6 +126,29 @@ return ret; } +PyObject *isasciistr(PyObject *self, PyObject *args) +{ + const char *buf; + Py_ssize_t i, len; + if (!PyArg_ParseTuple(args, "s#:isasciistr", &buf, &len)) + return NULL; + i = 0; + /* char array in PyStringObject should be at least 4-byte aligned */ + if (((uintptr_t)buf & 3) == 0) { + const uint32_t *p = (const uint32_t *)buf; + for (; i < len / 4; i++) { + if (p[i] & 0x80808080U) + Py_RETURN_FALSE; + } + i *= 4; + } + for (; i < len; i++) { + if (buf[i] & 0x80) + Py_RETURN_FALSE; + } + Py_RETURN_TRUE; +} + static inline PyObject *_asciitransform(PyObject *str_obj, const char table[128], PyObject *fallback_fn)
--- a/mercurial/cext/charencode.h Sun Apr 23 14:47:52 2017 +0900 +++ b/mercurial/cext/charencode.h Sun Apr 23 12:59:42 2017 +0900 @@ -19,6 +19,7 @@ }; PyObject *unhexlify(const char *str, Py_ssize_t len); +PyObject *isasciistr(PyObject *self, PyObject *args); PyObject *asciilower(PyObject *self, PyObject *args); PyObject *asciiupper(PyObject *self, PyObject *args); PyObject *make_file_foldmap(PyObject *self, PyObject *args);
--- a/mercurial/cext/parsers.c Sun Apr 23 14:47:52 2017 +0900 +++ b/mercurial/cext/parsers.c Sun Apr 23 12:59:42 2017 +0900 @@ -696,6 +696,7 @@ {"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"}, {"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"}, {"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"}, + {"isasciistr", isasciistr, METH_VARARGS, "check if an ASCII string\n"}, {"asciilower", asciilower, METH_VARARGS, "lowercase an ASCII string\n"}, {"asciiupper", asciiupper, METH_VARARGS, "uppercase an ASCII string\n"}, {"dict_new_presized", dict_new_presized, METH_VARARGS, @@ -716,7 +717,7 @@ void manifest_module_init(PyObject *mod); void revlog_module_init(PyObject *mod); -static const int version = 2; +static const int version = 3; static void module_init(PyObject *mod) {
--- a/mercurial/compat.h Sun Apr 23 14:47:52 2017 +0900 +++ b/mercurial/compat.h Sun Apr 23 12:59:42 2017 +0900 @@ -7,8 +7,10 @@ #define inline __inline #if defined(_WIN64) typedef __int64 ssize_t; +typedef unsigned __int64 uintptr_t; #else typedef int ssize_t; +typedef unsigned int uintptr_t; #endif typedef signed char int8_t; typedef short int16_t;
--- a/mercurial/encoding.py Sun Apr 23 14:47:52 2017 +0900 +++ b/mercurial/encoding.py Sun Apr 23 12:59:42 2017 +0900 @@ -24,6 +24,7 @@ charencode = policy.importmod(r'charencode') +isasciistr = charencode.isasciistr asciilower = charencode.asciilower asciiupper = charencode.asciiupper _jsonescapeu8fast = charencode.jsonescapeu8fast
--- a/mercurial/policy.py Sun Apr 23 14:47:52 2017 +0900 +++ b/mercurial/policy.py Sun Apr 23 12:59:42 2017 +0900 @@ -75,7 +75,7 @@ (r'cext', r'diffhelpers'): 1, (r'cext', r'mpatch'): 1, (r'cext', r'osutil'): 1, - (r'cext', r'parsers'): 2, + (r'cext', r'parsers'): 3, } # map import request to other package or module
--- a/mercurial/pure/charencode.py Sun Apr 23 14:47:52 2017 +0900 +++ b/mercurial/pure/charencode.py Sun Apr 23 12:59:42 2017 +0900 @@ -13,6 +13,13 @@ pycompat, ) +def isasciistr(s): + try: + s.decode('ascii') + return True + except UnicodeDecodeError: + return False + def asciilower(s): '''convert a string to lowercase if ASCII
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test-encoding-func.py Sun Apr 23 12:59:42 2017 +0900 @@ -0,0 +1,33 @@ +from __future__ import absolute_import + +import unittest + +from mercurial import ( + encoding, +) + +class IsasciistrTest(unittest.TestCase): + asciistrs = [ + b'a', + b'ab', + b'abc', + b'abcd', + b'abcde', + b'abcdefghi', + b'abcd\0fghi', + ] + + def testascii(self): + for s in self.asciistrs: + self.assertTrue(encoding.isasciistr(s)) + + def testnonasciichar(self): + for s in self.asciistrs: + for i in range(len(s)): + t = bytearray(s) + t[i] |= 0x80 + self.assertFalse(encoding.isasciistr(bytes(t))) + +if __name__ == '__main__': + import silenttestrunner + silenttestrunner.main(__name__)