encoding: add function to test if a str consists of ASCII characters
Most strings are ASCII. Let's optimize for it.
Using uint64_t is slightly faster than uint32_t on 64-bit systems, but the
difference isn't huge.
--- a/contrib/python3-whitelist Sun Apr 23 14:47:52 2017 +0900
+++ b/contrib/python3-whitelist Sun Apr 23 12:59:42 2017 +0900
@@ -18,6 +18,7 @@
test-duplicateoptions.py
test-empty-dir.t
test-empty.t
+test-encoding-func.py
test-excessive-merge.t
test-hghave.t
test-imports-checker.t
--- a/mercurial/cext/charencode.c Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/cext/charencode.c Sun Apr 23 12:59:42 2017 +0900
@@ -12,6 +12,7 @@
#include <assert.h>
#include "charencode.h"
+#include "compat.h"
#include "util.h"
#ifdef IS_PY3K
@@ -125,6 +126,29 @@
return ret;
}
+PyObject *isasciistr(PyObject *self, PyObject *args) /* Python-callable: returns True iff no byte of the string argument has bit 0x80 set */
+{
+ const char *buf;
+ Py_ssize_t i, len;
+ if (!PyArg_ParseTuple(args, "s#:isasciistr", &buf, &len)) /* single argument, unpacked as data pointer + length */
+ return NULL; /* argument parsing failed */
+ i = 0;
+ /* char array in PyStringObject should be at least 4-byte aligned; the word-at-a-time fast path below is skipped when buf is not */
+ if (((uintptr_t)buf & 3) == 0) {
+ const uint32_t *p = (const uint32_t *)buf;
+ for (; i < len / 4; i++) { /* i counts whole 4-byte words here */
+ if (p[i] & 0x80808080U) /* high bit set in at least one of the four bytes */
+ Py_RETURN_FALSE;
+ }
+ i *= 4; /* convert word count to byte offset for the tail loop */
+ }
+ for (; i < len; i++) { /* remaining tail bytes, or the whole buffer if the fast path was skipped */
+ if (buf[i] & 0x80)
+ Py_RETURN_FALSE;
+ }
+ Py_RETURN_TRUE;
+}
+
static inline PyObject *_asciitransform(PyObject *str_obj,
const char table[128],
PyObject *fallback_fn)
--- a/mercurial/cext/charencode.h Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/cext/charencode.h Sun Apr 23 12:59:42 2017 +0900
@@ -19,6 +19,7 @@
};
PyObject *unhexlify(const char *str, Py_ssize_t len);
+PyObject *isasciistr(PyObject *self, PyObject *args);
PyObject *asciilower(PyObject *self, PyObject *args);
PyObject *asciiupper(PyObject *self, PyObject *args);
PyObject *make_file_foldmap(PyObject *self, PyObject *args);
--- a/mercurial/cext/parsers.c Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/cext/parsers.c Sun Apr 23 12:59:42 2017 +0900
@@ -696,6 +696,7 @@
{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
{"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
+ {"isasciistr", isasciistr, METH_VARARGS, "check if an ASCII string\n"},
{"asciilower", asciilower, METH_VARARGS, "lowercase an ASCII string\n"},
{"asciiupper", asciiupper, METH_VARARGS, "uppercase an ASCII string\n"},
{"dict_new_presized", dict_new_presized, METH_VARARGS,
@@ -716,7 +717,7 @@
void manifest_module_init(PyObject *mod);
void revlog_module_init(PyObject *mod);
-static const int version = 2;
+static const int version = 3;
static void module_init(PyObject *mod)
{
--- a/mercurial/compat.h Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/compat.h Sun Apr 23 12:59:42 2017 +0900
@@ -7,8 +7,10 @@
#define inline __inline
#if defined(_WIN64)
typedef __int64 ssize_t;
+typedef unsigned __int64 uintptr_t;
#else
typedef int ssize_t;
+typedef unsigned int uintptr_t;
#endif
typedef signed char int8_t;
typedef short int16_t;
--- a/mercurial/encoding.py Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/encoding.py Sun Apr 23 12:59:42 2017 +0900
@@ -24,6 +24,7 @@
charencode = policy.importmod(r'charencode')
+isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast
--- a/mercurial/policy.py Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/policy.py Sun Apr 23 12:59:42 2017 +0900
@@ -75,7 +75,7 @@
(r'cext', r'diffhelpers'): 1,
(r'cext', r'mpatch'): 1,
(r'cext', r'osutil'): 1,
- (r'cext', r'parsers'): 2,
+ (r'cext', r'parsers'): 3,
}
# map import request to other package or module
--- a/mercurial/pure/charencode.py Sun Apr 23 14:47:52 2017 +0900
+++ b/mercurial/pure/charencode.py Sun Apr 23 12:59:42 2017 +0900
@@ -13,6 +13,13 @@
pycompat,
)
+def isasciistr(s):  # True if s.decode('ascii') succeeds, False if it raises UnicodeDecodeError
+ try:
+ s.decode('ascii')  # decoded value is discarded; only success or failure matters
+ return True
+ except UnicodeDecodeError:
+ return False
+
def asciilower(s):
'''convert a string to lowercase if ASCII
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-encoding-func.py Sun Apr 23 12:59:42 2017 +0900
@@ -0,0 +1,33 @@
+from __future__ import absolute_import
+
+import unittest
+
+from mercurial import (
+ encoding,
+)
+
+class IsasciistrTest(unittest.TestCase):  # unit tests for encoding.isasciistr
+ asciistrs = [  # pure-ASCII byte strings of lengths 1-5 and 9
+ b'a',
+ b'ab',
+ b'abc',
+ b'abcd',
+ b'abcde',
+ b'abcdefghi',
+ b'abcd\0fghi',  # contains an embedded NUL byte
+ ]
+
+ def testascii(self):  # every sample must be reported as ASCII
+ for s in self.asciistrs:
+ self.assertTrue(encoding.isasciistr(s))
+
+ def testnonasciichar(self):  # setting the high bit of any single byte must make the result False
+ for s in self.asciistrs:
+ for i in range(len(s)):
+ t = bytearray(s)  # fresh mutable copy, so only position i is altered
+ t[i] |= 0x80
+ self.assertFalse(encoding.isasciistr(bytes(t)))
+
+if __name__ == '__main__':  # only when executed as a script, not when imported
+ import silenttestrunner
+ silenttestrunner.main(__name__)  # passes this module's name to the project's test runner